/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>

#include <libkern/OSAtomic.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#include <security/mac_framework.h>

#include <sys/mcache.h>
/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents a mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of NBPG in size; each
 *	object represents a mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transaction.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *		|	^
 *		|	+-----------------------+
 *		v				|
 *	mcache_alloc/mcache_alloc_ext()	  mbuf_slab_audit()
 *		|				^
 *		v				|
 *	    [CPU cache] -------> (found?) ------+
 *		|				|
 *		v				|
 *	mbuf_slab_alloc()			|
 *		|				|
 *		v				|
 *	+---------> [freelist] -> (found?) -----+
 *	|		|
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *		|	^
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_alloc/mcache_alloc_ext()	  mbuf_cslab_audit()
 *		|				^
 *		v				|
 *	    [CPU cache] -------> (found?) ------+
 *		|				|
 *		v				|
 *	mbuf_cslab_alloc()			|
 *		|				|
 *		v				|
 *	    [freelist] -------> (found?) -------+
 *		|				|
 *		v				|
 *	(rudimentary object)			|
 *	mcache_alloc/mcache_alloc_ext() ---->>--+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against DEADBEEF (free) pattern before returning them to caller.
 * As part of this step, the routine will also record the transaction and
 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 * also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *		|	^
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	mbuf_slab_audit()			|
 *		|				|
 *		v				|
 *	    [CPU cache] ---> (not purging?) ----+
 *		|				|
 *		v				|
 *	    [freelist] ----------->>------------+
 *	(objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *		|	^
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	mbuf_cslab_audit()			|
 *		|				|
 *		v				|
 *	    [CPU cache] ---> (not purging?) ----+
 *		|				|
 *		v				|
 *	mbuf_cslab_free()			|
 *		|				|
 *		v				|
 *	    [freelist] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	(rudimentary object)			|
 *	mcache_free/mcache_free_ext() ---->>----+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with DEADBEEF (free) pattern and recording the
 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 * expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Note
 * that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relevant to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOCL(addr)			+-------------+
 *	      |	  +------------------->	| cl_audit[1] | -----> mcache_audit_t
 *	b = CLTOM(i)			+-------------+
 *	      |				|     ...     |
 *	x = MCLIDX(b, addr)		+-------------+
 *	      |				| cl_audit[7] |
 *	+-----------------+		+-------------+
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a cluster
 * can be turned into NMBPCL number of mbufs, we preserve enough space for the
 * mbufs so that there is a 1-to-1 mapping between them.  A cluster that never
 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For big clusters, only one entry is allocated
 * and used for the entire cluster pair.
 */
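
/*
 * Illustrative sketch (not part of the original code): given the address
 * "addr" of an audited object, the lookup described above boils down to
 * roughly the following, using the MTOCL/CLTOM/MCLIDX macros defined
 * further below:
 *
 *	ix   = MTOCL(addr);			// index of owning cluster
 *	base = CLTOM(ix);			// base address of that cluster
 *	mca  = mclaudit[ix].cl_audit[MCLIDX(base, addr)];
 *
 * Cluster and big cluster objects use only cl_audit[0]; the per-mbuf
 * entries come into play once a cluster has been cut up into mbufs.
 */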
/* TODO: should be in header file */
/* kernel translator */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

static lck_mtx_t *mbuf_mlock;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;

/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16K cluster creation requests */
static int ncpu;		/* number of CPUs */
static int *mcl_paddr;		/* Array of cluster physical addresses */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized; /* number of packets "normalized" */
typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4K) cluster */
	MC_16KCL,	/* Jumbo (16K) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4K) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16K) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
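
/*
 * Example (illustrative only, not from the original source):
 * MBUF_CLASS_COMPOSITE(MC_MBUF_CL) and MBUF_CLASS_COMPOSITE(MC_MBUF_16KCL)
 * evaluate to true because the composite classes are ordered after
 * MBUF_CLASS_LAST (MC_16KCL) in the enumeration above, while
 * MBUF_CLASS_COMPOSITE(MC_CL) evaluates to false.
 */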
/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back to class's slab list, if
 * it's not already done.
 *
 * Compartmentalizing of the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */
/*
 * The array of slabs are broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	MBSHIFT		20				/* 1MB */
#define	NSLABSPMB	((1 << MBSHIFT) >> MCLSHIFT)	/* 512 slabs/grp */

typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;
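
/*
 * Illustrative sketch (not part of the original code): with the grouping
 * above, the slab for the cluster at index "ix" (i.e. MTOCL(addr)) would
 * be located roughly as:
 *
 *	slg = slabstbl[ix / NSLABSPMB];
 *	sp  = &slg->slg_slab[ix % NSLABSPMB];
 *
 * assuming the group has already been allocated; slab_get() performs the
 * actual lookup.
 */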
/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPCL];	/* array of audits */
} mcl_audit_t;

#if CONFIG_MBUF_NOEXPAND
static unsigned int maxmbufcl;
#endif /* CONFIG_MBUF_NOEXPAND */
/*
 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
 * and m_ext structures.  If auditing is enabled, we allocate a shadow
 * mbuf structure of this size inside each audit structure, and the
 * contents of the real mbuf gets copied into it when the mbuf is freed.
 * This allows us to pattern-fill the mbuf for integrity check, and to
 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
 * Note that we don't save the contents of clusters when they are freed;
 * we simply pattern-fill them.
 */
#if defined(__LP64__)
#define	AUDIT_CONTENTS_SIZE	160
#else
#define	AUDIT_CONTENTS_SIZE	80
#endif /* __LP64__ */
/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */
/*
 * Each of the following two arrays hold up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

int nclusters;		/* # of clusters for non-jumbo (legacy) sizes */
int njcl;		/* # of clusters for jumbo sizes */
int njclbytes;		/* size of a jumbo cluster */
union mcluster *mbutl;	/* first mapped cluster address */
union mcluster *embutl;	/* ending virtual address of mclusters */
int max_linkhdr;	/* largest link-level header */
int max_protohdr;	/* largest protocol header */
int max_hdr;		/* largest link+protocol header */
int max_datalen;	/* MHLEN - max_hdr */
/* TODO: should be in header file */

/* The minimum number of objects that are allocated, to start. */
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MCL_LOWAT	MINCL
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL
typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
} mbuf_table_t;
#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of sleepers */

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);
/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m) \
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain cluster index and base cluster address.
 */
#define	MTOCL(x)	(((char *)(x) - (char *)mbutl) >> MCLSHIFT)
#define	CLTOM(x)	((union mcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to the cluster base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> 8)
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT(m, pkthdr, type) {					\
	_MCHECK(m);							\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_len = 0;							\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
		(m)->m_flags = 0;					\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		(m)->m_pkthdr.rcvif = NULL;				\
		(m)->m_pkthdr.len = 0;					\
		(m)->m_pkthdr.header = NULL;				\
		(m)->m_pkthdr.csum_flags = 0;				\
		(m)->m_pkthdr.csum_data = 0;				\
		(m)->m_pkthdr.reserved0 = NULL;				\
		(m)->m_pkthdr.vlan_tag = 0;				\
		(m)->m_pkthdr.socket_id = 0;				\
		m_tag_init(m);						\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
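
/*
 * Illustrative usage (not part of the original code): a composite
 * mbuf + 2K cluster object is constructed with something like:
 *
 *	MBUF_INIT(m, 0, MT_FREE);
 *	MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
 *
 * which is essentially what mbuf_cslab_alloc() does further below when it
 * builds composite objects out of the rudimentary mbuf and cluster caches.
 */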
/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
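
/*
 * Note (descriptive, added for clarity): MB_STAT_SIZE(n) is an offsetof-style
 * computation; it yields the number of bytes needed for an mb_stat_t whose
 * variable-length mbs_class[] array carries "n" class statistics entries.
 */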
/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater or
 * equal than MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
/* This should be in a header file */
#define	atomic_add_32(a, n)	((void) OSAddAtomic(n, (volatile SInt32 *)a))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < MBSTAT_MTYPES_MAX) {		\
		atomic_add_32(&mbstat.m_mtypes[type], n);		\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
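
/*
 * Illustrative usage (not part of the original code): when an mbuf of a
 * given type is handed out or returned, the per-CPU counters are adjusted
 * with, e.g.:
 *
 *	mtype_stat_inc(MT_DATA);	// one more MT_DATA mbuf in use
 *	mtype_stat_dec(MT_FREE);	// one fewer free mbuf
 *
 * Types >= MT_MAX fall through to the global mbstat.m_mtypes[] array, as
 * described in the comment above.
 */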
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mtypes_cpu_t mtc;
	int m, n;

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mcache_t *cp;
	mcache_cpu_t *ccp;
	mb_class_stat_t *sp;
	int k, m, bktsize;

	lck_mtx_lock(mbuf_mlock);
	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache and mbufs */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}
	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, mb_stat, MB_STAT_SIZE(NELEM(mbuf_table))));
}
static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
	} while (!OSCompareAndSwap(old, new, addr));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
static void
mbuf_table_init(void)
{
	int m;

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];

#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters is going to be split in 2 to hold both the 2K
	 * and the 4K pools, so make sure each half is even.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2K clusters, so make
		 * sure that the pool size is evenly divisible by 8.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
	}

#if CONFIG_MBUF_NOEXPAND
	/* Only use 4k clusters if we're setting aside more than 256k */
	if (nmbclusters <= 128) {
		maxmbufcl = nmbclusters / 4;
	} else {
		/* Half to big clusters, half to small */
		maxmbufcl = (nmbclusters / 4) * 3;
	}
#endif /* CONFIG_MBUF_NOEXPAND */

	/*
	 * 1/2 of the map is reserved for 2K clusters.  Out of this, 1/16th
	 * of the total number of 2K clusters allocated is reserved and cannot
	 * be turned into mbufs.  It can only be used for pure cluster objects.
	 */
	m_minlimit(MC_CL) = (nclusters >> 5);
	m_maxlimit(MC_CL) = (nclusters >> 1);
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * The remaining (15/16th) can be turned into mbufs.
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * The other 1/2 of the map is reserved for 4K clusters.
	 */
	m_minlimit(MC_BIGCL) = 0;
	m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = NBPG;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> 3);
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	int initmcl = MINCL;
	int mcl_pages;
	void *buf;

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);

	/* Allocate cluster slabs table */
	maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/* Allocate audit structures if needed */
	PE_parse_boot_arg("mbuf_debug", &mbuf_debug);
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_AUDIT) {
		MALLOC(mclaudit, mcl_audit_t *,
		    nmbclusters * sizeof (*mclaudit), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = nmbclusters/(NBPG/CLBYTES);
	MALLOC(mcl_paddr, int *, mcl_pages * sizeof (int), M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (int));

	embutl = (union mcluster *)
	    ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));

	PE_parse_boot_arg("initmcl", &initmcl);

	lck_mtx_lock(mbuf_mlock);

	if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
		panic("mbinit: m_clalloc failed\n");

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread(kernel_task, mbuf_worker_thread_init);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc;
		u_int32_t flags = mbuf_debug;

		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, mbuf_slab_notify,
		    (void *)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	printf("mbinit: done\n");
}
/*
 * Obtain a slab of object(s) from the class's freelist.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in hoping that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunks (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if (class == MC_MBUF && (wait & MCR_COMP))
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	else
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return (NULL);
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));

	if (class == MC_MBUF) {
		sp->sl_head = buf->obj_next;
		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
	} else {
		sp->sl_head = NULL;
	}

	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	/* Increment slab reference */
	sp->sl_refcnt++;

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF)
			mca->mca_uflags |= MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mcl_slab_t *nsp = sp->sl_next;
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * Increment 2nd slab.  A 4K big cluster takes
		 * 2 slabs, each having at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
		/* Next slab must already be present */
		VERIFY(nsp != NULL);
		nsp->sl_refcnt++;
		VERIFY(!slab_is_detached(nsp));
		VERIFY(nsp->sl_class == MC_BIGCL &&
		    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
		    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
		    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
		    nsp->sl_head == NULL);
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-8th slab.  A 16K big cluster takes
		 * 8 cluster slabs, each having at most 1 reference.
		 */
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		ASSERT(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL)
			_MCHECK((struct mbuf *)buf);
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 2K cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPCL at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 &&
		    (unsigned short)sp->sl_refcnt <= NMBPCL &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
		    sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
		slab_remove(sp, class);
	}

	return (buf);
}
/*
 * Place a slab of object(s) back into a class's slab list.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);
	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	if (class == MC_CL || class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A 2K cluster slab can have at most 1 reference
		 * which must be 0 at this point.
		 */
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		if (class == MC_BIGCL) {
			mcl_slab_t *nsp = sp->sl_next;
			VERIFY(IS_P2ALIGNED(buf, NBPG));
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			/* Decrement 2nd slab reference */
			nsp->sl_refcnt--;
			/*
			 * A 4K big cluster takes 2 slabs, both
			 * must now have 0 reference.
			 */
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_BIGCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16K cluster takes 8 cluster slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, NBPG));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * An mbuf slab has a total of NMBPCL reference counts.
		 * Since we have decremented the reference above, it
		 * must now be between 0 and NMBPCL-1.
		 */
		VERIFY(sp->sl_refcnt >= 0 &&
		    (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
		mca->mca_uflags &= ~MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/* All mbufs are freed; return the cluster that we stole earlier */
	if (sp->sl_refcnt == 0 && class == MC_MBUF) {
		int i = NMBPCL;

		m_total(MC_MBUF) -= NMBPCL;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPCL;
		mtype_stat_add(MT_FREE, -NMBPCL);

		while (i--) {
			struct mbuf *m = sp->sl_head;
			VERIFY(m != NULL);
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		VERIFY(sp->sl_head == NULL);

		/* Remove the slab from the mbuf class's slab list */
		slab_remove(sp, class);

		/* Reinitialize it as a 2K cluster slab */
		slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
		    m_maxsize(MC_CL), 0, 1);

		if (mclaudit != NULL)
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_head, m_maxsize(MC_CL));

		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);

		VERIFY(slab_is_detached(sp));
		/* And finally switch class */
		class = MC_CL;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp))
		slab_insert(sp, class);
}
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			if (--need == 0) {
				/*
				 * If the number of elements in freelist has
				 * dropped below low watermark, asynchronously
				 * populate the freelist now rather than doing
				 * it later when we run out of elements.
				 */
				if (!mbuf_cached_above(class, wait) &&
				    m_infree(class) < m_total(class) >> 5) {
					(void) freelist_populate(class, 1,
					    M_DONTWAIT);
				}
				break;
			}
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0)
				continue;

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait))
				break;

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait))
				break;

			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return (num - need);
}
/*
 * Common de-allocator for rudimentary objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_obj_t *nlist;
	unsigned int num = 0;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;
		slab_free(class, list);
		++num;
		if ((list = nlist) == NULL)
			break;
	}
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for rudimentary objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	while (list != NULL) {
		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(class, list);

		/* Do the sanity checks */
		if (class == MC_MBUF) {
			mcl_audit_mbuf(mca, list, FALSE, alloc);
			ASSERT(mca->mca_uflags & MB_SCVALID);
		} else {
			mcl_audit_cluster(mca, list, m_maxsize(class),
			    alloc, TRUE);
			ASSERT(!(mca->mca_uflags & MB_SCVALID));
		}
		/* Record this transaction */
		mcache_buffer_log(mca, list, m_cache(class));
		if (alloc)
			mca->mca_uflags |= MB_INUSE;
		else
			mca->mca_uflags &= ~MB_INUSE;
		/* Unpair the object (unconditionally) */
		mca->mca_uptr = NULL;
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
/*
 * Common notify routine for all caches.  It is called by mcache when
 * one or more objects get freed.  We use this indication to trigger
 * the wakeup of any sleeping threads so that they can retry their
 * allocation requests.
 */
static void
mbuf_slab_notify(void *arg, u_int32_t reason)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	int w;

	ASSERT(MBUF_CLASS_VALID(class));

	if (reason != MCN_RETRYALLOC)
		return;

	lck_mtx_lock(mbuf_mlock);
	if ((w = mb_waiters) > 0) {
		m_notified(class)++;
		mb_waiters = 0;
	}
	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Obtain object(s) from the composite class's freelist.
 */
static unsigned int
cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
{
	unsigned int need = num;
	mcl_slab_t *sp, *clsp, *nsp;
	struct mbuf *m;
	mcache_obj_t **list = *plist;
	void *cl;

	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Get what we can from the freelist */
	while ((*list = m_cobjlist(class)) != NULL) {
		m = (struct mbuf *)*list;
		sp = slab_get(m);
		cl = m->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(m->m_flags == M_EXT && cl != NULL);
		VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
		VERIFY(clsp->sl_refcnt == 1);
		if (class == MC_MBUF_BIGCL) {
			nsp = clsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			VERIFY(nsp->sl_refcnt == 1);
		} else if (class == MC_MBUF_16KCL) {
			int k;
			for (nsp = clsp, k = 1;
			    k < (M16KCLBYTES / MCLBYTES); k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
		    !MBUF_IN_MAP(m_cobjlist(class))) {
			slab_nextptr_panic(sp, m_cobjlist(class));
			/* NOTREACHED */
		}
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		if (--need == 0)
			break;
	}
	m_infree(class) -= (num - need);

	return (num - need);
}
/*
 * Place object(s) back into a composite class's freelist.
 */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;
	mcl_slab_t *clsp, *nsp;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	o = tail = list;

	while ((m = ms = (struct mbuf *)o) != NULL) {
		mcache_obj_t *rfa, *nexto = o->obj_next;

		/* Do the mbuf sanity checks */
		if (mclaudit != NULL) {
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF));
			ms = (struct mbuf *)mca->mca_contents;
		}

		/* Do the cluster sanity checks */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		if (mclaudit != NULL) {
			size_t size;
			if (class == MC_MBUF_CL)
				size = m_maxsize(MC_CL);
			else if (class == MC_MBUF_BIGCL)
				size = m_maxsize(MC_BIGCL);
			else
				size = m_maxsize(MC_16KCL);
			mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL,
			    (mcache_obj_t *)cl), cl, 0, size);
		}
		VERIFY(ms->m_type == MT_FREE);
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		VERIFY(clsp->sl_refcnt == 1);
		if (class == MC_MBUF_BIGCL) {
			nsp = clsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			VERIFY(nsp->sl_refcnt == 1);
		} else if (class == MC_MBUF_16KCL) {
			int k;
			for (nsp = clsp, k = 1;
			    k < (M16KCLBYTES / MCLBYTES); k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL)
				mcl_audit_restore_mbuf(m, mca, TRUE);

			MEXT_REF(m) = 0;
			MEXT_FLAGS(m) = 0;

			rfa = (mcache_obj_t *)MEXT_RFA(m);
			rfa->obj_next = ref_list;
			ref_list = rfa;
			MEXT_RFA(m) = NULL;

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Save mbuf fields and make auditing happy */
			if (mclaudit != NULL)
				mcl_audit_mbuf(mca, o, FALSE, FALSE);

			VERIFY(m_total(class) > 0);
			m_total(class)--;

			/* Free the mbuf */
			o->obj_next = NULL;
			slab_free(MC_MBUF, o);

			/* And free the cluster */
			((mcache_obj_t *)cl)->obj_next = NULL;
			if (class == MC_MBUF_CL)
				slab_free(MC_CL, cl);
			else if (class == MC_MBUF_BIGCL)
				slab_free(MC_BIGCL, cl);
			else
				slab_free(MC_16KCL, cl);
		}

		++num;
		tail = o;
		o = nexto;
	}

	if (!purged) {
		tail->obj_next = m_cobjlist(class);
		m_cobjlist(class) = list;
		m_infree(class) += num;
	} else if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	return (num);
}
/*
 * Common allocator for composite objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in
 * the bucket layer.  It returns one or more composite elements from the
 * appropriate global freelist.  If the freelist is empty, it will attempt
 * to obtain the rudimentary objects from their caches and construct them
 * into composite mbuf + cluster objects.
 */
static unsigned int
mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
    int wait)
{
    mbuf_class_t class = (mbuf_class_t)arg;
    mcache_t *cp = NULL;
    unsigned int num = 0, cnum = 0, want = needed;
    mcache_obj_t *ref_list = NULL;
    mcache_obj_t *mp_list = NULL;
    mcache_obj_t *clp_list = NULL;
    mcache_obj_t **list;
    struct ext_ref *rfa;
    struct mbuf *m;
    void *cl;
    size_t size;

    ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
    VERIFY(class != MC_MBUF_16KCL || njcl > 0);

    /* There should not be any slab for this class */
    VERIFY(m_slab_cnt(class) == 0 &&
        m_slablist(class).tqh_first == NULL &&
        m_slablist(class).tqh_last == NULL);

    lck_mtx_lock(mbuf_mlock);

    /* Try using the freelist first */
    num = cslab_alloc(class, plist, needed);

    if (num == needed) {
        m_alloc_cnt(class) += num;
        lck_mtx_unlock(mbuf_mlock);
        return (needed);
    }

    lck_mtx_unlock(mbuf_mlock);

    /*
     * We could not satisfy the request using the freelist alone;
     * allocate from the appropriate rudimentary caches and use
     * whatever we can get to construct the composite objects.
     */
    needed -= num;
    list = *plist;

    /*
     * Mark these allocation requests as coming from a composite cache.
     * Also, if the caller is willing to be blocked, mark the request
     * with MCR_FAILOK such that we don't end up sleeping at the mbuf
     * slab layer waiting for the individual object when one or more
     * of the already-constructed composite objects are available.
     */
    wait |= MCR_COMP;
    if (!(wait & MCR_NOSLEEP))
        wait |= MCR_FAILOK;

    needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
    if (needed == 0) {
        ASSERT(mp_list == NULL);
        goto fail;
    }

    if (class == MC_MBUF_CL)
        cp = m_cache(MC_CL);
    else if (class == MC_MBUF_BIGCL)
        cp = m_cache(MC_BIGCL);
    else
        cp = m_cache(MC_16KCL);
    needed = mcache_alloc_ext(cp, &clp_list, needed, wait);
    if (needed == 0) {
        ASSERT(clp_list == NULL);
        goto fail;
    }

    needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
    if (needed == 0) {
        ASSERT(ref_list == NULL);
        goto fail;
    }

    /*
     * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
     * overs will get freed accordingly before we return to caller.
     */
    for (cnum = 0; cnum < needed; cnum++) {
        struct mbuf *ms;

        m = ms = (struct mbuf *)mp_list;
        mp_list = mp_list->obj_next;

        cl = clp_list;
        clp_list = clp_list->obj_next;
        ((mcache_obj_t *)cl)->obj_next = NULL;

        rfa = (struct ext_ref *)ref_list;
        ref_list = ref_list->obj_next;
        ((mcache_obj_t *)rfa)->obj_next = NULL;

        /*
         * If auditing is enabled, construct the shadow mbuf
         * in the audit structure instead of in the actual one.
         * mbuf_cslab_audit() will take care of restoring the
         * contents after the integrity check.
         */
        if (mclaudit != NULL) {
            mcache_audit_t *mca, *cl_mca;

            lck_mtx_lock(mbuf_mlock);
            mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
            ms = ((struct mbuf *)mca->mca_contents);
            cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);

            /*
             * Pair them up.  Note that this is done at the time
             * the mbuf+cluster objects are constructed.  This
             * information should be treated as "best effort"
             * debugging hint since more than one mbufs can refer
             * to a cluster.  In that case, the cluster might not
             * be freed along with the mbuf it was paired with.
             */
            mca->mca_uptr = cl_mca;
            cl_mca->mca_uptr = mca;

            ASSERT(mca->mca_uflags & MB_SCVALID);
            ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
            lck_mtx_unlock(mbuf_mlock);

            /* Technically, they are in the freelist */
            mcache_set_pattern(MCACHE_FREE_PATTERN, m,
                m_maxsize(MC_MBUF));
            if (class == MC_MBUF_CL)
                size = m_maxsize(MC_CL);
            else if (class == MC_MBUF_BIGCL)
                size = m_maxsize(MC_BIGCL);
            else
                size = m_maxsize(MC_16KCL);
            mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size);
        }

        MBUF_INIT(ms, 0, MT_FREE);
        if (class == MC_MBUF_16KCL) {
            MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
        } else if (class == MC_MBUF_BIGCL) {
            MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
        } else {
            MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
        }
        VERIFY(ms->m_flags == M_EXT);
        VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));

        *list = (mcache_obj_t *)m;
        (*list)->obj_next = NULL;
        list = *plist = &(*list)->obj_next;
    }

fail:
    /*
     * Free up what's left of the above.
     */
    if (mp_list != NULL)
        mcache_free_ext(m_cache(MC_MBUF), mp_list);
    if (clp_list != NULL)
        mcache_free_ext(cp, clp_list);
    if (ref_list != NULL)
        mcache_free_ext(ref_cache, ref_list);

    lck_mtx_lock(mbuf_mlock);
    if (num > 0 || cnum > 0) {
        m_total(class) += cnum;
        VERIFY(m_total(class) <= m_maxlimit(class));
        m_alloc_cnt(class) += num + cnum;
    }
    if ((num + cnum) < want)
        m_fail_cnt(class) += (want - (num + cnum));
    lck_mtx_unlock(mbuf_mlock);

    return (num + cnum);
}
/*
 * Common de-allocator for composite objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
{
    mbuf_class_t class = (mbuf_class_t)arg;
    unsigned int num;
    int w;

    ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));

    lck_mtx_lock(mbuf_mlock);

    num = cslab_free(class, list, purged);
    m_free_cnt(class) += num;

    if ((w = mb_waiters) > 0)
        mb_waiters = 0;

    lck_mtx_unlock(mbuf_mlock);

    if (w != 0)
        wakeup(mb_waitchan);
}
/*
 * Common auditor for composite objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
    mbuf_class_t class = (mbuf_class_t)arg;
    mcache_audit_t *mca;
    struct mbuf *m, *ms;
    mcl_slab_t *clsp, *nsp;
    size_t size;
    void *cl;
    int k;

    ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));

    while ((m = ms = (struct mbuf *)list) != NULL) {
        lck_mtx_lock(mbuf_mlock);
        /* Do the mbuf sanity checks and record its transaction */
        mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
        mcl_audit_mbuf(mca, m, TRUE, alloc);
        mcache_buffer_log(mca, m, m_cache(class));
        if (alloc)
            mca->mca_uflags |= MB_COMP_INUSE;
        else
            mca->mca_uflags &= ~MB_COMP_INUSE;

        /*
         * Use the shadow mbuf in the audit structure if we are
         * freeing, since the contents of the actual mbuf has been
         * pattern-filled by the above call to mcl_audit_mbuf().
         */
        if (!alloc)
            ms = (struct mbuf *)mca->mca_contents;

        /* Do the cluster sanity checks and record its transaction */
        cl = ms->m_ext.ext_buf;
        clsp = slab_get(cl);
        VERIFY(ms->m_flags == M_EXT && cl != NULL);
        VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
        VERIFY(clsp->sl_refcnt == 1);
        if (class == MC_MBUF_BIGCL) {
            nsp = clsp->sl_next;
            /* Next slab must already be present */
            VERIFY(nsp != NULL);
            VERIFY(nsp->sl_refcnt == 1);
        } else if (class == MC_MBUF_16KCL) {
            for (nsp = clsp, k = 1;
                k < (M16KCLBYTES / MCLBYTES); k++) {
                nsp = nsp->sl_next;
                /* Next slab must already be present */
                VERIFY(nsp != NULL);
                VERIFY(nsp->sl_refcnt == 1);
            }
        }

        mca = mcl_audit_buf2mca(MC_CL, cl);
        if (class == MC_MBUF_CL)
            size = m_maxsize(MC_CL);
        else if (class == MC_MBUF_BIGCL)
            size = m_maxsize(MC_BIGCL);
        else
            size = m_maxsize(MC_16KCL);
        mcl_audit_cluster(mca, cl, size, alloc, FALSE);
        mcache_buffer_log(mca, cl, m_cache(class));
        if (alloc)
            mca->mca_uflags |= MB_COMP_INUSE;
        else
            mca->mca_uflags &= ~MB_COMP_INUSE;
        lck_mtx_unlock(mbuf_mlock);

        list = list->obj_next;
    }
}
/*
 * Allocate some number of mbuf clusters and place on cluster freelist.
 */
static int
m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
{
    int i;
    int numpages = 0;
    size_t size = 0;
    vm_offset_t page = 0;
    mcache_audit_t *mca_list = NULL;
    mcache_obj_t *con_list = NULL;
    mcl_slab_t *sp, *nsp;
    int k;

    VERIFY(bufsize == m_maxsize(MC_CL) ||
        bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    /*
     * Multiple threads may attempt to populate the cluster map one
     * after another.  Since we drop the lock below prior to acquiring
     * the physical page(s), our view of the cluster map may no longer
     * be accurate, and we could end up over-committing the pages beyond
     * the maximum allowed for each class.  To prevent it, this entire
     * operation (including the page mapping) is serialized.
     */
    while (mb_clalloc_busy) {
        mb_clalloc_waiters++;
        (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
            (PZERO-1), "m_clalloc", NULL);
        lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
    }

    /* We are busy now; tell everyone else to go away */
    mb_clalloc_busy = TRUE;

    /*
     * Honor the caller's wish to block or not block.  We have a way
     * to grow the pool asynchronously using the mbuf worker thread.
     */
    i = m_howmany(num, bufsize);
    if (i == 0 || (wait & M_DONTWAIT))
        goto out;

    lck_mtx_unlock(mbuf_mlock);

    size = round_page_32(i * bufsize);
    page = kmem_mb_alloc(mb_map, size);

    if (page == 0) {
        if (bufsize <= m_maxsize(MC_BIGCL)) {
            /* Try for 1 page if failed, only for 2KB/4KB request */
            size = NBPG;
            page = kmem_mb_alloc(mb_map, size);
        }

        if (page == 0) {
            lck_mtx_lock(mbuf_mlock);
            goto out;
        }
    }

    VERIFY(IS_P2ALIGNED(page, NBPG));
    numpages = size / NBPG;

    /* If auditing is enabled, allocate the audit structures now */
    if (mclaudit != NULL) {
        int needed;

        /*
         * Yes, I realize this is a waste of memory for clusters
         * that never get transformed into mbufs, as we may end
         * up with NMBPCL-1 unused audit structures per cluster.
         * But doing so tremendously simplifies the allocation
         * strategy, since at this point we are not holding the
         * mbuf lock and the caller is okay to be blocked.  For
         * the case of big clusters, we allocate one structure
         * for each as we never turn them into mbufs.
         */
        if (bufsize == m_maxsize(MC_CL)) {
            needed = numpages * 2 * NMBPCL;

            i = mcache_alloc_ext(mcl_audit_con_cache,
                &con_list, needed, MCR_SLEEP);

            VERIFY(con_list != NULL && i == needed);
        } else if (bufsize == m_maxsize(MC_BIGCL)) {
            needed = numpages;
        } else {
            needed = numpages / (M16KCLBYTES / NBPG);
        }

        i = mcache_alloc_ext(mcache_audit_cache,
            (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);

        VERIFY(mca_list != NULL && i == needed);
    }

    lck_mtx_lock(mbuf_mlock);
    for (i = 0; i < numpages; i++, page += NBPG) {
        ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
        ppnum_t new_page = pmap_find_phys(kernel_pmap,
            (vm_address_t)page);

        /*
         * In the case of no mapper being available the following
         * code noops and returns the input page; if there is a
         * mapper the appropriate I/O page is returned.
         */
        new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
        mcl_paddr[offset] = new_page << PGSHIFT;

        /* Pattern-fill this fresh page */
        if (mclaudit != NULL)
            mcache_set_pattern(MCACHE_FREE_PATTERN,
                (caddr_t)page, NBPG);

        if (bufsize == m_maxsize(MC_CL)) {
            union mcluster *mcl = (union mcluster *)page;

            /* 1st cluster in the page */
            sp = slab_get(mcl);
            if (mclaudit != NULL)
                mcl_audit_init(mcl, &mca_list, &con_list,
                    AUDIT_CONTENTS_SIZE, NMBPCL);

            VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
            slab_init(sp, MC_CL, SLF_MAPPED,
                mcl, mcl, bufsize, 0, 1);

            /* Insert this slab */
            slab_insert(sp, MC_CL);

            /* Update stats now since slab_get() drops the lock */
            mbstat.m_clfree = ++m_infree(MC_CL) +
                m_infree(MC_MBUF_CL);
            mbstat.m_clusters = ++m_total(MC_CL);
            VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));

            /* 2nd cluster in the page */
            sp = slab_get(++mcl);
            if (mclaudit != NULL)
                mcl_audit_init(mcl, &mca_list, &con_list,
                    AUDIT_CONTENTS_SIZE, NMBPCL);

            VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
            slab_init(sp, MC_CL, SLF_MAPPED,
                mcl, mcl, bufsize, 0, 1);

            /* Insert this slab */
            slab_insert(sp, MC_CL);

            /* Update stats now since slab_get() drops the lock */
            mbstat.m_clfree = ++m_infree(MC_CL) +
                m_infree(MC_MBUF_CL);
            mbstat.m_clusters = ++m_total(MC_CL);
            VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
        } else if (bufsize == m_maxsize(MC_BIGCL)) {
            union mbigcluster *mbc = (union mbigcluster *)page;

            /* One for the entire page */
            sp = slab_get(mbc);
            if (mclaudit != NULL)
                mcl_audit_init(mbc, &mca_list, NULL, 0, 1);

            VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
            slab_init(sp, MC_BIGCL, SLF_MAPPED,
                mbc, mbc, bufsize, 0, 1);

            /* 2nd cluster's slab is part of the previous one */
            nsp = slab_get(((union mcluster *)page) + 1);
            slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL,
                mbc, NULL, 0, 0, 0);

            /* Insert this slab */
            slab_insert(sp, MC_BIGCL);

            /* Update stats now since slab_get() drops the lock */
            mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
                m_infree(MC_MBUF_BIGCL);
            mbstat.m_bigclusters = ++m_total(MC_BIGCL);
            VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
        } else if ((i % (M16KCLBYTES / NBPG)) == 0) {
            union m16kcluster *m16kcl = (union m16kcluster *)page;

            /* One for the entire 16KB */
            sp = slab_get(m16kcl);
            if (mclaudit != NULL)
                mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);

            VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
            slab_init(sp, MC_16KCL, SLF_MAPPED,
                m16kcl, m16kcl, bufsize, 0, 1);

            /* 2nd-8th cluster's slab is part of the first one */
            for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
                nsp = slab_get(((union mcluster *)page) + k);
                VERIFY(nsp->sl_refcnt == 0 &&
                    nsp->sl_flags == 0);
                slab_init(nsp, MC_16KCL,
                    SLF_MAPPED | SLF_PARTIAL,
                    m16kcl, NULL, 0, 0, 0);
            }

            /* Insert this slab */
            slab_insert(sp, MC_16KCL);

            /* Update stats now since slab_get() drops the lock */
            m_infree(MC_16KCL)++;
            m_total(MC_16KCL)++;
            VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
        }
    }
    VERIFY(mca_list == NULL && con_list == NULL);

    /* We're done; let others enter */
    mb_clalloc_busy = FALSE;
    if (mb_clalloc_waiters > 0) {
        mb_clalloc_waiters = 0;
        wakeup(mb_clalloc_waitchan);
    }

    if (bufsize == m_maxsize(MC_CL))
        return (numpages << 1);
    else if (bufsize == m_maxsize(MC_BIGCL))
        return (numpages);

    VERIFY(bufsize == m_maxsize(MC_16KCL));
    return (numpages / (M16KCLBYTES / NBPG));
out:
    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    /* We're done; let others enter */
    mb_clalloc_busy = FALSE;
    if (mb_clalloc_waiters > 0) {
        mb_clalloc_waiters = 0;
        wakeup(mb_clalloc_waitchan);
    }

    /*
     * When non-blocking we kick a thread if we have to grow the
     * pool or if the number of free clusters is less than requested.
     */
    if (bufsize == m_maxsize(MC_CL)) {
        if (i > 0) {
            /*
             * Remember total number of clusters needed
             * at this time.
             */
            i += m_total(MC_CL);
            if (i > mbuf_expand_mcl) {
                mbuf_expand_mcl = i;
                if (mbuf_worker_ready)
                    wakeup((caddr_t)&mbuf_worker_run);
            }
        }

        if (m_infree(MC_CL) >= num)
            return (1);
    } else if (bufsize == m_maxsize(MC_BIGCL)) {
        if (i > 0) {
            /*
             * Remember total number of 4KB clusters needed
             * at this time.
             */
            i += m_total(MC_BIGCL);
            if (i > mbuf_expand_big) {
                mbuf_expand_big = i;
                if (mbuf_worker_ready)
                    wakeup((caddr_t)&mbuf_worker_run);
            }
        }

        if (m_infree(MC_BIGCL) >= num)
            return (1);
    } else {
        if (i > 0) {
            /*
             * Remember total number of 16KB clusters needed
             * at this time.
             */
            i += m_total(MC_16KCL);
            if (i > mbuf_expand_16k) {
                mbuf_expand_16k = i;
                if (mbuf_worker_ready)
                    wakeup((caddr_t)&mbuf_worker_run);
            }
        }

        if (m_infree(MC_16KCL) >= num)
            return (1);
    }

    return (0);
}
/*
 * Populate the global freelist of the corresponding buffer class.
 */
static int
freelist_populate(mbuf_class_t class, unsigned int num, int wait)
{
    mcache_obj_t *o = NULL;
    int i;

    VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
        class == MC_16KCL);

#if CONFIG_MBUF_NOEXPAND
    if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) {
        static int printonce = 1;
        if (printonce == 1) {
            printonce = 0;
            printf("m_expand failed, allocated %ld out of %d "
                "clusters\n", mbstat.m_mbufs / NMBPCL,
                nmbclusters);
        }
        return (0);
    }
#endif /* CONFIG_MBUF_NOEXPAND */

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    switch (class) {
    case MC_MBUF:
    case MC_CL:
        i = m_clalloc(num, wait, m_maxsize(MC_CL));

        /* Respect the 2K clusters minimum limit */
        if (m_total(MC_CL) == m_maxlimit(MC_CL) &&
            m_infree(MC_CL) <= m_minlimit(MC_CL)) {
            if (class != MC_CL || (wait & MCR_COMP))
                return (0);
        }
        if (class == MC_CL)
            return (i != 0);
        break;

    case MC_BIGCL:
    case MC_16KCL:
        return (m_clalloc(num, wait, m_maxsize(class)) != 0);
        /* NOTREACHED */

    default:
        VERIFY(0);
        /* NOTREACHED */
    }

    /* Steal a cluster and cut it up to create NMBPCL mbufs */
    if ((o = slab_alloc(MC_CL, wait)) != NULL) {
        struct mbuf *m = (struct mbuf *)o;
        mcache_audit_t *mca = NULL;
        mcl_slab_t *sp = slab_get(o);

        VERIFY(slab_is_detached(sp) &&
            (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

        /* Make sure that the cluster is unmolested while in freelist */
        if (mclaudit != NULL) {
            mca = mcl_audit_buf2mca(MC_CL, o);
            mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL));
        }

        /* Reinitialize it as an mbuf slab */
        slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL,
            sp->sl_len, 0, NMBPCL);

        VERIFY(m == (struct mbuf *)sp->sl_base);
        VERIFY(sp->sl_head == NULL);

        m_total(MC_MBUF) += NMBPCL;
        mbstat.m_mbufs = m_total(MC_MBUF);
        m_infree(MC_MBUF) += NMBPCL;
        mtype_stat_add(MT_FREE, NMBPCL);

        i = NMBPCL;
        while (i--) {
            /*
             * If auditing is enabled, construct the shadow mbuf
             * in the audit structure instead of the actual one.
             * mbuf_slab_audit() will take care of restoring the
             * contents after the integrity check.
             */
            if (mclaudit != NULL) {
                struct mbuf *ms;
                mca = mcl_audit_buf2mca(MC_MBUF,
                    (mcache_obj_t *)m);
                ms = ((struct mbuf *)mca->mca_contents);
                ms->m_type = MT_FREE;
            } else {
                m->m_type = MT_FREE;
            }
            m->m_next = sp->sl_head;
            sp->sl_head = (void *)m++;
        }

        /* Insert it into the mbuf class's slab list */
        slab_insert(sp, MC_MBUF);

        if ((i = mb_waiters) > 0)
            mb_waiters = 0;
        if (i != 0)
            wakeup(mb_waitchan);

        return (1);
    }

    return (0);
}
/*
 * (Inaccurately) check if it might be worth a trip back to the
 * mcache layer due the availability of objects there.  We'll
 * end up back here if there's nothing up there.
 */
static boolean_t
mbuf_cached_above(mbuf_class_t class, int wait)
{
    switch (class) {
    case MC_MBUF:
        if (wait & MCR_COMP)
            return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
                !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
        break;

    case MC_CL:
        if (wait & MCR_COMP)
            return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
        break;

    case MC_BIGCL:
        if (wait & MCR_COMP)
            return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
        break;

    case MC_16KCL:
        if (wait & MCR_COMP)
            return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
        break;

    case MC_MBUF_CL:
    case MC_MBUF_BIGCL:
    case MC_MBUF_16KCL:
        break;

    default:
        VERIFY(0);
        /* NOTREACHED */
    }

    return (!mcache_bkt_isempty(m_cache(class)));
}
/*
 * If possible, convert constructed objects to raw ones.
 */
static boolean_t
mbuf_steal(mbuf_class_t class, unsigned int num)
{
    mcache_obj_t *top = NULL;
    mcache_obj_t **list = &top;
    unsigned int tot = 0;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    switch (class) {
    case MC_MBUF:
    case MC_CL:
    case MC_BIGCL:
    case MC_16KCL:
        return (FALSE);

    case MC_MBUF_CL:
    case MC_MBUF_BIGCL:
    case MC_MBUF_16KCL:
        /* Get the required number of constructed objects if possible */
        if (m_infree(class) > m_minlimit(class)) {
            tot = cslab_alloc(class, &list,
                MIN(num, m_infree(class)));
        }

        /* And destroy them to get back the raw objects */
        if (top != NULL)
            (void) cslab_free(class, top, 1);
        break;

    default:
        VERIFY(0);
        /* NOTREACHED */
    }

    return (tot == num);
}
static void
m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
{
    int m, bmap = 0;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
    VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
    VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));

    /*
     * This logic can be made smarter; for now, simply mark
     * all other related classes as potential victims.
     */
    switch (class) {
    case MC_MBUF:
        m_wantpurge(MC_CL)++;
        m_wantpurge(MC_MBUF_CL)++;
        m_wantpurge(MC_MBUF_BIGCL)++;
        break;

    case MC_CL:
        m_wantpurge(MC_MBUF)++;
        if (!comp)
            m_wantpurge(MC_MBUF_CL)++;
        break;

    case MC_BIGCL:
        if (!comp)
            m_wantpurge(MC_MBUF_BIGCL)++;
        break;

    case MC_16KCL:
        if (!comp)
            m_wantpurge(MC_MBUF_16KCL)++;
        break;

    default:
        break;
    }

    /*
     * Run through each marked class and check if we really need to
     * purge (and therefore temporarily disable) the per-CPU caches
     * layer used by the class.  If so, remember the classes since
     * we are going to drop the lock below prior to purging.
     */
    for (m = 0; m < NELEM(mbuf_table); m++) {
        if (m_wantpurge(m) > 0) {
            m_wantpurge(m) = 0;
            /*
             * Try hard to steal the required number of objects
             * from the freelist of other mbuf classes.  Only
             * purge and disable the per-CPU caches layer when
             * we don't have enough; it's the last resort.
             */
            if (!mbuf_steal(m, num))
                bmap |= (1 << m);
        }
    }

    lck_mtx_unlock(mbuf_mlock);

    if (bmap != 0) {
        /* drain is performed in pfslowtimo(), to avoid deadlocks */

        /* Sigh; we have no other choices but to ask mcache to purge */
        for (m = 0; m < NELEM(mbuf_table); m++) {
            if ((bmap & (1 << m)) &&
                mcache_purge_cache(m_cache(m))) {
                lck_mtx_lock(mbuf_mlock);
                m_purge_cnt(m)++;
                lck_mtx_unlock(mbuf_mlock);
            }
        }
    } else {
        /*
         * Request mcache to reap extra elements from all of its caches;
         * note that all reaps are serialized and happen only at a fixed
         * interval.
         */
        mcache_reap();
    }
    lck_mtx_lock(mbuf_mlock);
}
static inline struct mbuf *
m_get_common(int wait, short type, int hdr)
{
    struct mbuf *m;
    int mcflags = MSLEEPF(wait);

    /* Is this due to a non-blocking retry?  If so, then try harder */
    if (mcflags & MCR_NOSLEEP)
        mcflags |= MCR_TRYHARD;

    m = mcache_alloc(m_cache(MC_MBUF), mcflags);
    if (m != NULL) {
        MBUF_INIT(m, hdr, type);
        mtype_stat_inc(type);
        mtype_stat_dec(MT_FREE);
#if CONFIG_MACF_NET
        if (hdr && mac_init_mbuf(m, wait) != 0) {
            m_free(m);
            return (NULL);
        }
#endif /* MAC_NET */
    }
    return (m);
}

/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 */
#define _M_GET(wait, type)      m_get_common(wait, type, 0)
#define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
#define _M_RETRY(wait, type)    _M_GET(wait, type)
#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
#define _MGET(m, how, type)     ((m) = _M_GET(how, type))
#define _MGETHDR(m, how, type)  ((m) = _M_GETHDR(how, type))

struct mbuf *
m_get(int wait, int type)
{
    return (_M_GET(wait, type));
}

struct mbuf *
m_gethdr(int wait, int type)
{
    return (_M_GETHDR(wait, type));
}

struct mbuf *
m_retry(int wait, int type)
{
    return (_M_RETRY(wait, type));
}

struct mbuf *
m_retryhdr(int wait, int type)
{
    return (_M_RETRYHDR(wait, type));
}

struct mbuf *
m_getclr(int wait, int type)
{
    struct mbuf *m;

    _MGET(m, wait, type);
    if (m == NULL)
        return (NULL);

    bzero(MTOD(m, caddr_t), MLEN);
    return (m);
}
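
/*
 * Illustrative sketch (not part of the original source): typical use of the
 * wrappers above by a hypothetical in-kernel caller.  The function name
 * example_alloc_hdr() is made up for illustration only.
 */
#if 0 /* example only */
static struct mbuf *
example_alloc_hdr(void)
{
    struct mbuf *m;

    /* Non-blocking allocation of a packet-header mbuf */
    m = m_gethdr(M_DONTWAIT, MT_DATA);
    if (m == NULL)
        return (NULL);

    /* The caller sets m->m_len and m->m_pkthdr.len before use */
    return (m);
}
#endif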
struct mbuf *
m_free(struct mbuf *m)
{
    struct mbuf *n = m->m_next;

    if (m->m_type == MT_FREE)
        panic("m_free: freeing an already freed mbuf");

    /* Free the aux data and tags if there is any */
    if (m->m_flags & M_PKTHDR) {
        m_tag_delete_chain(m, NULL);
    }

    if (m->m_flags & M_EXT) {
        u_int32_t refcnt;
        u_int32_t flags;

        refcnt = m_decref(m);
        flags = MEXT_FLAGS(m);
        if (refcnt == 0 && flags == 0) {
            if (m->m_ext.ext_free == NULL) {
                mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
            } else if (m->m_ext.ext_free == m_bigfree) {
                mcache_free(m_cache(MC_BIGCL),
                    m->m_ext.ext_buf);
            } else if (m->m_ext.ext_free == m_16kfree) {
                mcache_free(m_cache(MC_16KCL),
                    m->m_ext.ext_buf);
            } else {
                (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
                    m->m_ext.ext_size, m->m_ext.ext_arg);
            }
            mcache_free(ref_cache, MEXT_RFA(m));
        } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
            VERIFY(m->m_type != MT_FREE);

            mtype_stat_dec(m->m_type);
            mtype_stat_inc(MT_FREE);

            m->m_type = MT_FREE;
            m->m_next = m->m_nextpkt = NULL;

            /* "Free" into the intermediate cache */
            if (m->m_ext.ext_free == NULL) {
                mcache_free(m_cache(MC_MBUF_CL), m);
            } else if (m->m_ext.ext_free == m_bigfree) {
                mcache_free(m_cache(MC_MBUF_BIGCL), m);
            } else {
                VERIFY(m->m_ext.ext_free == m_16kfree);
                mcache_free(m_cache(MC_MBUF_16KCL), m);
            }
            return (n);
        }
    }

    if (m->m_type != MT_FREE) {
        mtype_stat_dec(m->m_type);
        mtype_stat_inc(MT_FREE);
    }

    m->m_type = MT_FREE;
    m->m_flags = m->m_len = 0;
    m->m_next = m->m_nextpkt = NULL;

    mcache_free(m_cache(MC_MBUF), m);

    return (n);
}
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf,
    void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
    int wait)
{
    struct ext_ref *rfa = NULL;

    if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
        return (NULL);

    if (m->m_flags & M_EXT) {
        u_int32_t refcnt;
        u_int32_t flags;

        refcnt = m_decref(m);
        flags = MEXT_FLAGS(m);
        if (refcnt == 0 && flags == 0) {
            if (m->m_ext.ext_free == NULL) {
                mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
            } else if (m->m_ext.ext_free == m_bigfree) {
                mcache_free(m_cache(MC_BIGCL),
                    m->m_ext.ext_buf);
            } else if (m->m_ext.ext_free == m_16kfree) {
                mcache_free(m_cache(MC_16KCL),
                    m->m_ext.ext_buf);
            } else {
                (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
                    m->m_ext.ext_size, m->m_ext.ext_arg);
            }
            /* Re-use the reference structure */
            rfa = MEXT_RFA(m);
        } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
            VERIFY(m->m_type != MT_FREE);

            mtype_stat_dec(m->m_type);
            mtype_stat_inc(MT_FREE);

            m->m_type = MT_FREE;
            m->m_next = m->m_nextpkt = NULL;
            /* "Free" into the intermediate cache */
            if (m->m_ext.ext_free == NULL) {
                mcache_free(m_cache(MC_MBUF_CL), m);
            } else if (m->m_ext.ext_free == m_bigfree) {
                mcache_free(m_cache(MC_MBUF_BIGCL), m);
            } else {
                VERIFY(m->m_ext.ext_free == m_16kfree);
                mcache_free(m_cache(MC_MBUF_16KCL), m);
            }
            /*
             * Allocate a new mbuf, since we didn't divorce
             * the composite mbuf + cluster pair above.
             */
            if ((m = _M_GETHDR(wait, type)) == NULL)
                return (NULL);
        }
    }

    if (rfa == NULL &&
        (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
        m_free(m);
        return (NULL);
    }

    MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);

    return (m);
}
/* m_mclget() adds an mbuf cluster to a normal mbuf */
struct mbuf *
m_mclget(struct mbuf *m, int wait)
{
    struct ext_ref *rfa;

    if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
        return (m);

    m->m_ext.ext_buf = m_mclalloc(wait);
    if (m->m_ext.ext_buf != NULL) {
        MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
    } else {
        mcache_free(ref_cache, rfa);
    }
    return (m);
}

/* Allocate an mbuf cluster */
caddr_t
m_mclalloc(int wait)
{
    int mcflags = MSLEEPF(wait);

    /* Is this due to a non-blocking retry?  If so, then try harder */
    if (mcflags & MCR_NOSLEEP)
        mcflags |= MCR_TRYHARD;

    return (mcache_alloc(m_cache(MC_CL), mcflags));
}

/* Free an mbuf cluster */
void
m_mclfree(caddr_t p)
{
    mcache_free(m_cache(MC_CL), p);
}

/*
 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
 * another mbuf.
 */
int
m_mclhasreference(struct mbuf *m)
{
    if (!(m->m_flags & M_EXT))
        return (0);

    ASSERT(MEXT_RFA(m) != NULL);

    return (MEXT_REF(m) > 1);
}
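
/*
 * Illustrative sketch (not part of the original source): attaching a 2KB
 * cluster to a freshly allocated mbuf and backing out on failure.  The
 * function name example_get_cluster_mbuf() is hypothetical.
 */
#if 0 /* example only */
static struct mbuf *
example_get_cluster_mbuf(int wait)
{
    struct mbuf *m;

    m = m_gethdr(wait, MT_DATA);
    if (m == NULL)
        return (NULL);

    /* m_mclget() leaves M_EXT clear if no cluster could be attached */
    m = m_mclget(m, wait);
    if (!(m->m_flags & M_EXT)) {
        m_free(m);
        return (NULL);
    }
    return (m);
}
#endif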
__private_extern__ caddr_t
m_bigalloc(int wait)
{
    int mcflags = MSLEEPF(wait);

    /* Is this due to a non-blocking retry?  If so, then try harder */
    if (mcflags & MCR_NOSLEEP)
        mcflags |= MCR_TRYHARD;

    return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
}

__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
    mcache_free(m_cache(MC_BIGCL), p);
}

/* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_mbigget(struct mbuf *m, int wait)
{
    struct ext_ref *rfa;

    if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
        return (m);

    m->m_ext.ext_buf = m_bigalloc(wait);
    if (m->m_ext.ext_buf != NULL) {
        MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
    } else {
        mcache_free(ref_cache, rfa);
    }
    return (m);
}
__private_extern__ caddr_t
m_16kalloc(int wait)
{
    int mcflags = MSLEEPF(wait);

    /* Is this due to a non-blocking retry?  If so, then try harder */
    if (mcflags & MCR_NOSLEEP)
        mcflags |= MCR_TRYHARD;

    return (mcache_alloc(m_cache(MC_16KCL), mcflags));
}

__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
    mcache_free(m_cache(MC_16KCL), p);
}

/* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_m16kget(struct mbuf *m, int wait)
{
    struct ext_ref *rfa;

    if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
        return (m);

    m->m_ext.ext_buf = m_16kalloc(wait);
    if (m->m_ext.ext_buf != NULL) {
        MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
    } else {
        mcache_free(ref_cache, rfa);
    }
    return (m);
}
void
m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
{
#if CONFIG_MACF_NET
    /* We will be taking over the tags of 'to' */
    if (to->m_flags & M_PKTHDR)
        m_tag_delete_chain(to, NULL);
#endif /* MAC_NET */
    to->m_pkthdr = from->m_pkthdr;          /* especially tags */
    m_tag_init(from);                       /* purge tags from src */
    to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
    if ((to->m_flags & M_EXT) == 0)
        to->m_data = to->m_pktdat;
}

/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 */
int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
#if CONFIG_MACF_NET
    if (to->m_flags & M_PKTHDR)
        m_tag_delete_chain(to, NULL);
#endif /* MAC_NET */
    to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
    if ((to->m_flags & M_EXT) == 0)
        to->m_data = to->m_pktdat;
    to->m_pkthdr = from->m_pkthdr;
    m_tag_init(to);
    return (m_tag_copy_chain(to, from, how));
}
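
/*
 * Illustrative sketch (not part of the original source): moving a packet
 * header onto a new leading mbuf with m_copy_pkthdr(), e.g. when inserting
 * a fresh header mbuf in front of an existing packet.  m_dup_pkthdr() above
 * is the deep-copying variant.  The function name is hypothetical.
 */
#if 0 /* example only */
static struct mbuf *
example_new_lead(struct mbuf *pkt, int how)
{
    struct mbuf *lead;

    if ((lead = m_gethdr(how, MT_DATA)) == NULL)
        return (NULL);

    /* Take over pkt's header (including tags), like m_prepend() does */
    m_copy_pkthdr(lead, pkt);
    pkt->m_flags &= ~M_PKTHDR;

    lead->m_len = 0;
    lead->m_next = pkt;
    return (lead);
}
#endif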
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if wantall is not set, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
 * are chained on the m_nextpkt field.  Any packets requested beyond this
 * are chained onto the last packet header's m_next field.  The size of
 * the cluster is controlled by the parameter bufsize.
 */
__private_extern__ struct mbuf *
m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
    int wait, int wantall, size_t bufsize)
{
    struct mbuf *m;
    struct mbuf **np, *top;
    unsigned int pnum, needed = *num_needed;
    mcache_obj_t *mp_list = NULL;
    int mcflags = MSLEEPF(wait);
    u_int32_t flag;
    struct ext_ref *rfa;
    mcache_t *cp;
    void *cl;

    ASSERT(bufsize == m_maxsize(MC_CL) ||
        bufsize == m_maxsize(MC_BIGCL) ||
        bufsize == m_maxsize(MC_16KCL));

    /*
     * Caller must first check for njcl because this
     * routine is internal and not exposed/used via KPI.
     */
    VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);

    top = NULL;
    np = &top;

    /*
     * The caller doesn't want all the requested buffers; only some.
     * Try hard to get what we can, but don't block.  This effectively
     * overrides MCR_SLEEP, since this thread will not go to sleep
     * if we can't get all the buffers.
     */
    if (!wantall || (mcflags & MCR_NOSLEEP))
        mcflags |= MCR_TRYHARD;

    /* Allocate the composite mbuf + cluster elements from the cache */
    if (bufsize == m_maxsize(MC_CL))
        cp = m_cache(MC_MBUF_CL);
    else if (bufsize == m_maxsize(MC_BIGCL))
        cp = m_cache(MC_MBUF_BIGCL);
    else
        cp = m_cache(MC_MBUF_16KCL);
    needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);

    for (pnum = 0; pnum < needed; pnum++) {
        m = (struct mbuf *)mp_list;
        mp_list = mp_list->obj_next;

        VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
        cl = m->m_ext.ext_buf;
        rfa = MEXT_RFA(m);

        ASSERT(cl != NULL && rfa != NULL);
        VERIFY(MBUF_IS_COMPOSITE(m));

        flag = MEXT_FLAGS(m);

        MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
        if (bufsize == m_maxsize(MC_16KCL)) {
            MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
        } else if (bufsize == m_maxsize(MC_BIGCL)) {
            MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
        } else {
            MBUF_CL_INIT(m, cl, rfa, 1, flag);
        }

        if (num_with_pkthdrs > 0) {
            --num_with_pkthdrs;
#if CONFIG_MACF_NET
            if (mac_mbuf_label_init(m, wait) != 0) {
                m_free(m);
                break;
            }
#endif /* MAC_NET */
        }

        *np = m;
        if (num_with_pkthdrs > 0)
            np = &m->m_nextpkt;
        else
            np = &m->m_next;
    }
    ASSERT(pnum != *num_needed || mp_list == NULL);
    if (mp_list != NULL)
        mcache_free_ext(cp, mp_list);

    if (pnum > 0) {
        mtype_stat_add(MT_DATA, pnum);
        mtype_stat_sub(MT_FREE, pnum);
    }

    if (wantall && (pnum != *num_needed)) {
        if (top != NULL)
            m_freem_list(top);
        return (NULL);
    }

    *num_needed = pnum;
    return (top);
}
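
/*
 * Illustrative sketch (not part of the original source): a driver-style
 * batch allocation of 2KB-cluster packets using m_getpackets_internal().
 * The name example_fill_ring() and the constant RX_RING_SIZE are
 * hypothetical.
 */
#if 0 /* example only */
#define RX_RING_SIZE 64
static struct mbuf *
example_fill_ring(void)
{
    unsigned int needed = RX_RING_SIZE;
    struct mbuf *list;

    /* wantall=0: accept fewer than requested; packets chained on m_nextpkt */
    list = m_getpackets_internal(&needed, RX_RING_SIZE, M_DONTWAIT, 0,
        m_maxsize(MC_CL));
    /* on return, "needed" holds the number actually allocated */
    return (list);
}
#endif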
/*
 * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
 * in the chain is called a segment.  If maxsegments is not null and the
 * value pointed to is not null, this specifies the maximum number of segments
 * for a chain of mbufs.  If maxsegments is zero or the value pointed to
 * is zero the caller does not have any restriction on the number of segments.
 * The actual number of segments of a mbuf chain is returned in the value
 * pointed to by maxsegments.
 */
__private_extern__ struct mbuf *
m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
    unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
{
    struct mbuf **np, *top, *first = NULL;
    size_t bufsize, r_bufsize;
    unsigned int num = 0;
    unsigned int nsegs = 0;
    unsigned int needed, resid;
    int mcflags = MSLEEPF(wait);
    mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
    mcache_t *cp = NULL, *rcp = NULL;

    top = NULL;
    np = &top;

    if (*numlist == 0)
        return (NULL);

    if (wantsize == 0) {
        if (packetlen <= MINCLSIZE) {
            bufsize = packetlen;
        } else if (packetlen > m_maxsize(MC_CL)) {
            /* Use 4KB if jumbo cluster pool isn't available */
            if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
                bufsize = m_maxsize(MC_BIGCL);
            else
                bufsize = m_maxsize(MC_16KCL);
        } else {
            bufsize = m_maxsize(MC_CL);
        }
    } else if (wantsize == m_maxsize(MC_CL) ||
        wantsize == m_maxsize(MC_BIGCL) ||
        (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
        bufsize = wantsize;
    } else {
        return (NULL);
    }

    if (bufsize <= MHLEN) {
        nsegs = 1;
    } else if (bufsize <= MINCLSIZE) {
        if (maxsegments != NULL && *maxsegments == 1) {
            bufsize = m_maxsize(MC_CL);
            nsegs = 1;
        } else {
            nsegs = 2;
        }
    } else if (bufsize == m_maxsize(MC_16KCL)) {
        VERIFY(njcl > 0);
        nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
    } else if (bufsize == m_maxsize(MC_BIGCL)) {
        nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
    } else {
        nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
    }
    if (maxsegments != NULL) {
        if (*maxsegments && nsegs > *maxsegments) {
            *maxsegments = nsegs;
            return (NULL);
        }
        *maxsegments = nsegs;
    }

    /*
     * The caller doesn't want all the requested buffers; only some.
     * Try hard to get what we can, but don't block.  This effectively
     * overrides MCR_SLEEP, since this thread will not go to sleep
     * if we can't get all the buffers.
     */
    if (!wantall || (mcflags & MCR_NOSLEEP))
        mcflags |= MCR_TRYHARD;

    /*
     * Simple case where all elements in the lists/chains are mbufs.
     * Unless bufsize is greater than MHLEN, each segment chain is made
     * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
     * of 2 mbufs; the second one is used for the residual data, i.e.
     * the remaining data that cannot fit into the first mbuf.
     */
    if (bufsize <= MINCLSIZE) {
        /* Allocate the elements in one shot from the mbuf cache */
        ASSERT(bufsize <= MHLEN || nsegs == 2);
        cp = m_cache(MC_MBUF);
        needed = mcache_alloc_ext(cp, &mp_list,
            (*numlist) * nsegs, mcflags);

        /*
         * The number of elements must be even if we are to use an
         * mbuf (instead of a cluster) to store the residual data.
         * If we couldn't allocate the requested number of mbufs,
         * trim the number down (if it's odd) in order to avoid
         * creating a partial segment chain.
         */
        if (bufsize > MHLEN && (needed & 0x1))
            needed--;

        while (num < needed) {
            struct mbuf *m;

            m = (struct mbuf *)mp_list;
            mp_list = mp_list->obj_next;
            ASSERT(m != NULL);

            MBUF_INIT(m, 1, MT_DATA);
#if CONFIG_MACF_NET
            if (mac_init_mbuf(m, wait) != 0) {
                m_free(m);
                break;
            }
#endif /* MAC_NET */
            num++;
            if (bufsize > MHLEN) {
                /* A second mbuf for this segment chain */
                m->m_next = (struct mbuf *)mp_list;
                mp_list = mp_list->obj_next;
                ASSERT(m->m_next != NULL);

                MBUF_INIT(m->m_next, 0, MT_DATA);
                num++;
            }
            *np = m;
            np = &m->m_nextpkt;
        }
        ASSERT(num != *numlist || mp_list == NULL);

        if (num > 0) {
            mtype_stat_add(MT_DATA, num);
            mtype_stat_sub(MT_FREE, num);
        }
        num /= nsegs;

        /* We've got them all; return to caller */
        if (num == *numlist)
            return (top);

        goto fail;
    }

    /*
     * Complex cases where elements are made up of one or more composite
     * mbufs + cluster, depending on packetlen.  Each N-segment chain can
     * be illustrated as follows:
     *
     * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
     *
     * Every composite mbuf + cluster element comes from the intermediate
     * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
     * the last composite element will come from the MC_MBUF_CL cache,
     * unless the residual data is larger than 2KB where we use the
     * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
     * data is defined as extra data beyond the first element that cannot
     * fit into the previous element, i.e. there is no residual data if
     * the chain only has 1 segment.
     */
    r_bufsize = bufsize;
    resid = packetlen > bufsize ? packetlen % bufsize : 0;
    if (resid > 0) {
        /* There is residual data; figure out the cluster size */
        if (wantsize == 0 && packetlen > MINCLSIZE) {
            /*
             * Caller didn't request that all of the segments
             * in the chain use the same cluster size; use the
             * smaller of the cluster sizes.
             */
            if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
                r_bufsize = m_maxsize(MC_16KCL);
            else if (resid > m_maxsize(MC_CL))
                r_bufsize = m_maxsize(MC_BIGCL);
            else
                r_bufsize = m_maxsize(MC_CL);
        } else {
            /* Use the same cluster size as the other segments */
            resid = 0;
        }
    }

    needed = *numlist;
    if (resid > 0) {
        /*
         * Attempt to allocate composite mbuf + cluster elements for
         * the residual data in each chain; record the number of such
         * elements that can be allocated so that we know how many
         * segment chains we can afford to create.
         */
        if (r_bufsize <= m_maxsize(MC_CL))
            rcp = m_cache(MC_MBUF_CL);
        else if (r_bufsize <= m_maxsize(MC_BIGCL))
            rcp = m_cache(MC_MBUF_BIGCL);
        else
            rcp = m_cache(MC_MBUF_16KCL);
        needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);

        if (needed == 0)
            goto fail;

        /* This is temporarily reduced for calculation */
        nsegs--;
    }

    /*
     * Attempt to allocate the rest of the composite mbuf + cluster
     * elements for the number of segment chains that we need.
     */
    if (bufsize <= m_maxsize(MC_CL))
        cp = m_cache(MC_MBUF_CL);
    else if (bufsize <= m_maxsize(MC_BIGCL))
        cp = m_cache(MC_MBUF_BIGCL);
    else
        cp = m_cache(MC_MBUF_16KCL);
    needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);

    /* Round it down to avoid creating a partial segment chain */
    needed = (needed / nsegs) * nsegs;
    if (needed == 0)
        goto fail;

    if (resid > 0) {
        /*
         * We're about to construct the chain(s); take into account
         * the number of segments we have created above to hold the
         * residual data for each chain, as well as restore the
         * original count of segments per chain.
         */
        needed += needed / nsegs;
        nsegs++;
    }

    while (num < needed) {
        struct mbuf *m;
        u_int32_t flag;
        struct ext_ref *rfa;
        void *cl;
        int pkthdr;

        ++num;
        if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
            m = (struct mbuf *)mp_list;
            mp_list = mp_list->obj_next;
        } else {
            m = (struct mbuf *)rmp_list;
            rmp_list = rmp_list->obj_next;
        }
        ASSERT(m != NULL);
        VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
        VERIFY(m->m_ext.ext_free == NULL ||
            m->m_ext.ext_free == m_bigfree ||
            m->m_ext.ext_free == m_16kfree);

        cl = m->m_ext.ext_buf;
        rfa = MEXT_RFA(m);

        ASSERT(cl != NULL && rfa != NULL);
        VERIFY(MBUF_IS_COMPOSITE(m));

        flag = MEXT_FLAGS(m);

        pkthdr = (nsegs == 1 || (num % nsegs) == 1);
        if (pkthdr)
            first = m;
        MBUF_INIT(m, pkthdr, MT_DATA);
        if (m->m_ext.ext_free == m_16kfree) {
            MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
        } else if (m->m_ext.ext_free == m_bigfree) {
            MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
        } else {
            MBUF_CL_INIT(m, cl, rfa, 1, flag);
        }
#if CONFIG_MACF_NET
        if (pkthdr && mac_init_mbuf(m, wait) != 0) {
            m_free(m);
            break;
        }
#endif /* MAC_NET */

        *np = m;
        if ((num % nsegs) == 0)
            np = &first->m_nextpkt;
        else
            np = &m->m_next;
    }

    if (num > 0) {
        mtype_stat_add(MT_DATA, num);
        mtype_stat_sub(MT_FREE, num);
    }

    num /= nsegs;

    /* We've got them all; return to caller */
    if (num == *numlist) {
        ASSERT(mp_list == NULL && rmp_list == NULL);
        return (top);
    }

fail:
    /* Free up what's left of the above */
    if (mp_list != NULL)
        mcache_free_ext(cp, mp_list);
    if (rmp_list != NULL)
        mcache_free_ext(rcp, rmp_list);
    if (wantall && top != NULL) {
        m_freem_list(top);
        return (NULL);
    }
    *numlist = num;
    return (top);
}
/*
 * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on receive ring.
 */
__private_extern__ struct mbuf *
m_getpacket_how(int wait)
{
    unsigned int num_needed = 1;

    return (m_getpackets_internal(&num_needed, 1, wait, 1,
        m_maxsize(MC_CL)));
}

/*
 * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on receive ring.
 */
struct mbuf *
m_getpacket(void)
{
    unsigned int num_needed = 1;

    return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
        m_maxsize(MC_CL)));
}

/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if this can't be met, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
 * are chained on the m_nextpkt field.  Any packets requested beyond this are
 * chained onto the last packet header's m_next field.
 */
struct mbuf *
m_getpackets(int num_needed, int num_with_pkthdrs, int how)
{
    unsigned int n = num_needed;

    return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
        m_maxsize(MC_CL)));
}

/*
 * Return a list of mbuf hdrs set up as packet hdrs chained together
 * on the m_nextpkt field.
 */
struct mbuf *
m_getpackethdrs(int num_needed, int how)
{
    struct mbuf *m;
    struct mbuf **np, *top;

    top = NULL;
    np = &top;

    while (num_needed--) {
        m = _M_RETRYHDR(how, MT_DATA);
        if (m == NULL)
            break;

        *np = m;
        np = &m->m_nextpkt;
    }

    return (top);
}
/*
 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
 * for mbufs packets freed.  Used by the drivers.
 */
int
m_freem_list(struct mbuf *m)
{
    struct mbuf *nextpkt;
    mcache_obj_t *mp_list = NULL;
    mcache_obj_t *mcl_list = NULL;
    mcache_obj_t *mbc_list = NULL;
    mcache_obj_t *m16k_list = NULL;
    mcache_obj_t *m_mcl_list = NULL;
    mcache_obj_t *m_mbc_list = NULL;
    mcache_obj_t *m_m16k_list = NULL;
    mcache_obj_t *ref_list = NULL;
    int pktcount = 0;
    int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;

    while (m != NULL) {
        pktcount++;

        nextpkt = m->m_nextpkt;
        m->m_nextpkt = NULL;

        while (m != NULL) {
            struct mbuf *next = m->m_next;
            mcache_obj_t *o, *rfa;
            u_int32_t refcnt, flags;

            if (m->m_type == MT_FREE)
                panic("m_free: freeing an already freed mbuf");

            if (m->m_type != MT_FREE)
                mt_free++;

            if (m->m_flags & M_PKTHDR) {
                m_tag_delete_chain(m, NULL);
            }

            if (!(m->m_flags & M_EXT))
                goto simple_free;

            o = (mcache_obj_t *)m->m_ext.ext_buf;
            refcnt = m_decref(m);
            flags = MEXT_FLAGS(m);
            if (refcnt == 0 && flags == 0) {
                if (m->m_ext.ext_free == NULL) {
                    o->obj_next = mcl_list;
                    mcl_list = o;
                } else if (m->m_ext.ext_free == m_bigfree) {
                    o->obj_next = mbc_list;
                    mbc_list = o;
                } else if (m->m_ext.ext_free == m_16kfree) {
                    o->obj_next = m16k_list;
                    m16k_list = o;
                } else {
                    (*(m->m_ext.ext_free))((caddr_t)o,
                        m->m_ext.ext_size,
                        m->m_ext.ext_arg);
                }
                rfa = (mcache_obj_t *)MEXT_RFA(m);
                rfa->obj_next = ref_list;
                ref_list = rfa;
            } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
                VERIFY(m->m_type != MT_FREE);
                /*
                 * Amortize the costs of atomic operations
                 * by doing them at the end, if possible.
                 */
                if (m->m_type == MT_DATA)
                    mt_data++;
                else if (m->m_type == MT_HEADER)
                    mt_header++;
                else if (m->m_type == MT_SONAME)
                    mt_soname++;
                else if (m->m_type == MT_TAG)
                    mt_tag++;
                else
                    mtype_stat_dec(m->m_type);

                m->m_type = MT_FREE;
                m->m_next = m->m_nextpkt = NULL;

                /* "Free" into the intermediate cache */
                o = (mcache_obj_t *)m;
                if (m->m_ext.ext_free == NULL) {
                    o->obj_next = m_mcl_list;
                    m_mcl_list = o;
                } else if (m->m_ext.ext_free == m_bigfree) {
                    o->obj_next = m_mbc_list;
                    m_mbc_list = o;
                } else {
                    VERIFY(m->m_ext.ext_free == m_16kfree);
                    o->obj_next = m_m16k_list;
                    m_m16k_list = o;
                }
                m = next;
                continue;
            }

simple_free:
            /*
             * Amortize the costs of atomic operations
             * by doing them at the end, if possible.
             */
            if (m->m_type == MT_DATA)
                mt_data++;
            else if (m->m_type == MT_HEADER)
                mt_header++;
            else if (m->m_type == MT_SONAME)
                mt_soname++;
            else if (m->m_type == MT_TAG)
                mt_tag++;
            else if (m->m_type != MT_FREE)
                mtype_stat_dec(m->m_type);

            m->m_type = MT_FREE;
            m->m_flags = m->m_len = 0;
            m->m_next = m->m_nextpkt = NULL;

            ((mcache_obj_t *)m)->obj_next = mp_list;
            mp_list = (mcache_obj_t *)m;

            m = next;
        }

        m = nextpkt;
    }

    if (mt_free > 0)
        mtype_stat_add(MT_FREE, mt_free);
    if (mt_data > 0)
        mtype_stat_sub(MT_DATA, mt_data);
    if (mt_header > 0)
        mtype_stat_sub(MT_HEADER, mt_header);
    if (mt_soname > 0)
        mtype_stat_sub(MT_SONAME, mt_soname);
    if (mt_tag > 0)
        mtype_stat_sub(MT_TAG, mt_tag);

    if (mp_list != NULL)
        mcache_free_ext(m_cache(MC_MBUF), mp_list);
    if (mcl_list != NULL)
        mcache_free_ext(m_cache(MC_CL), mcl_list);
    if (mbc_list != NULL)
        mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
    if (m16k_list != NULL)
        mcache_free_ext(m_cache(MC_16KCL), m16k_list);
    if (m_mcl_list != NULL)
        mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
    if (m_mbc_list != NULL)
        mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
    if (m_m16k_list != NULL)
        mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
    if (ref_list != NULL)
        mcache_free_ext(ref_cache, ref_list);

    return (pktcount);
}

void
m_freem(struct mbuf *m)
{
    while (m != NULL)
        m = m_free(m);
}
/*
 * Mbuffer utility routines.
 */

/*
 * Compute the amount of space available before the current start
 * of data in an mbuf.
 */
int
m_leadingspace(struct mbuf *m)
{
    if (m->m_flags & M_EXT) {
        if (MCLHASREFERENCE(m))
            return (0);
        return (m->m_data - m->m_ext.ext_buf);
    }
    if (m->m_flags & M_PKTHDR)
        return (m->m_data - m->m_pktdat);
    return (m->m_data - m->m_dat);
}

/*
 * Compute the amount of space available after the end of data in an mbuf.
 */
int
m_trailingspace(struct mbuf *m)
{
    if (m->m_flags & M_EXT) {
        if (MCLHASREFERENCE(m))
            return (0);
        return (m->m_ext.ext_buf + m->m_ext.ext_size -
            (m->m_data + m->m_len));
    }
    return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
}
/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
 * copy junk along.  Does not adjust packet header length.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
    struct mbuf *mn;

    _MGET(mn, how, m->m_type);
    if (mn == NULL) {
        m_freem(m);
        return (NULL);
    }
    if (m->m_flags & M_PKTHDR) {
        M_COPY_PKTHDR(mn, m);
        m->m_flags &= ~M_PKTHDR;
    }
    mn->m_next = m;
    m = mn;
    if (len < MHLEN)
        MH_ALIGN(m, len);
    m->m_len = len;
    return (m);
}

/*
 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
 * chain, copy junk along, and adjust length.
 */
struct mbuf *
m_prepend_2(struct mbuf *m, int len, int how)
{
    if (M_LEADINGSPACE(m) >= len) {
        m->m_data -= len;
        m->m_len += len;
    } else {
        m = m_prepend(m, len, how);
    }
    if ((m) && (m->m_flags & M_PKTHDR))
        m->m_pkthdr.len += len;
    return (m);
}
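
/*
 * Illustrative sketch (not part of the original source): prepending room
 * for a 14-byte link-layer header with m_prepend_2(), which falls back to
 * allocating a new mbuf when there is no leading space.  The function name
 * is hypothetical.
 */
#if 0 /* example only */
static struct mbuf *
example_add_ether_header(struct mbuf *m, int how)
{
    m = m_prepend_2(m, 14 /* Ethernet header length */, how);
    if (m == NULL)
        return (NULL);
    /* MTOD(m, ...) now points at 14 writable bytes for the header */
    bzero(MTOD(m, caddr_t), 14);
    return (m);
}
#endif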
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
 */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
    struct mbuf *n, *mhdr = NULL, **np;
    int off = off0;
    struct mbuf *top;
    int copyhdr = 0;

    if (off < 0 || len < 0)
        panic("m_copym: invalid offset %d or len %d", off, len);

    if (off == 0 && (m->m_flags & M_PKTHDR)) {
        mhdr = m;
        copyhdr = 1;
    }

    while (off >= m->m_len) {
        if (m->m_next == NULL)
            panic("m_copym: invalid mbuf chain");
        off -= m->m_len;
        m = m->m_next;
    }
    np = &top;
    top = NULL;

    while (len > 0) {
        if (m == NULL) {
            if (len != M_COPYALL)
                panic("m_copym: len != M_COPYALL");
            break;
        }

        n = _M_RETRY(wait, m->m_type);
        *np = n;

        if (n == NULL)
            goto nospace;

        if (copyhdr) {
            M_COPY_PKTHDR(n, mhdr);
            if (len == M_COPYALL)
                n->m_pkthdr.len -= off0;
            else
                n->m_pkthdr.len = len;
            copyhdr = 0;
        }
        if (len == M_COPYALL) {
            if (MIN(len, (m->m_len - off)) == len) {
                printf("m->m_len %ld - off %d = %ld, %ld\n",
                    m->m_len, off, m->m_len - off,
                    MIN(len, (m->m_len - off)));
            }
        }
        n->m_len = MIN(len, (m->m_len - off));
        if (n->m_len == M_COPYALL) {
            printf("n->m_len == M_COPYALL, fixing\n");
            n->m_len = MHLEN;
        }
        if (m->m_flags & M_EXT) {
            n->m_ext = m->m_ext;
            m_incref(m);
            n->m_data = m->m_data + off;
            n->m_flags |= M_EXT;
        } else {
            bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
                (unsigned)n->m_len);
        }
        if (len != M_COPYALL)
            len -= n->m_len;
        off = 0;
        m = m->m_next;
        np = &n->m_next;
    }

    return (top);
nospace:
    m_freem(top);
    return (NULL);
}
/*
 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
 * within this routine also, the last mbuf and offset accessed are passed
 * out and can be passed back in to avoid having to rescan the entire mbuf
 * list (normally hung off of the socket)
 */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
    struct mbuf **m_last, int *m_off)
{
    struct mbuf *n, **np = NULL;
    int off = off0, len = len0;
    struct mbuf *top = NULL;
    int mcflags = MSLEEPF(wait);
    int copyhdr = 0;
    int type = 0;
    mcache_obj_t *list = NULL;
    int needed = 0;

    if (off == 0 && (m->m_flags & M_PKTHDR))
        copyhdr = 1;

    if (*m_last != NULL) {
        m = *m_last;
        off = *m_off;
    } else {
        while (off >= m->m_len) {
            off -= m->m_len;
            m = m->m_next;
        }
    }

    n = m;
    while (len > 0) {
        needed++;
        ASSERT(n != NULL);
        len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
        n = n->m_next;
    }
    needed++;
    len = len0;

    /*
     * If the caller doesn't want to be put to sleep, mark it with
     * MCR_TRYHARD so that we may reclaim buffers from other places
     * before giving up.
     */
    if (mcflags & MCR_NOSLEEP)
        mcflags |= MCR_TRYHARD;

    if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
        mcflags) != needed)
        goto nospace;

    needed = 0;
    while (len > 0) {
        needed++;
        n = (struct mbuf *)list;
        list = list->obj_next;
        ASSERT(n != NULL && m != NULL);

        type = (top == NULL) ? MT_HEADER : m->m_type;
        MBUF_INIT(n, (top == NULL), type);
#if CONFIG_MACF_NET
        if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
            mtype_stat_inc(MT_HEADER);
            mtype_stat_dec(MT_FREE);
            m_free(n);
            goto nospace;
        }
#endif /* MAC_NET */

        if (top == NULL) {
            top = n;
            np = &top->m_next;
        } else {
            *np = n;
            np = &n->m_next;
        }

        if (copyhdr) {
            M_COPY_PKTHDR(n, m);
            n->m_pkthdr.len = len;
            copyhdr = 0;
        }
        n->m_len = MIN(len, (m->m_len - off));

        if (m->m_flags & M_EXT) {
            n->m_ext = m->m_ext;
            m_incref(m);
            n->m_data = m->m_data + off;
            n->m_flags |= M_EXT;
        } else {
            bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
                (unsigned)n->m_len);
        }
        len -= n->m_len;

        if (len == 0) {
            if ((off + n->m_len) == m->m_len) {
                *m_last = m->m_next;
                *m_off = 0;
            } else {
                *m_last = m;
                *m_off = off + n->m_len;
            }
            break;
        }
        off = 0;
        m = m->m_next;
    }

    mtype_stat_inc(MT_HEADER);
    mtype_stat_add(type, needed);
    mtype_stat_sub(MT_FREE, needed + 1);

    ASSERT(list == NULL);
    return (top);

nospace:
    if (list != NULL)
        mcache_free_ext(m_cache(MC_MBUF), list);
    if (top != NULL)
        m_freem(top);
    return (NULL);
}
/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(struct mbuf *m, int off, int len, caddr_t cp)
{
    unsigned count;

    if (off < 0 || len < 0)
        panic("m_copydata: invalid offset %d or len %d", off, len);

    while (off > 0) {
        if (m == NULL)
            panic("m_copydata: invalid mbuf chain");
        if (off < m->m_len)
            break;
        off -= m->m_len;
        m = m->m_next;
    }
    while (len > 0) {
        if (m == NULL)
            panic("m_copydata: invalid mbuf chain");
        count = MIN(m->m_len - off, len);
        bcopy(MTOD(m, caddr_t) + off, cp, count);
        len -= count;
        cp += count;
        off = 0;
        m = m->m_next;
    }
}
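
/*
 * Illustrative sketch (not part of the original source): pulling a fixed
 * size header out of a (possibly fragmented) chain into a stack buffer
 * with m_copydata().  The 20-byte length stands in for an IPv4 header and
 * the function name is hypothetical.
 */
#if 0 /* example only */
static void
example_peek_header(struct mbuf *m, int off)
{
    char hdr[20];

    /* Panics if the chain is shorter than off + sizeof (hdr) bytes */
    m_copydata(m, off, sizeof (hdr), (caddr_t)hdr);
    /* ... inspect hdr ... */
}
#endif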
/*
 * Concatenate mbuf chain n to m.  Both chains must be of the same type
 * (e.g. MT_DATA).  Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
    while (m->m_next)
        m = m->m_next;
    while (n) {
        if ((m->m_flags & M_EXT) ||
            m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
            /* just join the two chains */
            m->m_next = n;
            return;
        }
        /* splat the data from one into the other */
        bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
            (u_int)n->m_len);
        m->m_len += n->m_len;
        n = m_free(n);
    }
}
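
/*
 * Illustrative sketch (not part of the original source): appending one
 * chain to another with m_cat() and fixing up the packet header length by
 * hand, since m_cat() does not touch m_pkthdr.  Names are hypothetical.
 */
#if 0 /* example only */
static void
example_append(struct mbuf *pkt, struct mbuf *extra, int extralen)
{
    m_cat(pkt, extra);
    if (pkt->m_flags & M_PKTHDR)
        pkt->m_pkthdr.len += extralen;
}
#endif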
void
m_adj(struct mbuf *mp, int req_len)
{
    int len = req_len;
    struct mbuf *m;
    int count;

    if ((m = mp) == NULL)
        return;
    if (len >= 0) {
        /*
         * Trim from head.
         */
        while (m != NULL && len > 0) {
            if (m->m_len <= len) {
                len -= m->m_len;
                m->m_len = 0;
                m = m->m_next;
            } else {
                m->m_len -= len;
                m->m_data += len;
                len = 0;
            }
        }
        m = mp;
        if (m->m_flags & M_PKTHDR)
            m->m_pkthdr.len -= (req_len - len);
    } else {
        /*
         * Trim from tail.  Scan the mbuf chain,
         * calculating its length and finding the last mbuf.
         * If the adjustment only affects this mbuf, then just
         * adjust and return.  Otherwise, rescan and truncate
         * after the remaining size.
         */
        len = -len;
        count = 0;
        for (;;) {
            count += m->m_len;
            if (m->m_next == (struct mbuf *)0)
                break;
            m = m->m_next;
        }
        if (m->m_len >= len) {
            m->m_len -= len;
            m = mp;
            if (m->m_flags & M_PKTHDR)
                m->m_pkthdr.len -= len;
            return;
        }
        count -= len;
        if (count < 0)
            count = 0;
        /*
         * Correct length for chain is "count".
         * Find the mbuf with last data, adjust its length,
         * and toss data from remaining mbufs on chain.
         */
        m = mp;
        if (m->m_flags & M_PKTHDR)
            m->m_pkthdr.len = count;
        for (; m; m = m->m_next) {
            if (m->m_len >= count) {
                m->m_len = count;
                break;
            }
            count -= m->m_len;
        }
        while ((m = m->m_next))
            m->m_len = 0;
    }
}
/*
 * Rearange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns null on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
    struct mbuf *m;
    int count;
    int space;

    /*
     * If first mbuf has no cluster, and has room for len bytes
     * without shifting current data, pullup into it,
     * otherwise allocate a new mbuf to prepend to the chain.
     */
    if ((n->m_flags & M_EXT) == 0 &&
        n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
        if (n->m_len >= len)
            return (n);
        m = n;
        n = n->m_next;
        len -= m->m_len;
    } else {
        if (len > MHLEN)
            goto bad;
        _MGET(m, M_DONTWAIT, n->m_type);
        if (m == 0)
            goto bad;
        m->m_len = 0;
        if (n->m_flags & M_PKTHDR) {
            M_COPY_PKTHDR(m, n);
            n->m_flags &= ~M_PKTHDR;
        }
    }
    space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
    do {
        count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
        bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
            (unsigned)count);
        len -= count;
        m->m_len += count;
        n->m_len -= count;
        space -= count;
        if (n->m_len)
            n->m_data += count;
        else
            n = m_free(n);
    } while (len > 0 && n);
    if (len > 0) {
        (void) m_free(m);
        goto bad;
    }
    m->m_next = n;
    return (m);
bad:
    m_freem(n);
    return (0);
}
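
/*
 * Illustrative sketch (not part of the original source): the canonical
 * m_pullup() pattern -- make sure the first hdrlen bytes are contiguous
 * before treating the data pointer as a header structure.  The function
 * name is hypothetical.
 */
#if 0 /* example only */
static struct mbuf *
example_ensure_contig(struct mbuf *m, int hdrlen)
{
    if (m->m_len < hdrlen && (m = m_pullup(m, hdrlen)) == NULL) {
        /* m_pullup() already freed the chain on failure */
        return (NULL);
    }
    /* MTOD(m, ...) may now be cast to a hdrlen-byte header */
    return (m);
}
#endif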
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
    struct mbuf *m, *n;
    unsigned len = len0, remain;

    for (m = m0; m && len > m->m_len; m = m->m_next)
        len -= m->m_len;
    if (m == NULL)
        return (NULL);
    remain = m->m_len - len;
    if (m0->m_flags & M_PKTHDR) {
        _MGETHDR(n, wait, m0->m_type);
        if (n == NULL)
            return (NULL);
        n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
        n->m_pkthdr.len = m0->m_pkthdr.len - len0;
        m0->m_pkthdr.len = len0;
        if (m->m_flags & M_EXT)
            goto extpacket;
        if (remain > MHLEN) {
            /* m can't be the lead packet */
            MH_ALIGN(n, 0);
            n->m_next = m_split(m, len, wait);
            if (n->m_next == NULL) {
                (void) m_free(n);
                return (NULL);
            } else
                return (n);
        } else
            MH_ALIGN(n, remain);
    } else if (remain == 0) {
        n = m->m_next;
        m->m_next = NULL;
        return (n);
    } else {
        _MGET(n, wait, m->m_type);
        if (n == NULL)
            return (NULL);
        M_ALIGN(n, remain);
    }
extpacket:
    if (m->m_flags & M_EXT) {
        n->m_flags |= M_EXT;
        n->m_ext = m->m_ext;
        m_incref(m);
        n->m_data = m->m_data + len;
    } else {
        bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
    }
    n->m_len = remain;
    m->m_len = len;
    n->m_next = m->m_next;
    m->m_next = NULL;
    return (n);
}
/*
 * Routine to copy from device local memory into mbufs.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
{
    struct mbuf *m;
    struct mbuf *top = NULL, **mp = &top;
    int off = off0, len;
    char *cp;
    char *epkt;

    cp = buf;
    epkt = cp + totlen;
    if (off) {
        /*
         * If 'off' is non-zero, packet is trailer-encapsulated,
         * so we have to skip the type and length fields.
         */
        cp += off + 2 * sizeof (u_int16_t);
        totlen -= 2 * sizeof (u_int16_t);
    }
    _MGETHDR(m, M_DONTWAIT, MT_DATA);
    if (m == NULL)
        return (NULL);
    m->m_pkthdr.rcvif = ifp;
    m->m_pkthdr.len = totlen;
    m->m_len = MHLEN;

    while (totlen > 0) {
        if (top != NULL) {
            _MGET(m, M_DONTWAIT, MT_DATA);
            if (m == NULL) {
                m_freem(top);
                return (NULL);
            }
            m->m_len = MLEN;
        }
        len = MIN(totlen, epkt - cp);
        if (len >= MINCLSIZE) {
            MCLGET(m, M_DONTWAIT);
            if (m->m_flags & M_EXT) {
                m->m_len = len = MIN(len, m_maxsize(MC_CL));
            } else {
                /* give up when it's out of cluster mbufs */
                if (top != NULL)
                    m_freem(top);
                m_freem(m);
                return (NULL);
            }
        } else {
            /*
             * Place initial small packet/header at end of mbuf.
             */
            if (len < m->m_len) {
                if (top == NULL &&
                    len + max_linkhdr <= m->m_len)
                    m->m_data += max_linkhdr;
                m->m_len = len;
            } else {
                len = m->m_len;
            }
        }
        if (copy)
            copy(cp, MTOD(m, caddr_t), (unsigned)len);
        else
            bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
        cp += len;
        *mp = m;
        mp = &m->m_next;
        totlen -= len;
        if (cp == epkt)
            cp = buf;
    }
    return (top);
}
/*
 * Cluster freelist allocation check.
 */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_clfree, m_bigclfree, m_16kclfree;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL);
	m_16kclusters = m_total(MC_16KCL);
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL);
	m_16kclfree = m_infree(MC_16KCL);

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize != m_maxsize(MC_16KCL) &&
	    (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << 3) >= njcl)) {
		if (bufsize == MCLBYTES && num > m_clfree) {
			printf("m_howmany - out of small clusters, "
			    "%d short\n", num - mbstat.m_clfree);
		}
		return (0);
	}

	if (bufsize == m_maxsize(MC_CL)) {
		if (m_clusters < MINCL)
			return (MINCL - m_clusters);
		/* Too few (free < 1/16 total) and not over maximum */
		if (m_clusters < m_maxlimit(MC_CL)) {
			if (m_clfree >= MCL_LOWAT)
				return (0);
			if (num >= m_clfree)
				i = num - m_clfree;
			if (((m_clusters + num) >> 4) > m_clfree)
				j = ((m_clusters + num) >> 4) - m_clfree;
			i = MAX(i, j);
			if (i + m_clusters >= m_maxlimit(MC_CL))
				i = m_maxlimit(MC_CL) - m_clusters;
		}
		VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
	} else if (bufsize == m_maxsize(MC_BIGCL)) {
		if (m_bigclusters < MINBIGCL)
			return (MINBIGCL - m_bigclusters);
		/* Too few (free < 1/16 total) and not over maximum */
		if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
			if (m_bigclfree >= MBIGCL_LOWAT)
				return (0);
			if (num >= m_bigclfree)
				i = num - m_bigclfree;
			if (((m_bigclusters + num) >> 4) > m_bigclfree)
				j = ((m_bigclusters + num) >> 4) - m_bigclfree;
			i = MAX(i, j);
			if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
				i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		}
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
	} else {
		if (m_16kclusters < MIN16KCL)
			return (MIN16KCL - m_16kclusters);
		/* Too few (free < 1/16 total) and not over maximum */
		if (m_16kclusters < m_maxlimit(MC_16KCL)) {
			if (m_16kclfree >= M16KCL_LOWAT)
				return (0);
			if (num >= m_16kclfree)
				i = num - m_16kclfree;
			if (((m_16kclusters + num) >> 4) > m_16kclfree)
				j = ((m_16kclusters + num) >> 4) - m_16kclfree;
			i = MAX(i, j);
			if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
				i = m_maxlimit(MC_16KCL) - m_16kclusters;
		}
		VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
	}

	return (i);
}
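
/*
 * A worked sketch, not part of the original source: the 1/16 low-water
 * arithmetic used per class by m_howmany() above, written as a standalone
 * helper.  With 4096 clusters in total, 'num' = 32 requested and only 100
 * free, ((4096 + 32) >> 4) = 258 > 100, so roughly 158 more are populated.
 */
#if 0	/* illustrative example only */
static unsigned int
example_cluster_growth(u_int32_t total, u_int32_t freecnt, unsigned int num)
{
	unsigned int i = 0, j = 0;

	if (num >= freecnt)
		i = num - freecnt;		/* at least cover the request */
	if (((total + num) >> 4) > freecnt)
		j = ((total + num) >> 4) - freecnt;	/* keep >= 1/16 free */
	return (i > j ? i : j);
}
#endif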
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_getclr(M_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			n->m_len = MIN(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = MIN(m->m_len - off, len);
		bcopy(cp, off + MTOD(m, caddr_t), (unsigned)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = _M_GET(M_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = MIN(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:
	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}
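
/*
 * A minimal usage sketch, not part of the original source: rewriting a
 * field at a known offset inside an existing chain with m_copyback() above.
 * The helper name and the checksum-offset parameter are hypothetical.
 */
#if 0	/* illustrative example only */
static void
example_patch_checksum(struct mbuf *m, int csum_off, u_int16_t csum)
{
	/* The chain is extended with zero-filled mbufs if it is too short */
	m_copyback(m, csum_off, sizeof (csum), (caddr_t)&csum);
}
#endif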
char *
mcl_to_paddr(char *addr)
{
	int base_phys;

	if (!MBUF_IN_MAP(addr))
		return (NULL);
	base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];

	if (base_phys == 0)
		return (NULL);
	return ((char *)((int)base_phys | ((int)addr & PGOFSET)));
}
/*
 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
 * And really copy the thing.  That way, we don't "precompute" checksums
 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
 * don't take up too much room in the sockbuf (cf. sbspace()).
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf *n, **np;
	struct mbuf *top;
	int copyhdr = 0;

	np = &top;
	top = NULL;
	if (m->m_flags & M_PKTHDR)
		copyhdr = 1;

	/*
	 * Quick check: if we have one mbuf and its data fits in an
	 * mbuf with packet header, just copy and go.
	 */
	if (m->m_next == NULL) {
		/* Then just move the data into an mbuf and be done... */
		if (copyhdr) {
			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
				if ((n = _M_GETHDR(how, m->m_type)) == NULL)
					return (NULL);
				n->m_len = m->m_len;
				m_dup_pkthdr(n, m, how);
				bcopy(m->m_data, n->m_data, m->m_len);
				return (n);
			}
		} else if (m->m_len <= MLEN) {
			if ((n = _M_GET(how, m->m_type)) == NULL)
				return (NULL);
			bcopy(m->m_data, n->m_data, m->m_len);
			n->m_len = m->m_len;
			return (n);
		}
	}

	while (m != NULL) {
#if BLUE_DEBUG
		kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
		    m->m_data);
#endif
		if (copyhdr)
			n = _M_GETHDR(how, m->m_type);
		else
			n = _M_GET(how, m->m_type);
		if (n == NULL)
			goto nospace;
		if (m->m_flags & M_EXT) {
			if (m->m_len <= m_maxsize(MC_CL))
				n = m_mclget(n, how);
			else if (m->m_len <= m_maxsize(MC_BIGCL))
				n = m_mbigget(n, how);
			else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
				n = m_m16kget(n, how);
			if (!(n->m_flags & M_EXT)) {
				(void) m_free(n);
				goto nospace;
			}
		}
		*np = n;
		if (copyhdr) {
			/* Don't use M_COPY_PKTHDR: preserve m_data */
			m_dup_pkthdr(n, m, how);
			copyhdr = 0;
			if (!(n->m_flags & M_EXT))
				n->m_data = n->m_pktdat;
		}
		n->m_len = m->m_len;
		/*
		 * Get the dup on the same bdry as the original
		 * Assume that the two mbufs have the same offset to data area
		 * (up to word boundaries)
		 */
		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
		m = m->m_next;
		np = &n->m_next;
#if BLUE_DEBUG
		kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
		    n->m_data);
#endif
	}

	return (top);

nospace:
	m_freem(top);
	return (NULL);
}
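
/*
 * A minimal usage sketch, not part of the original source: taking a private,
 * deep copy of a packet with m_dup() above before rewriting it, so that a
 * chain whose storage may be shared through external clusters stays intact.
 * The helper name is hypothetical.
 */
#if 0	/* illustrative example only */
static struct mbuf *
example_private_copy(struct mbuf *m)
{
	struct mbuf *copy;

	copy = m_dup(m, M_DONTWAIT);
	if (copy == NULL)
		return (NULL);	/* caller keeps using the original */

	/* 'copy' shares no storage with 'm'; safe to rewrite its headers */
	return (copy);
}
#endif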
#define	MBUF_MULTIPAGES(m)						\
	(((m)->m_flags & M_EXT) &&					\
	((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||	\
	(!IS_P2ALIGNED((m)->m_data, NBPG) &&				\
	P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))

static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		data = data0;
		if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
			len = NBPG;
		else if (!IS_P2ALIGNED(data, NBPG) &&
		    P2ROUNDUP(data, NBPG) < (data + len0))
			len = P2ROUNDUP(data, NBPG) - data;
		else
			len = len0;

		VERIFY(m->m_flags & M_EXT);
		m->m_data = (void *)data;
		m->m_len = len;

		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0)
			break;

		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return (top);
}

struct mbuf *
m_normalize(struct mbuf *m)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	boolean_t expanded = FALSE;

	while (m != NULL) {
		struct mbuf *n;

		n = m->m_next;
		m->m_next = NULL;

		/* Does the data cross one or more page boundaries? */
		if (MBUF_MULTIPAGES(m)) {
			struct mbuf *last;
			if ((m = m_expand(m, &last)) == NULL) {
				m_freem(n);
				m_freem(top);
				top = NULL;
				break;
			}
			*nm = m;
			nm = &last->m_next;
			expanded = TRUE;
		} else {
			*nm = m;
			nm = &m->m_next;
		}
		m = n;
	}
	if (expanded)
		atomic_add_32(&mb_normalized, 1);
	return (top);
}
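
/*
 * A minimal usage sketch, not part of the original source: a caller that
 * needs every mbuf's data to stay within a single physical page (say, a
 * hypothetical DMA engine that cannot cross page boundaries) could run the
 * chain through m_normalize() above before mapping it.
 */
#if 0	/* illustrative example only */
static struct mbuf *
example_prepare_for_dma(struct mbuf *m)
{
	/* Splits any mbuf whose data spans a page boundary */
	return (m_normalize(m));
}
#endif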
void
m_mchtype(struct mbuf *m, int t)
{
	mtype_stat_inc(t);
	mtype_stat_dec(m->m_type);
	m->m_type = t;
}

void *
m_mtod(struct mbuf *m)
{
	return (MTOD(m, void *));
}

struct mbuf *
m_dtom(void *x)
{
	return ((struct mbuf *)((u_long)(x) & ~(MSIZE-1)));
}

void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}

/*
 * Inform the corresponding mcache(s) that there's a waiter below.
 */
static void
mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
{
	mcache_waiter_inc(m_cache(class));
	if (comp) {
		if (class == MC_CL) {
			mcache_waiter_inc(m_cache(MC_MBUF_CL));
		} else if (class == MC_BIGCL) {
			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
		} else if (class == MC_16KCL) {
			mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
		} else {
			mcache_waiter_inc(m_cache(MC_MBUF_CL));
			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
		}
	}
}

/*
 * Inform the corresponding mcache(s) that there's no more waiter below.
 */
static void
mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
{
	mcache_waiter_dec(m_cache(class));
	if (comp) {
		if (class == MC_CL) {
			mcache_waiter_dec(m_cache(MC_MBUF_CL));
		} else if (class == MC_BIGCL) {
			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
		} else if (class == MC_16KCL) {
			mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
		} else {
			mcache_waiter_dec(m_cache(MC_MBUF_CL));
			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
		}
	}
}
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU caches layer and that allocation should be
 * retried at that level.
 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing?  Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	VERIFY(!(wait & MCR_NOSLEEP));
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	return (mcache_retry);
}
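
/*
 * A sketch of the retry pattern a blocking allocator could build around
 * mbuf_sleep() above; it is not part of the original source, and the two
 * helpers (example_grab_from_slab, example_grab_from_cache) are hypothetical
 * stand-ins for the slab-layer and cache-layer allocation paths.
 */
#if 0	/* illustrative example only */
static void *example_grab_from_slab(mbuf_class_t class);	/* hypothetical */
static void *example_grab_from_cache(mbuf_class_t class);	/* hypothetical */

static void *
example_blocking_alloc(mbuf_class_t class, int wait)
{
	void *obj;

	lck_mtx_lock(mbuf_mlock);
	while ((obj = example_grab_from_slab(class)) == NULL) {
		if (mbuf_sleep(class, 1, wait)) {
			/* TRUE: objects showed up at the per-CPU caches */
			lck_mtx_unlock(mbuf_mlock);
			return (example_grab_from_cache(class));
		}
		/* FALSE: the freelist may have been replenished; loop */
	}
	lck_mtx_unlock(mbuf_mlock);
	return (obj);
}
#endif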
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	while (1) {
		lck_mtx_lock(mbuf_mlock);

		mbuf_expand = 0;
		if (mbuf_expand_mcl) {
			int n;

			/* Adjust to current number of cluster in use */
			n = mbuf_expand_mcl -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			mbuf_expand_mcl = 0;

			if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
				mbuf_expand++;
		}
		if (mbuf_expand_big) {
			int n;

			/* Adjust to current number of 4 KB cluster in use */
			n = mbuf_expand_big -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			mbuf_expand_big = 0;

			if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
				mbuf_expand++;
		}
		if (mbuf_expand_16k) {
			int n;

			/* Adjust to current number of 16 KB cluster in use */
			n = mbuf_expand_16k -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			mbuf_expand_16k = 0;

			if (n > 0)
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than there are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		if (mbuf_expand) {
			while (m_total(MC_MBUF) <
			    (m_total(MC_BIGCL) + m_total(MC_CL))) {
				if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
					break;
			}
		}

		lck_mtx_unlock(mbuf_mlock);

		assert_wait(&mbuf_worker_run, THREAD_UNINT);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}

static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the memory
		 * pool (hence the cluster map); if we attempt to reallocate
		 * a cluster group when it's already allocated, panic since
		 * this is a sign of a memory corruption (slabstbl[ix] got
		 * nullified).  This also means that there shouldn't be any
		 * hole in the kernel sub-map for the mbuf pool.
		 */
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(slg != NULL);

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++)
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	ix = MTOCL(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return (&slg->slg_slab[ix]);
}
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
	sp->sl_class = class;
	sp->sl_flags = flags;
	sp->sl_base = base;
	sp->sl_head = head;
	sp->sl_len = len;
	sp->sl_refcnt = refcnt;
	sp->sl_chunks = chunks;
	slab_detach(sp);
}

static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;
	if (class == MC_BIGCL) {
		sp = sp->sl_next;
		/* Next slab must already be present */
		VERIFY(sp != NULL);
		VERIFY(slab_is_detached(sp));
		sp->sl_flags &= ~SLF_DETACHED;
	} else if (class == MC_16KCL) {
		int k;
		for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}

static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(!slab_is_detached(sp));
	VERIFY(m_slab_cnt(class) > 0);
	m_slab_cnt(class)--;
	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	slab_detach(sp);
	if (class == MC_BIGCL) {
		sp = sp->sl_next;
		/* Next slab must already be present */
		VERIFY(sp != NULL);
		VERIFY(!slab_is_detached(sp));
		slab_detach(sp);
	} else if (class == MC_16KCL) {
		int k;
		for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(!slab_is_detached(sp));
			slab_detach(sp);
		}
	}
}
static boolean_t
slab_inrange(mcl_slab_t *sp, void *buf)
{
	return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
	    (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
}

static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		if (next != addr)
			continue;
		if (mclaudit == NULL) {
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}

static void
slab_detach(mcl_slab_t *sp)
{
	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	sp->sl_flags |= SLF_DETACHED;
}

static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
	return ((intptr_t)sp->sl_link.tqe_next == -1 &&
	    (intptr_t)sp->sl_link.tqe_prev == -1 &&
	    (sp->sl_flags & SLF_DETACHED));
}
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPCL);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOCL(buf);
	/* Make sure we haven't been here before */
	for (i = 0; i < NMBPCL; i++)
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);

	mca = mca_tail = *mca_list;
	if (save_contents)
		con = *con_list;

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		next = mca->mca_next;
		bzero(mca, sizeof (*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			VERIFY(con != NULL);
			mca->mca_contents_size = con_size;
			mca->mca_contents = con;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents)
		*con_list = con;

	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
/*
 * Given an address of a buffer (mbuf/cluster/big cluster), return
 * the corresponding audit structure for that buffer.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOCL(o);

	VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the cluster
		 * used by the mbuf and use that index to locate the
		 * base address of the cluster.  Then find out the
		 * mbuf index relative to the cluster base and use
		 * it to locate the audit structure.
		 */
		VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
		mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
		break;

	case MC_CL:
	case MC_BIGCL:
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return (mca);
}
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	mcl_audit_verify_nextptr(next, mca);

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));

	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}

static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = (struct mbuf *)mca->mca_contents;

	if (composite) {
		struct mbuf *next = m->m_next;
		VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache.  Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, mca->mca_contents_size);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be the way we allocated it.
		 */
		m->m_type = ms->m_type;
	}
}

static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	bcopy(m, mca->mca_contents, mca->mca_contents_size);
}
static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}

static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	mcache_audit_t *mca;

	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
	/* NOTREACHED */
}

static void
mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
{
	if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
	    !MBUF_IN_MAP(next)) {
		panic("mcl_audit: buffer %p modified after free at offset 0: "
		    "%p out of range [%p-%p)\n%s\n",
		    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
		/* NOTREACHED */
	}
}
SYSCTL_DECL(_kern_ipc);
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mb_normalized, 0, "");
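
/*
 * A minimal consumer sketch, not part of the original source: the sysctls
 * declared above can be read from user space, e.g. the classic mbstat
 * snapshot via sysctlbyname(3).  This is user-space code, not kernel code.
 */
#if 0	/* illustrative example only */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <stdio.h>

int
main(void)
{
	struct mbstat mbs;
	size_t len = sizeof (mbs);

	/* "kern.ipc.mbstat" corresponds to the KIPC_MBSTAT node above */
	if (sysctlbyname("kern.ipc.mbstat", &mbs, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("mbuf clusters in use: %u\n", (unsigned int)mbs.m_clusters);
	return (0);
}
#endif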