/*
 * Copyright (c) 2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>

#include <libkern/OSAtomic.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <sys/mcache.h>
/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents a mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of NBPG in size; each
 *	object represents a mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *		|				^
 *		|				|
 *		|		+-----------------------+
 *		v				|
 *	  mcache_alloc/mcache_alloc_ext()	mbuf_slab_audit()
 *		|				^
 *		v				|
 *	    [CPU cache] -------> (found?) -------+
 *		|				|
 *		v				|
 *	  mbuf_slab_alloc()			|
 *		|				|
 *		v				|
 *	+---------> [freelist] -------> (found?) -------+
 *	|		|
 *	|		v
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *		|				^
 *		|				|
 *		|	+------ (done) ---------+
 *		v	|			|
 *	  mcache_alloc/mcache_alloc_ext()	mbuf_cslab_audit()
 *		|				^
 *		v				|
 *	    [CPU cache] -------> (found?) -------+
 *		|				|
 *		v				|
 *	  mbuf_cslab_alloc()			|
 *		|				|
 *		v				|
 *	    [freelist] -------> (found?) -------+
 *		|				|
 *		v				|
 *	    (rudimentary object)		|
 *	  mcache_alloc/mcache_alloc_ext() ------>>-----+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to the
 * caller.  As part of this step, the routine will also record the transaction
 * and pattern-fill the buffers with the BADDCAFE (uninitialized) pattern.  It
 * will also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *		|				^
 *		|				|
 *		|	+------ (done) ---------+
 *		v	|
 *	  mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	  mbuf_slab_audit()			|
 *		|				|
 *		v				|
 *	    [CPU cache] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	  mbuf_slab_free()			|
 *		|				|
 *		v				|
 *	    [freelist] ----------->>------------+
 *	    (objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *		|				^
 *		|				|
 *		|	+------ (done) ---------+
 *		v	|
 *	  mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	  mbuf_cslab_audit()			|
 *		|				|
 *		v				|
 *	    [CPU cache] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	  mbuf_cslab_free()			|
 *		|				|
 *		v				|
 *	    [freelist] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	    (rudimentary object)		|
 *	  mcache_free/mcache_free_ext() ------->>------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with the DEADBEEF (free) pattern and recording the
 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 * expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Note
 * that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOCL(addr)			+-------------+
 *	      |		+---------->	| cl_audit[1] | -----> mcache_audit_t
 *	b = CLTOM(i)	|		+-------------+
 *	      |		|		|     ...     |
 *	x = MCLIDX(b, addr)	|	+-------------+
 *	      |		|		| cl_audit[7] |
 *	      +-----------------+	+-------------+
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a cluster
 * can be turned into NMBPCL number of mbufs, we preserve enough space for the
 * mbufs so that there is a 1-to-1 mapping between them.  A cluster that never
 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For big clusters, only one entry is allocated
 * and used for the entire cluster pair.
 */
/* TODO: should be in a header file */
/* kernel translator */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

static lck_mtx_t *mbuf_mlock;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;
/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16K cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache;	/* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized; /* number of packets "normalized" */
static unsigned int mbuf_gscale; /* Power-of-two growth scale for m_howmany */

#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	4	/* Threshold: 15/16 of total */
typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4K) cluster */
	MC_16KCL,	/* Jumbo (16K) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4K) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16K) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
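
/*
 * For illustration: with the enum ordering above, MBUF_CLASS_COMPOSITE()
 * is true exactly for MC_MBUF_CL, MC_MBUF_BIGCL and MC_MBUF_16KCL -- the
 * classes that pair an mbuf with an attached cluster -- since they all
 * follow MBUF_CLASS_LAST (MC_16KCL) in mbuf_class_t.
 */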
/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back to the class's slab list,
 * if it's not already done.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */
/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> MCLSHIFT)	/* 512 slabs/grp */
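
/*
 * Worked example (not part of the original comment): with MBSHIFT == 20 and
 * MCLSHIFT == 11 (2K clusters), NSLABSPMB is (1MB / 2KB) == 512, which is
 * where the "512 slabs/grp" figure above comes from.
 */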
typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;
/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPCL];	/* array of audits */
} mcl_audit_t;

#if CONFIG_MBUF_NOEXPAND
static unsigned int maxmbufcl;
#endif /* CONFIG_MBUF_NOEXPAND */
/*
 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
 * and m_ext structures.  If auditing is enabled, we allocate a shadow
 * mbuf structure of this size inside each audit structure, and the
 * contents of the real mbuf gets copied into it when the mbuf is freed.
 * This allows us to pattern-fill the mbuf for integrity check, and to
 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
 * Note that we don't save the contents of clusters when they are freed;
 * we simply pattern-fill them.
 */
#define	AUDIT_CONTENTS_SIZE	((MSIZE - MHLEN) + sizeof (_m_ext_t))

/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */
/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mcluster *mbutl;		/* first mapped cluster address */
union mcluster *embutl;		/* ending virtual address of mclusters */
int max_linkhdr;		/* largest link-level header */
int max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

extern u_int32_t high_sb_max;
/* TODO: should be in a header file */

/* The minimum number of objects that are allocated, to start. */
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MCL_LOWAT	MINCL
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL
typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist;	/* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
} mbuf_table_t;
#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
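
/*
 * Note (for clarity): NELEM(mbuf_table) evaluates to the number of classes
 * initialized above (7 entries, one per mbuf_class_t value), and is used
 * below to size mb_stat and omb_stat.
 */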
static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of sleepers */

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static unsigned int m_length(struct mbuf *);
static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */
/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m) \
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
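
/*
 * In other words, an mbuf whose external ref count has dropped to zero but
 * which still carries EXTF_COMPOSITE is a cached composite object: it keeps
 * its cluster attached and is recycled through the composite cache instead
 * of being dismantled into separate mbuf and cluster frees.
 */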
/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain cluster index and base cluster address.
 */
#define	MTOCL(x)	(((char *)(x) - (char *)mbutl) >> MCLSHIFT)
#define	CLTOM(x)	((union mcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to the cluster base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> 8)
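
/*
 * Illustrative example (assuming MCLSHIFT == 11, i.e. 2K clusters, and
 * 256-byte mbufs as implied by the ">> 8" above): for an mbuf at offset
 * 0x1300 from mbutl, MTOCL() yields cluster index 2, CLTOM(2) returns the
 * cluster base at offset 0x1000, and MCLIDX() yields mbuf index 3 within
 * that cluster -- i.e. mclaudit[2].cl_audit[3] in the scheme shown earlier.
 */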
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT(m, pkthdr, type) {					\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		(m)->m_pkthdr.rcvif = NULL;				\
		(m)->m_pkthdr.len = 0;					\
		(m)->m_pkthdr.header = NULL;				\
		(m)->m_pkthdr.csum_flags = 0;				\
		(m)->m_pkthdr.csum_data = 0;				\
		(m)->m_pkthdr.tso_segsz = 0;				\
		(m)->m_pkthdr.vlan_tag = 0;				\
		(m)->m_pkthdr.socket_id = 0;				\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)

/*
 * Macro to convert BSD malloc sleep flag to mcache's.
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
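
/*
 * That is, a caller passing M_WAIT (M_DONTWAIT not set) maps to MCR_SLEEP,
 * while M_DONTWAIT maps to MCR_NOSLEEP at the mcache layer.
 */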
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to the mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
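
/*
 * Both macros above are an offsetof-style computation: they yield the byte
 * size of a stats structure whose trailing mbs_class[] array holds n
 * entries, so the buffers can be sized for exactly NELEM(mbuf_table)
 * classes.
 */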
/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to the outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During a sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater than
 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))

/* This should be in a header file */
#define	atomic_add_16(a, n)	((void) OSAddAtomic16(n, a))
#define	atomic_add_32(a, n)	((void) OSAddAtomic(n, a))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
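
/*
 * Usage note (for clarity): mtype_stat_inc()/mtype_stat_dec() are the
 * intended entry points when an mbuf changes m_type.  Common types below
 * MT_MAX go to the calling CPU's counters via MTYPES_CPU(); anything else
 * falls back to an atomic update of the legacy mbstat.m_mtypes[] array.
 */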
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mtypes_cpu_t mtc;
	int m, n;

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache and mbufs */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}
static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
	} while (!OSCompareAndSwap(old, new, addr));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
static void
mbuf_table_init(void)
{
	int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where the jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters is going to be split in 2 to hold both the 2K
	 * and the 4K pools, so make sure each half is even.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2K clusters, so make
		 * sure that the pool size is evenly divisible by 8.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
	}

#if CONFIG_MBUF_NOEXPAND
	/* Only use 4k clusters if we're setting aside more than 256k */
	if (nmbclusters <= 128) {
		maxmbufcl = nmbclusters / 4;
	} else {
		/* Half to big clusters, half to small */
		maxmbufcl = (nmbclusters / 4) * 3;
	}
#endif /* CONFIG_MBUF_NOEXPAND */
	/*
	 * 1/2 of the map is reserved for 2K clusters.  Out of this, 1/16th
	 * of the total number of 2K clusters allocated is reserved and cannot
	 * be turned into mbufs.  It can only be used for pure cluster objects.
	 */
	m_minlimit(MC_CL) = (nclusters >> 5);
	m_maxlimit(MC_CL) = (nclusters >> 1);
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * The remaining (15/16th) can be turned into mbufs.
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * The other 1/2 of the map is reserved for 4K clusters.
	 */
	m_minlimit(MC_BIGCL) = 0;
	m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = NBPG;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> 3);
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (64 << MBSHIFT)  /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
__private_extern__ unsigned int
mbuf_default_ncl(int srv, uint64_t mem)
{
#if !defined(__LP64__)
#pragma unused(srv)
	unsigned int n;
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int n, i;
	ncl_tbl_t *tbl = (srv ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	int initmcl = MINCL;
	void *buf;
	thread_t thread = THREAD_NULL;

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);

	/* Allocate cluster slabs table */
	maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/* Allocate audit structures if needed */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_AUDIT) {
		MALLOC(mclaudit, mcl_audit_t *,
		    nmbclusters * sizeof (*mclaudit), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
	    M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));

	embutl = (union mcluster *)
	    ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));

	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));

	lck_mtx_lock(mbuf_mlock);

	if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
		panic("mbinit: m_clalloc failed\n");

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	mbuf_gscale = MB_GROWTH_NORMAL;

	/*
	 * Set the max limit on sb_max to be 1/16th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16 MB of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1 MB of mbuf pool, cap the
			 * size of the max sock buf at 1 MB.
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	printf("mbinit: done (%d MB memory set for mbuf pool)\n",
	    (nmbclusters << MCLSHIFT) >> MBSHIFT);
}
/*
 * Obtain a slab of object(s) from the class's freelist.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having a longer lifespan by using
	 * a slab from the reverse direction, in the hope that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunk (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if (class == MC_MBUF && (wait & MCR_COMP))
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	else
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return (NULL);
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));

	if (class == MC_MBUF) {
		sp->sl_head = buf->obj_next;
		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
	} else {
		sp->sl_head = NULL;
	}
	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
	}

	/* Increment slab reference */
	sp->sl_refcnt++;

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF)
			mca->mca_uflags |= MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mcl_slab_t *nsp = sp->sl_next;
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * Increment 2nd slab.  A 4K big cluster takes
		 * 2 slabs, each having at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
		/* Next slab must already be present */
		VERIFY(nsp != NULL);
		nsp->sl_refcnt++;
		VERIFY(!slab_is_detached(nsp));
		VERIFY(nsp->sl_class == MC_BIGCL &&
		    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
		    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
		    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
		    nsp->sl_head == NULL);
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-8th slab.  A 16K big cluster takes
		 * 8 cluster slabs, each having at most 1 reference.
		 */
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		ASSERT(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL)
			_MCHECK((struct mbuf *)buf);
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 2K cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPCL at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 &&
		    (unsigned short)sp->sl_refcnt <= NMBPCL &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
		    sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
		slab_remove(sp, class);
	}

	return (buf);
}
/*
 * Place a slab of object(s) back into a class's slab list.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);
	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	if (class == MC_CL || class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A 2K cluster slab can have at most 1 reference
		 * which must be 0 at this point.
		 */
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		if (class == MC_BIGCL) {
			mcl_slab_t *nsp = sp->sl_next;
			VERIFY(IS_P2ALIGNED(buf, NBPG));
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			/* Decrement 2nd slab reference */
			nsp->sl_refcnt--;
			/*
			 * A 4K big cluster takes 2 slabs, both
			 * must now have 0 reference.
			 */
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_BIGCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16K cluster takes 8 cluster slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, NBPG));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * An mbuf slab has a total of NMBPCL reference counts.
		 * Since we have decremented the reference above, it
		 * must now be between 0 and NMBPCL-1.
		 */
		VERIFY(sp->sl_refcnt >= 0 &&
		    (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
		mca->mca_uflags &= ~MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/* All mbufs are freed; return the cluster that we stole earlier */
	if (sp->sl_refcnt == 0 && class == MC_MBUF) {
		int i = NMBPCL;

		m_total(MC_MBUF) -= NMBPCL;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPCL;
		mtype_stat_add(MT_FREE, -((unsigned)NMBPCL));

		while (i--) {
			struct mbuf *m = sp->sl_head;
			VERIFY(m != NULL);
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		VERIFY(sp->sl_head == NULL);

		/* Remove the slab from the mbuf class's slab list */
		slab_remove(sp, class);

		/* Reinitialize it as a 2K cluster slab */
		slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
		    sp->sl_len, 0, 1);

		if (mclaudit != NULL)
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_head, m_maxsize(MC_CL));

		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);

		VERIFY(slab_is_detached(sp));
		/* And finally switch class */
		class = MC_CL;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp))
		slab_insert(sp, class);
}
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			if (--need == 0) {
				/*
				 * If the number of elements in freelist has
				 * dropped below low watermark, asynchronously
				 * populate the freelist now rather than doing
				 * it later when we run out of elements.
				 */
				if (!mbuf_cached_above(class, wait) &&
				    m_infree(class) < m_total(class) >> 5) {
					(void) freelist_populate(class, 1,
					    M_DONTWAIT);
				}
				break;
			}
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0)
				continue;

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait))
				break;

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait))
				break;

			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return (num - need);
}
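
/*
 * Note (for clarity): like the other slab-layer allocators in this file,
 * the routine above returns the number of objects actually placed on
 * *plist, which may be fewer than "num" when the freelist cannot be
 * replenished; the caller treats a short count as a partial allocation.
 */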
/*
 * Common de-allocator for rudimentary objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * class's freelist.
 */
static void
mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_obj_t *nlist;
	unsigned int num = 0;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;
		slab_free(class, list);
		++num;
		if ((list = nlist) == NULL)
			break;
	}
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for rudimentary objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	while (list != NULL) {
		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(class, list);

		/* Do the sanity checks */
		if (class == MC_MBUF) {
			mcl_audit_mbuf(mca, list, FALSE, alloc);
			ASSERT(mca->mca_uflags & MB_SCVALID);
		} else {
			mcl_audit_cluster(mca, list, m_maxsize(class),
			    alloc, TRUE);
			ASSERT(!(mca->mca_uflags & MB_SCVALID));
		}
		/* Record this transaction */
		mcache_buffer_log(mca, list, m_cache(class));
		if (alloc)
			mca->mca_uflags |= MB_INUSE;
		else
			mca->mca_uflags &= ~MB_INUSE;
		/* Unpair the object (unconditionally) */
		mca->mca_uptr = NULL;
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
/*
 * Common notify routine for all caches.  It is called by mcache when
 * one or more objects get freed.  We use this indication to trigger
 * the wakeup of any sleeping threads so that they can retry their
 * allocation requests.
 */
static void
mbuf_slab_notify(void *arg, u_int32_t reason)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	int w;

	ASSERT(MBUF_CLASS_VALID(class));

	if (reason != MCN_RETRYALLOC)
		return;

	lck_mtx_lock(mbuf_mlock);
	if ((w = mb_waiters) > 0) {
		m_notified(class)++;
		mb_waiters = 0;
	}
	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
1734 * Obtain object(s) from the composite class's freelist.
1737 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
1739 unsigned int need = num;
1740 mcl_slab_t *sp, *clsp, *nsp;
1742 mcache_obj_t **list = *plist;
1746 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1747 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1749 /* Get what we can from the freelist */
1750 while ((*list = m_cobjlist(class)) != NULL) {
1753 m = (struct mbuf *)*list;
1755 cl = m->m_ext.ext_buf;
1756 clsp = slab_get(cl);
1757 VERIFY(m->m_flags == M_EXT && cl != NULL);
1758 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
1759 VERIFY(clsp->sl_refcnt == 1);
1760 if (class == MC_MBUF_BIGCL) {
1761 nsp = clsp->sl_next;
1762 /* Next slab must already be present */
1763 VERIFY(nsp != NULL);
1764 VERIFY(nsp->sl_refcnt == 1);
1765 } else if (class == MC_MBUF_16KCL) {
1767 for (nsp = clsp, k = 1;
1768 k < (M16KCLBYTES / MCLBYTES); k++) {
1770 /* Next slab must already be present */
1771 VERIFY(nsp != NULL);
1772 VERIFY(nsp->sl_refcnt == 1);
1776 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
1777 !MBUF_IN_MAP(m_cobjlist(class))) {
1778 slab_nextptr_panic(sp, m_cobjlist(class));
1781 (*list)->obj_next = NULL;
1782 list = *plist = &(*list)->obj_next;
1787 m_infree(class) -= (num - need);
1789 return (num - need);
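/*
 * Editor's note: a minimal sketch, not from the original source, of the
 * "*plist" convention used by cslab_alloc() above: the caller passes the
 * address of its list tail, each object handed back is linked through
 * obj_next, and the tail pointer is advanced so objects append in order.
 * The function name is hypothetical.
 */
#if 0	/* illustrative only; mcache_obj_t as declared in the mcache headers */
static unsigned int
example_append_objects(mcache_obj_t ***plist, mcache_obj_t *objs,
    unsigned int num)
{
	mcache_obj_t **list = *plist;
	unsigned int cnt = 0;

	while (objs != NULL && cnt < num) {
		*list = objs;			/* hand object to the caller */
		objs = objs->obj_next;
		(*list)->obj_next = NULL;	/* detach from the source list */
		list = *plist = &(*list)->obj_next; /* advance the tail */
		cnt++;
	}
	return (cnt);
}
#endif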
1793 * Place object(s) back into a composite class's freelist.
1796 cslab_free(mbuf_class_t
class, mcache_obj_t
*list
, int purged
)
1798 mcache_obj_t
*o
, *tail
;
1799 unsigned int num
= 0;
1800 struct mbuf
*m
, *ms
;
1801 mcache_audit_t
*mca
= NULL
;
1802 mcache_obj_t
*ref_list
= NULL
;
1803 mcl_slab_t
*clsp
, *nsp
;
1806 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1807 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
1808 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
1812 while ((m
= ms
= (struct mbuf
*)o
) != NULL
) {
1813 mcache_obj_t
*rfa
, *nexto
= o
->obj_next
;
1815 /* Do the mbuf sanity checks */
1816 if (mclaudit
!= NULL
) {
1817 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
1818 mcache_audit_free_verify(mca
, m
, 0, m_maxsize(MC_MBUF
));
1819 ms
= (struct mbuf
*)mca
->mca_contents
;
1822 /* Do the cluster sanity checks */
1823 cl
= ms
->m_ext
.ext_buf
;
1824 clsp
= slab_get(cl
);
1825 if (mclaudit
!= NULL
) {
1827 if (class == MC_MBUF_CL
)
1828 size
= m_maxsize(MC_CL
);
1829 else if (class == MC_MBUF_BIGCL
)
1830 size
= m_maxsize(MC_BIGCL
);
1832 size
= m_maxsize(MC_16KCL
);
1833 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL
,
1834 (mcache_obj_t
*)cl
), cl
, 0, size
);
1836 VERIFY(ms
->m_type
== MT_FREE
);
1837 VERIFY(ms
->m_flags
== M_EXT
);
1838 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
1839 VERIFY(clsp
->sl_refcnt
== 1);
1840 if (class == MC_MBUF_BIGCL
) {
1841 nsp
= clsp
->sl_next
;
1842 /* Next slab must already be present */
1843 VERIFY(nsp
!= NULL
);
1844 VERIFY(nsp
->sl_refcnt
== 1);
1845 } else if (class == MC_MBUF_16KCL
) {
1847 for (nsp
= clsp
, k
= 1;
1848 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
1850 /* Next slab must already be present */
1851 VERIFY(nsp
!= NULL
);
1852 VERIFY(nsp
->sl_refcnt
== 1);
1857 * If we're asked to purge, restore the actual mbuf using
1858 * contents of the shadow structure (if auditing is enabled)
1859 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
1860 * about to free it and the attached cluster into their caches.
1863 /* Restore constructed mbuf fields */
1864 if (mclaudit
!= NULL
)
1865 mcl_audit_restore_mbuf(m
, mca
, TRUE
);
1870 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
1871 rfa
->obj_next
= ref_list
;
1875 m
->m_type
= MT_FREE
;
1876 m
->m_flags
= m
->m_len
= 0;
1877 m
->m_next
= m
->m_nextpkt
= NULL
;
1879 /* Save mbuf fields and make auditing happy */
1880 if (mclaudit
!= NULL
)
1881 mcl_audit_mbuf(mca
, o
, FALSE
, FALSE
);
1883 VERIFY(m_total(class) > 0);
1888 slab_free(MC_MBUF
, o
);
1890 /* And free the cluster */
1891 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
1892 if (class == MC_MBUF_CL
)
1893 slab_free(MC_CL
, cl
);
1894 else if (class == MC_MBUF_BIGCL
)
1895 slab_free(MC_BIGCL
, cl
);
1897 slab_free(MC_16KCL
, cl
);
1906 tail
->obj_next
= m_cobjlist(class);
1907 m_cobjlist(class) = list
;
1908 m_infree(class) += num
;
1909 } else if (ref_list
!= NULL
) {
1910 mcache_free_ext(ref_cache
, ref_list
);
1917 * Common allocator for composite objects called by the CPU cache layer
1918 * during an allocation request whenever there is no available element in
1919 * the bucket layer. It returns one or more composite elements from the
1920 * appropriate global freelist. If the freelist is empty, it will attempt
1921 * to obtain the rudimentary objects from their caches and construct them
1922 * into composite mbuf + cluster objects.
1925 mbuf_cslab_alloc(void *arg
, mcache_obj_t
***plist
, unsigned int needed
,
1928 mbuf_class_t
class = (mbuf_class_t
)arg
;
1929 mcache_t
*cp
= NULL
;
1930 unsigned int num
= 0, cnum
= 0, want
= needed
;
1931 mcache_obj_t
*ref_list
= NULL
;
1932 mcache_obj_t
*mp_list
= NULL
;
1933 mcache_obj_t
*clp_list
= NULL
;
1934 mcache_obj_t
**list
;
1935 struct ext_ref
*rfa
;
1939 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1942 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
1944 /* There should not be any slab for this class */
1945 VERIFY(m_slab_cnt(class) == 0 &&
1946 m_slablist(class).tqh_first
== NULL
&&
1947 m_slablist(class).tqh_last
== NULL
);
1949 lck_mtx_lock(mbuf_mlock
);
1951 /* Try using the freelist first */
1952 num
= cslab_alloc(class, plist
, needed
);
1954 if (num
== needed
) {
1955 m_alloc_cnt(class) += num
;
1956 lck_mtx_unlock(mbuf_mlock
);
1960 lck_mtx_unlock(mbuf_mlock
);
1963 * We could not satisfy the request using the freelist alone;
1964 * allocate from the appropriate rudimentary caches and use
1965 * whatever we can get to construct the composite objects.
1970 * Mark these allocation requests as coming from a composite cache.
1971 * Also, if the caller is willing to be blocked, mark the request
1972 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
1973 * slab layer waiting for the individual object when one or more
1974 * of the already-constructed composite objects are available.
1977 if (!(wait
& MCR_NOSLEEP
))
1980 needed
= mcache_alloc_ext(m_cache(MC_MBUF
), &mp_list
, needed
, wait
);
1982 ASSERT(mp_list
== NULL
);
1985 if (class == MC_MBUF_CL
)
1986 cp
= m_cache(MC_CL
);
1987 else if (class == MC_MBUF_BIGCL
)
1988 cp
= m_cache(MC_BIGCL
);
1990 cp
= m_cache(MC_16KCL
);
1991 needed
= mcache_alloc_ext(cp
, &clp_list
, needed
, wait
);
1993 ASSERT(clp_list
== NULL
);
1996 needed
= mcache_alloc_ext(ref_cache
, &ref_list
, needed
, wait
);
1998 ASSERT(ref_list
== NULL
);
2003 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
2004 * overs will get freed accordingly before we return to caller.
2006 for (cnum
= 0; cnum
< needed
; cnum
++) {
2009 m
= ms
= (struct mbuf
*)mp_list
;
2010 mp_list
= mp_list
->obj_next
;
2013 clp_list
= clp_list
->obj_next
;
2014 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
2016 rfa
= (struct ext_ref
*)ref_list
;
2017 ref_list
= ref_list
->obj_next
;
2018 ((mcache_obj_t
*)rfa
)->obj_next
= NULL
;
2021 * If auditing is enabled, construct the shadow mbuf
2022 * in the audit structure instead of in the actual one.
2023 * mbuf_cslab_audit() will take care of restoring the
2024 * contents after the integrity check.
2026 if (mclaudit
!= NULL
) {
2027 mcache_audit_t
*mca
, *cl_mca
;
2030 lck_mtx_lock(mbuf_mlock
);
2031 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2032 ms
= ((struct mbuf
*)mca
->mca_contents
);
2033 cl_mca
= mcl_audit_buf2mca(MC_CL
, (mcache_obj_t
*)cl
);
2036 * Pair them up. Note that this is done at the time
2037 * the mbuf+cluster objects are constructed. This
2038 * information should be treated as "best effort"
2039 * debugging hint since more than one mbuf can refer
2040 * to a cluster. In that case, the cluster might not
2041 * be freed along with the mbuf it was paired with.
2043 mca
->mca_uptr
= cl_mca
;
2044 cl_mca
->mca_uptr
= mca
;
2046 ASSERT(mca
->mca_uflags
& MB_SCVALID
);
2047 ASSERT(!(cl_mca
->mca_uflags
& MB_SCVALID
));
2048 lck_mtx_unlock(mbuf_mlock
);
2050 /* Technically, they are in the freelist */
2051 mcache_set_pattern(MCACHE_FREE_PATTERN
, m
,
2052 m_maxsize(MC_MBUF
));
2053 if (class == MC_MBUF_CL
)
2054 size
= m_maxsize(MC_CL
);
2055 else if (class == MC_MBUF_BIGCL
)
2056 size
= m_maxsize(MC_BIGCL
);
2058 size
= m_maxsize(MC_16KCL
);
2059 mcache_set_pattern(MCACHE_FREE_PATTERN
, cl
, size
);
2062 MBUF_INIT(ms
, 0, MT_FREE
);
2063 if (class == MC_MBUF_16KCL
) {
2064 MBUF_16KCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2065 } else if (class == MC_MBUF_BIGCL
) {
2066 MBUF_BIGCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2068 MBUF_CL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2070 VERIFY(ms
->m_flags
== M_EXT
);
2071 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2073 *list
= (mcache_obj_t
*)m
;
2074 (*list
)->obj_next
= NULL
;
2075 list
= *plist
= &(*list
)->obj_next
;
2080 * Free up what's left of the above.
2082 if (mp_list
!= NULL
)
2083 mcache_free_ext(m_cache(MC_MBUF
), mp_list
);
2084 if (clp_list
!= NULL
)
2085 mcache_free_ext(cp
, clp_list
);
2086 if (ref_list
!= NULL
)
2087 mcache_free_ext(ref_cache
, ref_list
);
2089 lck_mtx_lock(mbuf_mlock
);
2090 if (num
> 0 || cnum
> 0) {
2091 m_total(class) += cnum
;
2092 VERIFY(m_total(class) <= m_maxlimit(class));
2093 m_alloc_cnt(class) += num
+ cnum
;
2095 if ((num
+ cnum
) < want
)
2096 m_fail_cnt(class) += (want
- (num
+ cnum
));
2097 lck_mtx_unlock(mbuf_mlock
);
2099 return (num
+ cnum
);
2103 * Common de-allocator for composite objects called by the CPU cache
2104 * layer when one or more elements need to be returned to the appropriate
2108 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2110 mbuf_class_t class = (mbuf_class_t)arg;
2114 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2116 lck_mtx_lock(mbuf_mlock);
2118 num = cslab_free(class, list, purged);
2119 m_free_cnt(class) += num;
2121 if ((w = mb_waiters) > 0)
2124 lck_mtx_unlock(mbuf_mlock);
2127 wakeup(mb_waitchan);
2131 * Common auditor for composite objects called by the CPU cache layer
2132 * during an allocation or free request. For the former, this is called
2133 * after the objects are obtained from either the bucket or slab layer
2134 * and before they are returned to the caller. For the latter, this is
2135 * called immediately during free and before placing the objects into
2136 * the bucket or slab layer.
2139 mbuf_cslab_audit(void *arg
, mcache_obj_t
*list
, boolean_t alloc
)
2141 mbuf_class_t
class = (mbuf_class_t
)arg
;
2142 mcache_audit_t
*mca
;
2143 struct mbuf
*m
, *ms
;
2144 mcl_slab_t
*clsp
, *nsp
;
2148 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2150 while ((m
= ms
= (struct mbuf
*)list
) != NULL
) {
2151 lck_mtx_lock(mbuf_mlock
);
2152 /* Do the mbuf sanity checks and record its transaction */
2153 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2154 mcl_audit_mbuf(mca
, m
, TRUE
, alloc
);
2155 mcache_buffer_log(mca
, m
, m_cache(class));
2157 mca
->mca_uflags
|= MB_COMP_INUSE
;
2159 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2162 * Use the shadow mbuf in the audit structure if we are
2163 * freeing, since the contents of the actual mbuf has been
2164 * pattern-filled by the above call to mcl_audit_mbuf().
2167 ms
= (struct mbuf
*)mca
->mca_contents
;
2169 /* Do the cluster sanity checks and record its transaction */
2170 cl
= ms
->m_ext
.ext_buf
;
2171 clsp
= slab_get(cl
);
2172 VERIFY(ms
->m_flags
== M_EXT
&& cl
!= NULL
);
2173 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2174 VERIFY(clsp
->sl_refcnt
== 1);
2175 if (class == MC_MBUF_BIGCL
) {
2176 nsp
= clsp
->sl_next
;
2177 /* Next slab must already be present */
2178 VERIFY(nsp
!= NULL
);
2179 VERIFY(nsp
->sl_refcnt
== 1);
2180 } else if (class == MC_MBUF_16KCL
) {
2182 for (nsp
= clsp
, k
= 1;
2183 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2185 /* Next slab must already be present */
2186 VERIFY(nsp
!= NULL
);
2187 VERIFY(nsp
->sl_refcnt
== 1);
2191 mca
= mcl_audit_buf2mca(MC_CL
, cl
);
2192 if (class == MC_MBUF_CL
)
2193 size
= m_maxsize(MC_CL
);
2194 else if (class == MC_MBUF_BIGCL
)
2195 size
= m_maxsize(MC_BIGCL
);
2197 size
= m_maxsize(MC_16KCL
);
2198 mcl_audit_cluster(mca
, cl
, size
, alloc
, FALSE
);
2199 mcache_buffer_log(mca
, cl
, m_cache(class));
2201 mca
->mca_uflags
|= MB_COMP_INUSE
;
2203 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2204 lck_mtx_unlock(mbuf_mlock
);
2206 list
= list
->obj_next
;
2211 * Allocate some number of mbuf clusters and place on cluster freelist.
2214 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2218 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2219 vm_offset_t page = 0;
2220 mcache_audit_t *mca_list = NULL;
2221 mcache_obj_t *con_list = NULL;
2224 VERIFY(bufsize == m_maxsize(MC_CL) ||
2225 bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));
2227 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2230 * Multiple threads may attempt to populate the cluster map one
2231 * after another. Since we drop the lock below prior to acquiring
2232 * the physical page(s), our view of the cluster map may no longer
2233 * be accurate, and we could end up over-committing the pages beyond
2234 * the maximum allowed for each class. To prevent it, this entire
2235 * operation (including the page mapping) is serialized.
2237 while (mb_clalloc_busy) {
2238 mb_clalloc_waiters++;
2239 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2240 (PZERO-1), "m_clalloc", NULL);
2241 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2244 /* We are busy now; tell everyone else to go away */
2245 mb_clalloc_busy = TRUE;
2248 * Honor the caller's wish to block or not block. We have a way
2249 * to grow the pool asynchronously using the mbuf worker thread.
2251 i
= m_howmany(num
, bufsize
);
2252 if (i
== 0 || (wait
& M_DONTWAIT
))
2255 lck_mtx_unlock(mbuf_mlock
);
2257 size
= round_page(i
* bufsize
);
2258 page
= kmem_mb_alloc(mb_map
, size
, large_buffer
);
2261 * If we did ask for "n" 16K physically contiguous chunks
2262 * and didn't get them, then please try again without this
2265 if (large_buffer
&& page
== 0)
2266 page
= kmem_mb_alloc(mb_map
, size
, 0);
2269 if (bufsize
<= m_maxsize(MC_BIGCL
)) {
2270 /* Try for 1 page if failed, only for 2KB/4KB request */
2272 page
= kmem_mb_alloc(mb_map
, size
, 0);
2276 lck_mtx_lock(mbuf_mlock
);
2281 VERIFY(IS_P2ALIGNED(page
, NBPG
));
2282 numpages
= size
/ NBPG
;
2284 /* If auditing is enabled, allocate the audit structures now */
2285 if (mclaudit
!= NULL
) {
2289 * Yes, I realize this is a waste of memory for clusters
2290 * that never get transformed into mbufs, as we may end
2291 * up with NMBPCL-1 unused audit structures per cluster.
2292 * But doing so tremendously simplifies the allocation
2293 * strategy, since at this point we are not holding the
2294 * mbuf lock and the caller is okay to be blocked. For
2295 * the case of big clusters, we allocate one structure
2296 * for each as we never turn them into mbufs.
2298 if (bufsize
== m_maxsize(MC_CL
)) {
2299 needed
= numpages
* 2 * NMBPCL
;
2301 i
= mcache_alloc_ext(mcl_audit_con_cache
,
2302 &con_list
, needed
, MCR_SLEEP
);
2304 VERIFY(con_list
!= NULL
&& i
== needed
);
2305 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2308 needed
= numpages
/ (M16KCLBYTES
/ NBPG
);
2311 i
= mcache_alloc_ext(mcache_audit_cache
,
2312 (mcache_obj_t
**)&mca_list
, needed
, MCR_SLEEP
);
2314 VERIFY(mca_list
!= NULL
&& i
== needed
);
2317 lck_mtx_lock(mbuf_mlock
);
2319 for (i
= 0; i
< numpages
; i
++, page
+= NBPG
) {
2320 ppnum_t offset
= ((char *)page
- (char *)mbutl
) / NBPG
;
2321 ppnum_t new_page
= pmap_find_phys(kernel_pmap
,
2325 * In the case of no mapper being available the following
2326 * code noops and returns the input page; if there is a
2327 * mapper the appropriate I/O page is returned.
2329 VERIFY(offset
< mcl_pages
);
2330 new_page
= IOMapperInsertPage(mcl_paddr_base
, offset
, new_page
);
2331 mcl_paddr
[offset
] = new_page
<< PGSHIFT
;
2333 /* Pattern-fill this fresh page */
2334 if (mclaudit
!= NULL
)
2335 mcache_set_pattern(MCACHE_FREE_PATTERN
,
2336 (caddr_t
)page
, NBPG
);
2338 if (bufsize
== m_maxsize(MC_CL
)) {
2339 union mcluster
*mcl
= (union mcluster
*)page
;
2341 /* 1st cluster in the page */
2343 if (mclaudit
!= NULL
)
2344 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2345 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2347 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2348 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2349 mcl
, mcl
, bufsize
, 0, 1);
2351 /* Insert this slab */
2352 slab_insert(sp
, MC_CL
);
2354 /* Update stats now since slab_get() drops the lock */
2355 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2356 m_infree(MC_MBUF_CL
);
2357 mbstat
.m_clusters
= ++m_total(MC_CL
);
2358 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2360 /* 2nd cluster in the page */
2361 sp
= slab_get(++mcl
);
2362 if (mclaudit
!= NULL
)
2363 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2364 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2366 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2367 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2368 mcl
, mcl
, bufsize
, 0, 1);
2370 /* Insert this slab */
2371 slab_insert(sp
, MC_CL
);
2373 /* Update stats now since slab_get() drops the lock */
2374 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2375 m_infree(MC_MBUF_CL
);
2376 mbstat
.m_clusters
= ++m_total(MC_CL
);
2377 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2378 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2379 union mbigcluster
*mbc
= (union mbigcluster
*)page
;
2382 /* One for the entire page */
2384 if (mclaudit
!= NULL
)
2385 mcl_audit_init(mbc
, &mca_list
, NULL
, 0, 1);
2387 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2388 slab_init(sp
, MC_BIGCL
, SLF_MAPPED
,
2389 mbc
, mbc
, bufsize
, 0, 1);
2391 /* 2nd cluster's slab is part of the previous one */
2392 nsp
= slab_get(((union mcluster
*)page
) + 1);
2393 slab_init(nsp
, MC_BIGCL
, SLF_MAPPED
| SLF_PARTIAL
,
2394 mbc
, NULL
, 0, 0, 0);
2396 /* Insert this slab */
2397 slab_insert(sp
, MC_BIGCL
);
2399 /* Update stats now since slab_get() drops the lock */
2400 mbstat
.m_bigclfree
= ++m_infree(MC_BIGCL
) +
2401 m_infree(MC_MBUF_BIGCL
);
2402 mbstat
.m_bigclusters
= ++m_total(MC_BIGCL
);
2403 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2404 } else if ((i
% (M16KCLBYTES
/ NBPG
)) == 0) {
2405 union m16kcluster
*m16kcl
= (union m16kcluster
*)page
;
2410 /* One for the entire 16KB */
2411 sp
= slab_get(m16kcl
);
2412 if (mclaudit
!= NULL
)
2413 mcl_audit_init(m16kcl
, &mca_list
, NULL
, 0, 1);
2415 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2416 slab_init(sp
, MC_16KCL
, SLF_MAPPED
,
2417 m16kcl
, m16kcl
, bufsize
, 0, 1);
2419 /* 2nd-8th cluster's slab is part of the first one */
2420 for (k
= 1; k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2421 nsp
= slab_get(((union mcluster
*)page
) + k
);
2422 VERIFY(nsp
->sl_refcnt
== 0 &&
2423 nsp
->sl_flags
== 0);
2424 slab_init(nsp
, MC_16KCL
,
2425 SLF_MAPPED
| SLF_PARTIAL
,
2426 m16kcl
, NULL
, 0, 0, 0);
2429 /* Insert this slab */
2430 slab_insert(sp
, MC_16KCL
);
2432 /* Update stats now since slab_get() drops the lock */
2433 m_infree(MC_16KCL
)++;
2434 m_total(MC_16KCL
)++;
2435 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
2438 VERIFY(mca_list
== NULL
&& con_list
== NULL
);
2440 /* We're done; let others enter */
2441 mb_clalloc_busy
= FALSE
;
2442 if (mb_clalloc_waiters
> 0) {
2443 mb_clalloc_waiters
= 0;
2444 wakeup(mb_clalloc_waitchan
);
2447 if (bufsize
== m_maxsize(MC_CL
))
2448 return (numpages
<< 1);
2449 else if (bufsize
== m_maxsize(MC_BIGCL
))
2452 VERIFY(bufsize
== m_maxsize(MC_16KCL
));
2453 return (numpages
/ (M16KCLBYTES
/ NBPG
));
2456 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2458 /* We're done; let others enter */
2459 mb_clalloc_busy
= FALSE
;
2460 if (mb_clalloc_waiters
> 0) {
2461 mb_clalloc_waiters
= 0;
2462 wakeup(mb_clalloc_waitchan
);
2466 * When non-blocking we kick a thread if we have to grow the
2467 * pool or if the number of free clusters is less than requested.
2469 if (bufsize
== m_maxsize(MC_CL
)) {
2472 * Remember total number of clusters needed
2475 i
+= m_total(MC_CL
);
2476 if (i
> mbuf_expand_mcl
) {
2477 mbuf_expand_mcl
= i
;
2478 if (mbuf_worker_ready
)
2479 wakeup((caddr_t
)&mbuf_worker_run
);
2483 if (m_infree(MC_CL
) >= num
)
2485 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2488 * Remember total number of 4KB clusters needed
2491 i
+= m_total(MC_BIGCL
);
2492 if (i
> mbuf_expand_big
) {
2493 mbuf_expand_big
= i
;
2494 if (mbuf_worker_ready
)
2495 wakeup((caddr_t
)&mbuf_worker_run
);
2499 if (m_infree(MC_BIGCL
) >= num
)
2504 * Remember total number of 16KB clusters needed
2507 i
+= m_total(MC_16KCL
);
2508 if (i
> mbuf_expand_16k
) {
2509 mbuf_expand_16k
= i
;
2510 if (mbuf_worker_ready
)
2511 wakeup((caddr_t
)&mbuf_worker_run
);
2515 if (m_infree(MC_16KCL
) >= num
)
2522 * Populate the global freelist of the corresponding buffer class.
2525 freelist_populate(mbuf_class_t
class, unsigned int num
, int wait
)
2527 mcache_obj_t
*o
= NULL
;
2530 VERIFY(class == MC_MBUF
|| class == MC_CL
|| class == MC_BIGCL
||
2533 #if CONFIG_MBUF_NOEXPAND
2534 if ((mbstat
.m_mbufs
/ NMBPCL
) >= maxmbufcl
) {
2536 static int printonce
= 1;
2537 if (printonce
== 1) {
2539 printf("m_expand failed, allocated %ld out of %d "
2540 "clusters\n", mbstat
.m_mbufs
/ NMBPCL
,
2546 #endif /* CONFIG_MBUF_NOEXPAND */
2548 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2553 i
= m_clalloc(num
, wait
, m_maxsize(MC_CL
));
2555 /* Respect the 2K clusters minimum limit */
2556 if (m_total(MC_CL
) == m_maxlimit(MC_CL
) &&
2557 m_infree(MC_CL
) <= m_minlimit(MC_CL
)) {
2558 if (class != MC_CL
|| (wait
& MCR_COMP
))
2567 return (m_clalloc(num
, wait
, m_maxsize(class)) != 0);
2575 /* Steal a cluster and cut it up to create NMBPCL mbufs */
2576 if ((o
= slab_alloc(MC_CL
, wait
)) != NULL
) {
2577 struct mbuf
*m
= (struct mbuf
*)o
;
2578 mcache_audit_t
*mca
= NULL
;
2579 mcl_slab_t
*sp
= slab_get(o
);
2581 VERIFY(slab_is_detached(sp
) &&
2582 (sp
->sl_flags
& (SLF_MAPPED
| SLF_PARTIAL
)) == SLF_MAPPED
);
2584 /* Make sure that the cluster is unmolested while in freelist */
2585 if (mclaudit
!= NULL
) {
2586 mca
= mcl_audit_buf2mca(MC_CL
, o
);
2587 mcache_audit_free_verify(mca
, o
, 0, m_maxsize(MC_CL
));
2590 /* Reinitialize it as an mbuf slab */
2591 slab_init(sp
, MC_MBUF
, sp
->sl_flags
, sp
->sl_base
, NULL
,
2592 sp
->sl_len
, 0, NMBPCL
);
2594 VERIFY(m
== (struct mbuf
*)sp
->sl_base
);
2595 VERIFY(sp
->sl_head
== NULL
);
2597 m_total(MC_MBUF
) += NMBPCL
;
2598 mbstat
.m_mbufs
= m_total(MC_MBUF
);
2599 m_infree(MC_MBUF
) += NMBPCL
;
2600 mtype_stat_add(MT_FREE
, NMBPCL
);
2605 * If auditing is enabled, construct the shadow mbuf
2606 * in the audit structure instead of the actual one.
2607 * mbuf_slab_audit() will take care of restoring the
2608 * contents after the integrity check.
2610 if (mclaudit
!= NULL
) {
2612 mca
= mcl_audit_buf2mca(MC_MBUF
,
2614 ms
= ((struct mbuf
*)mca
->mca_contents
);
2615 ms
->m_type
= MT_FREE
;
2617 m
->m_type
= MT_FREE
;
2619 m
->m_next
= sp
->sl_head
;
2620 sp
->sl_head
= (void *)m
++;
2623 /* Insert it into the mbuf class's slab list */
2624 slab_insert(sp
, MC_MBUF
);
2626 if ((i
= mb_waiters
) > 0)
2629 wakeup(mb_waitchan
);
2638 * (Inaccurately) check if it might be worth a trip back to the
2639 * mcache layer due the availability of objects there. We'll
2640 * end up back here if there's nothing up there.
2643 mbuf_cached_above(mbuf_class_t
class, int wait
)
2647 if (wait
& MCR_COMP
)
2648 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)) ||
2649 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2653 if (wait
& MCR_COMP
)
2654 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)));
2658 if (wait
& MCR_COMP
)
2659 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2663 if (wait
& MCR_COMP
)
2664 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL
)));
2677 return (!mcache_bkt_isempty(m_cache(class)));
2681 * If possible, convert constructed objects to raw ones.
2684 mbuf_steal(mbuf_class_t
class, unsigned int num
)
2686 mcache_obj_t
*top
= NULL
;
2687 mcache_obj_t
**list
= &top
;
2688 unsigned int tot
= 0;
2690 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2702 /* Get the required number of constructed objects if possible */
2703 if (m_infree(class) > m_minlimit(class)) {
2704 tot
= cslab_alloc(class, &list
,
2705 MIN(num
, m_infree(class)));
2708 /* And destroy them to get back the raw objects */
2710 (void) cslab_free(class, top
, 1);
2718 return (tot
== num
);
2722 m_reclaim(mbuf_class_t
class, unsigned int num
, boolean_t comp
)
2726 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2728 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2729 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2730 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
2733 * This logic can be made smarter; for now, simply mark
2734 * all other related classes as potential victims.
2738 m_wantpurge(MC_CL
)++;
2739 m_wantpurge(MC_MBUF_CL
)++;
2740 m_wantpurge(MC_MBUF_BIGCL
)++;
2744 m_wantpurge(MC_MBUF
)++;
2746 m_wantpurge(MC_MBUF_CL
)++;
2751 m_wantpurge(MC_MBUF_BIGCL
)++;
2756 m_wantpurge(MC_MBUF_16KCL
)++;
2765 * Run through each marked class and check if we really need to
2766 * purge (and therefore temporarily disable) the per-CPU caches
2767 * layer used by the class. If so, remember the classes since
2768 * we are going to drop the lock below prior to purging.
2770 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
2771 if (m_wantpurge(m
) > 0) {
2774 * Try hard to steal the required number of objects
2775 * from the freelist of other mbuf classes. Only
2776 * purge and disable the per-CPU caches layer when
2777 * we don't have enough; it's the last resort.
2779 if (!mbuf_steal(m
, num
))
2784 lck_mtx_unlock(mbuf_mlock
);
2787 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2790 /* Sigh; we have no other choices but to ask mcache to purge */
2791 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
2792 if ((bmap
& (1 << m
)) &&
2793 mcache_purge_cache(m_cache(m
))) {
2794 lck_mtx_lock(mbuf_mlock
);
2797 lck_mtx_unlock(mbuf_mlock
);
2802 * Request mcache to reap extra elements from all of its caches;
2803 * note that all reaps are serialized and happen only at a fixed
2808 lck_mtx_lock(mbuf_mlock
);
2811 static inline struct mbuf *
2812 m_get_common(int wait, short type, int hdr)
2815 int mcflags = MSLEEPF(wait);
2817 /* Is this due to a non-blocking retry? If so, then try harder */
2818 if (mcflags & MCR_NOSLEEP)
2819 mcflags |= MCR_TRYHARD;
2821 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
2823 MBUF_INIT(m, hdr, type);
2824 mtype_stat_inc(type);
2825 mtype_stat_dec(MT_FREE);
2827 if (hdr && mac_init_mbuf(m, wait) != 0) {
2831 #endif /* MAC_NET */
2837 * Space allocation routines; these are also available as macros
2838 * for critical paths.
2840 #define _M_GET(wait, type) m_get_common(wait, type, 0)
2841 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
2842 #define _M_RETRY(wait, type) _M_GET(wait, type)
2843 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
2844 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
2845 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
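/*
 * Editor's note: illustrative usage of the allocators defined above, not
 * part of the original file.  m_gethdr()/m_get() return NULL when the wait
 * flag is M_DONTWAIT and no mbuf is available, so callers must check the
 * result.  The function name is hypothetical.
 */
#if 0	/* illustrative only */
static struct mbuf *
example_get_and_release(void)
{
	struct mbuf *m;

	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
		return (NULL);		/* allocation may fail without blocking */

	m->m_pkthdr.len = m->m_len = 0;	/* start with an empty packet */
	/* ... fill in data ... */
	(void) m_free(m);		/* returns m->m_next, NULL here */
	return (NULL);
}
#endif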
2848 m_get(int wait, int type)
2850 return (_M_GET(wait, type));
2854 m_gethdr(int wait, int type)
2856 return (_M_GETHDR(wait, type));
2860 m_retry(int wait, int type)
2862 return (_M_RETRY(wait, type));
2866 m_retryhdr(int wait, int type)
2868 return (_M_RETRYHDR(wait, type));
2872 m_getclr(int wait, int type)
2876 _MGET(m, wait, type);
2878 bzero(MTOD(m, caddr_t), MLEN);
2883 m_free(struct mbuf *m)
2885 struct mbuf *n = m->m_next;
2887 if (m->m_type == MT_FREE)
2888 panic("m_free: freeing an already freed mbuf");
2890 /* Free the aux data and tags if there is any */
2891 if (m->m_flags & M_PKTHDR) {
2892 m_tag_delete_chain(m, NULL);
2895 if (m->m_flags & M_EXT) {
2899 refcnt = m_decref(m);
2900 flags = MEXT_FLAGS(m);
2901 if (refcnt == 0 && flags == 0) {
2902 if (m->m_ext.ext_free == NULL) {
2903 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2904 } else if (m->m_ext.ext_free == m_bigfree) {
2905 mcache_free(m_cache(MC_BIGCL),
2907 } else if (m->m_ext.ext_free == m_16kfree) {
2908 mcache_free(m_cache(MC_16KCL),
2911 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2912 m->m_ext.ext_size, m->m_ext.ext_arg);
2914 mcache_free(ref_cache, MEXT_RFA(m));
2916 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2917 VERIFY(m->m_type != MT_FREE);
2919 mtype_stat_dec(m->m_type);
2920 mtype_stat_inc(MT_FREE);
2922 m->m_type = MT_FREE;
2925 m->m_next = m->m_nextpkt = NULL;
2927 /* "Free" into the intermediate cache */
2928 if (m->m_ext.ext_free == NULL) {
2929 mcache_free(m_cache(MC_MBUF_CL), m);
2930 } else if (m->m_ext.ext_free == m_bigfree) {
2931 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2933 VERIFY(m->m_ext.ext_free == m_16kfree);
2934 mcache_free(m_cache(MC_MBUF_16KCL), m);
2940 if (m->m_type != MT_FREE) {
2941 mtype_stat_dec(m->m_type);
2942 mtype_stat_inc(MT_FREE);
2945 m->m_type = MT_FREE;
2946 m->m_flags = m->m_len = 0;
2947 m->m_next = m->m_nextpkt = NULL;
2949 mcache_free(m_cache(MC_MBUF), m);
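/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Because m_free() returns the next mbuf in the chain, an entire chain can
 * be released with a simple loop; m_freem(), later in this file, is
 * effectively this loop.  The function name is hypothetical.
 */
#if 0	/* illustrative only */
static void
example_free_chain(struct mbuf *m)
{
	while (m != NULL)
		m = m_free(m);	/* m_free() hands back m->m_next */
}
#endif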
2954 __private_extern__
struct mbuf
*
2955 m_clattach(struct mbuf
*m
, int type
, caddr_t extbuf
,
2956 void (*extfree
)(caddr_t
, u_int
, caddr_t
), u_int extsize
, caddr_t extarg
,
2959 struct ext_ref
*rfa
= NULL
;
2961 if (m
== NULL
&& (m
= _M_GETHDR(wait
, type
)) == NULL
)
2964 if (m
->m_flags
& M_EXT
) {
2968 refcnt
= m_decref(m
);
2969 flags
= MEXT_FLAGS(m
);
2970 if (refcnt
== 0 && flags
== 0) {
2971 if (m
->m_ext
.ext_free
== NULL
) {
2972 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
2973 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2974 mcache_free(m_cache(MC_BIGCL
),
2976 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
2977 mcache_free(m_cache(MC_16KCL
),
2980 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
2981 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
2983 /* Re-use the reference structure */
2985 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
2986 VERIFY(m
->m_type
!= MT_FREE
);
2988 mtype_stat_dec(m
->m_type
);
2989 mtype_stat_inc(MT_FREE
);
2991 m
->m_type
= MT_FREE
;
2994 m
->m_next
= m
->m_nextpkt
= NULL
;
2995 /* "Free" into the intermediate cache */
2996 if (m
->m_ext
.ext_free
== NULL
) {
2997 mcache_free(m_cache(MC_MBUF_CL
), m
);
2998 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2999 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
3001 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3002 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
3005 * Allocate a new mbuf, since we didn't divorce
3006 * the composite mbuf + cluster pair above.
3008 if ((m
= _M_GETHDR(wait
, type
)) == NULL
)
3014 (rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
) {
3019 MEXT_INIT(m
, extbuf
, extsize
, extfree
, extarg
, rfa
, 1, 0);
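/*
 * Editor's note: hedged sketch, not part of the original file, of how a
 * caller might attach its own external buffer with m_clattach().  The
 * buffer, size, free routine, and argument names below are hypothetical,
 * and the final (wait-flag) parameter is assumed here because it is elided
 * from this extract.
 */
#if 0	/* illustrative only; my_buf_free/my_buf/my_arg are hypothetical */
static void
my_buf_free(caddr_t buf, u_int size, caddr_t arg)
{
	/* return the buffer to wherever it came from */
}

static struct mbuf *
example_attach_ext(caddr_t my_buf, u_int my_size, caddr_t my_arg)
{
	struct mbuf *m;

	/* last argument assumed to be the wait flag (elided above) */
	m = m_clattach(NULL, MT_DATA, my_buf, my_buf_free, my_size, my_arg,
	    M_DONTWAIT);
	return (m);	/* NULL on allocation failure */
}
#endif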
3025 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3026 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3029 m_getcl(int wait
, int type
, int flags
)
3032 int mcflags
= MSLEEPF(wait
);
3033 int hdr
= (flags
& M_PKTHDR
);
3035 /* Is this due to a non-blocking retry? If so, then try harder */
3036 if (mcflags
& MCR_NOSLEEP
)
3037 mcflags
|= MCR_TRYHARD
;
3039 m
= mcache_alloc(m_cache(MC_MBUF_CL
), mcflags
);
3041 MBUF_INIT(m
, hdr
, type
);
3042 mtype_stat_inc(type
);
3043 mtype_stat_dec(MT_FREE
);
3045 if (hdr
&& mac_init_mbuf(m
, wait
) != 0) {
3049 #endif /* MAC_NET */
3054 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3056 m_mclget(struct mbuf *m, int wait)
3058 struct ext_ref *rfa;
3060 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3063 m->m_ext.ext_buf = m_mclalloc(wait);
3064 if (m->m_ext.ext_buf != NULL) {
3065 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3067 mcache_free(ref_cache, rfa);
3072 /* Allocate an mbuf cluster */
3074 m_mclalloc(int wait)
3076 int mcflags = MSLEEPF(wait);
3078 /* Is this due to a non-blocking retry? If so, then try harder */
3079 if (mcflags & MCR_NOSLEEP)
3080 mcflags |= MCR_TRYHARD;
3082 return (mcache_alloc(m_cache(MC_CL), mcflags));
3085 /* Free an mbuf cluster */
3087 m_mclfree(caddr_t p)
3089 mcache_free(m_cache(MC_CL), p);
3093 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3097 m_mclhasreference(struct mbuf *m)
3099 if (!(m->m_flags & M_EXT))
3102 ASSERT(MEXT_RFA(m) != NULL);
3104 return (MEXT_REF(m) > 1);
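/*
 * Editor's note: illustrative sketch, not part of the original file, of the
 * usual pattern around m_mclget(): allocate an mbuf, try to attach a 2KB
 * cluster, and check M_EXT to see whether the attach succeeded;
 * m_mclhasreference() then reports whether the cluster is shared.  The
 * function name is hypothetical.
 */
#if 0	/* illustrative only */
static struct mbuf *
example_get_cluster_mbuf(void)
{
	struct mbuf *m;

	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
		return (NULL);

	m = m_mclget(m, M_DONTWAIT);
	if (!(m->m_flags & M_EXT)) {	/* cluster attach failed */
		(void) m_free(m);
		return (NULL);
	}
	VERIFY(!m_mclhasreference(m));	/* freshly attached, not shared */
	return (m);
}
#endif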
3107 __private_extern__ caddr_t
3108 m_bigalloc(int wait
)
3110 int mcflags
= MSLEEPF(wait
);
3112 /* Is this due to a non-blocking retry? If so, then try harder */
3113 if (mcflags
& MCR_NOSLEEP
)
3114 mcflags
|= MCR_TRYHARD
;
3116 return (mcache_alloc(m_cache(MC_BIGCL
), mcflags
));
3119 __private_extern__
void
3120 m_bigfree(caddr_t p
, __unused u_int size
, __unused caddr_t arg
)
3122 mcache_free(m_cache(MC_BIGCL
), p
);
3125 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3126 __private_extern__
struct mbuf
*
3127 m_mbigget(struct mbuf
*m
, int wait
)
3129 struct ext_ref
*rfa
;
3131 if ((rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
)
3134 m
->m_ext
.ext_buf
= m_bigalloc(wait
);
3135 if (m
->m_ext
.ext_buf
!= NULL
) {
3136 MBUF_BIGCL_INIT(m
, m
->m_ext
.ext_buf
, rfa
, 1, 0);
3138 mcache_free(ref_cache
, rfa
);
3143 __private_extern__ caddr_t
3144 m_16kalloc(int wait
)
3146 int mcflags
= MSLEEPF(wait
);
3148 /* Is this due to a non-blocking retry? If so, then try harder */
3149 if (mcflags
& MCR_NOSLEEP
)
3150 mcflags
|= MCR_TRYHARD
;
3152 return (mcache_alloc(m_cache(MC_16KCL
), mcflags
));
3155 __private_extern__
void
3156 m_16kfree(caddr_t p
, __unused u_int size
, __unused caddr_t arg
)
3158 mcache_free(m_cache(MC_16KCL
), p
);
3161 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3162 __private_extern__
struct mbuf
*
3163 m_m16kget(struct mbuf
*m
, int wait
)
3165 struct ext_ref
*rfa
;
3167 if ((rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
)
3170 m
->m_ext
.ext_buf
= m_16kalloc(wait
);
3171 if (m
->m_ext
.ext_buf
!= NULL
) {
3172 MBUF_16KCL_INIT(m
, m
->m_ext
.ext_buf
, rfa
, 1, 0);
3174 mcache_free(ref_cache
, rfa
);
3180 * "Move" mbuf pkthdr from "from" to "to".
3181 * "from" must have M_PKTHDR set, and "to" must be empty.
3184 m_copy_pkthdr(struct mbuf
*to
, struct mbuf
*from
)
3186 /* We will be taking over the tags of 'to' */
3187 if (to
->m_flags
& M_PKTHDR
)
3188 m_tag_delete_chain(to
, NULL
);
3189 to
->m_pkthdr
= from
->m_pkthdr
; /* especially tags */
3190 m_tag_init(from
); /* purge tags from src */
3191 m_prio_init(from
); /* reset priority from src */
3192 to
->m_flags
= (from
->m_flags
& M_COPYFLAGS
) | (to
->m_flags
& M_EXT
);
3193 if ((to
->m_flags
& M_EXT
) == 0)
3194 to
->m_data
= to
->m_pktdat
;
3198 * Duplicate "from"'s mbuf pkthdr in "to".
3199 * "from" must have M_PKTHDR set, and "to" must be empty.
3200 * In particular, this does a deep copy of the packet tags.
3203 m_dup_pkthdr(struct mbuf
*to
, struct mbuf
*from
, int how
)
3205 if (to
->m_flags
& M_PKTHDR
)
3206 m_tag_delete_chain(to
, NULL
);
3207 to
->m_flags
= (from
->m_flags
& M_COPYFLAGS
) | (to
->m_flags
& M_EXT
);
3208 if ((to
->m_flags
& M_EXT
) == 0)
3209 to
->m_data
= to
->m_pktdat
;
3210 to
->m_pkthdr
= from
->m_pkthdr
;
3212 return (m_tag_copy_chain(to
, from
, how
));
3216 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3217 * if wantall is not set, return whatever number were available. Set up the
3218 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3219 * are chained on the m_nextpkt field. Any packets requested beyond this
3220 * are chained onto the last packet header's m_next field. The size of
3221 * the cluster is controlled by the parameter bufsize.
3223 __private_extern__
struct mbuf
*
3224 m_getpackets_internal(unsigned int *num_needed
, int num_with_pkthdrs
,
3225 int wait
, int wantall
, size_t bufsize
)
3228 struct mbuf
**np
, *top
;
3229 unsigned int pnum
, needed
= *num_needed
;
3230 mcache_obj_t
*mp_list
= NULL
;
3231 int mcflags
= MSLEEPF(wait
);
3233 struct ext_ref
*rfa
;
3237 ASSERT(bufsize
== m_maxsize(MC_CL
) ||
3238 bufsize
== m_maxsize(MC_BIGCL
) ||
3239 bufsize
== m_maxsize(MC_16KCL
));
3242 * Caller must first check for njcl because this
3243 * routine is internal and not exposed/used via KPI.
3245 VERIFY(bufsize
!= m_maxsize(MC_16KCL
) || njcl
> 0);
3252 * The caller doesn't want all the requested buffers; only some.
3253 * Try hard to get what we can, but don't block. This effectively
3254 * overrides MCR_SLEEP, since this thread will not go to sleep
3255 * if we can't get all the buffers.
3257 if (!wantall
|| (mcflags
& MCR_NOSLEEP
))
3258 mcflags
|= MCR_TRYHARD
;
3260 /* Allocate the composite mbuf + cluster elements from the cache */
3261 if (bufsize
== m_maxsize(MC_CL
))
3262 cp
= m_cache(MC_MBUF_CL
);
3263 else if (bufsize
== m_maxsize(MC_BIGCL
))
3264 cp
= m_cache(MC_MBUF_BIGCL
);
3266 cp
= m_cache(MC_MBUF_16KCL
);
3267 needed
= mcache_alloc_ext(cp
, &mp_list
, needed
, mcflags
);
3269 for (pnum
= 0; pnum
< needed
; pnum
++) {
3270 m
= (struct mbuf
*)mp_list
;
3271 mp_list
= mp_list
->obj_next
;
3273 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3274 cl
= m
->m_ext
.ext_buf
;
3277 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3278 VERIFY(MBUF_IS_COMPOSITE(m
));
3280 flag
= MEXT_FLAGS(m
);
3282 MBUF_INIT(m
, num_with_pkthdrs
, MT_DATA
);
3283 if (bufsize
== m_maxsize(MC_16KCL
)) {
3284 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3285 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3286 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3288 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3291 if (num_with_pkthdrs
> 0) {
3294 if (mac_mbuf_label_init(m
, wait
) != 0) {
3298 #endif /* MAC_NET */
3302 if (num_with_pkthdrs
> 0)
3307 ASSERT(pnum
!= *num_needed
|| mp_list
== NULL
);
3308 if (mp_list
!= NULL
)
3309 mcache_free_ext(cp
, mp_list
);
3312 mtype_stat_add(MT_DATA
, pnum
);
3313 mtype_stat_sub(MT_FREE
, pnum
);
3316 if (wantall
&& (pnum
!= *num_needed
)) {
3327 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
3328 * wantall is not set, return whatever number were available. The size of
3329 * each mbuf in the list is controlled by the parameter packetlen. Each
3330 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
3331 * in the chain is called a segment. If maxsegments is not null and the
3332 * value pointed to is not null, this specifies the maximum number of segments
3333 * for a chain of mbufs. If maxsegments is zero or the value pointed to
3334 * is zero the caller does not have any restriction on the number of segments.
3335 * The actual number of segments of an mbuf chain is returned in the value
3336 * pointed to by maxsegments.
3338 __private_extern__
struct mbuf
*
3339 m_allocpacket_internal(unsigned int *numlist
, size_t packetlen
,
3340 unsigned int *maxsegments
, int wait
, int wantall
, size_t wantsize
)
3342 struct mbuf
**np
, *top
, *first
= NULL
;
3343 size_t bufsize
, r_bufsize
;
3344 unsigned int num
= 0;
3345 unsigned int nsegs
= 0;
3346 unsigned int needed
, resid
;
3347 int mcflags
= MSLEEPF(wait
);
3348 mcache_obj_t
*mp_list
= NULL
, *rmp_list
= NULL
;
3349 mcache_t
*cp
= NULL
, *rcp
= NULL
;
3357 if (wantsize
== 0) {
3358 if (packetlen
<= MINCLSIZE
) {
3359 bufsize
= packetlen
;
3360 } else if (packetlen
> m_maxsize(MC_CL
)) {
3361 /* Use 4KB if jumbo cluster pool isn't available */
3362 if (packetlen
<= m_maxsize(MC_BIGCL
) || njcl
== 0)
3363 bufsize
= m_maxsize(MC_BIGCL
);
3365 bufsize
= m_maxsize(MC_16KCL
);
3367 bufsize
= m_maxsize(MC_CL
);
3369 } else if (wantsize
== m_maxsize(MC_CL
) ||
3370 wantsize
== m_maxsize(MC_BIGCL
) ||
3371 (wantsize
== m_maxsize(MC_16KCL
) && njcl
> 0)) {
3377 if (bufsize
<= MHLEN
) {
3379 } else if (bufsize
<= MINCLSIZE
) {
3380 if (maxsegments
!= NULL
&& *maxsegments
== 1) {
3381 bufsize
= m_maxsize(MC_CL
);
3386 } else if (bufsize
== m_maxsize(MC_16KCL
)) {
3388 nsegs
= ((packetlen
- 1) >> (PGSHIFT
+ 2)) + 1;
3389 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3390 nsegs
= ((packetlen
- 1) >> PGSHIFT
) + 1;
3392 nsegs
= ((packetlen
- 1) >> MCLSHIFT
) + 1;
3394 if (maxsegments
!= NULL
) {
3395 if (*maxsegments
&& nsegs
> *maxsegments
) {
3396 *maxsegments
= nsegs
;
3399 *maxsegments
= nsegs
;
3403 * The caller doesn't want all the requested buffers; only some.
3404 * Try hard to get what we can, but don't block. This effectively
3405 * overrides MCR_SLEEP, since this thread will not go to sleep
3406 * if we can't get all the buffers.
3408 if (!wantall
|| (mcflags
& MCR_NOSLEEP
))
3409 mcflags
|= MCR_TRYHARD
;
3412 * Simple case where all elements in the lists/chains are mbufs.
3413 * Unless bufsize is greater than MHLEN, each segment chain is made
3414 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3415 * of 2 mbufs; the second one is used for the residual data, i.e.
3416 * the remaining data that cannot fit into the first mbuf.
3418 if (bufsize
<= MINCLSIZE
) {
3419 /* Allocate the elements in one shot from the mbuf cache */
3420 ASSERT(bufsize
<= MHLEN
|| nsegs
== 2);
3421 cp
= m_cache(MC_MBUF
);
3422 needed
= mcache_alloc_ext(cp
, &mp_list
,
3423 (*numlist
) * nsegs
, mcflags
);
3426 * The number of elements must be even if we are to use an
3427 * mbuf (instead of a cluster) to store the residual data.
3428 * If we couldn't allocate the requested number of mbufs,
3429 * trim the number down (if it's odd) in order to avoid
3430 * creating a partial segment chain.
3432 if (bufsize
> MHLEN
&& (needed
& 0x1))
3435 while (num
< needed
) {
3438 m
= (struct mbuf
*)mp_list
;
3439 mp_list
= mp_list
->obj_next
;
3442 MBUF_INIT(m
, 1, MT_DATA
);
3444 if (mac_init_mbuf(m
, wait
) != 0) {
3448 #endif /* MAC_NET */
3450 if (bufsize
> MHLEN
) {
3451 /* A second mbuf for this segment chain */
3452 m
->m_next
= (struct mbuf
*)mp_list
;
3453 mp_list
= mp_list
->obj_next
;
3454 ASSERT(m
->m_next
!= NULL
);
3456 MBUF_INIT(m
->m_next
, 0, MT_DATA
);
3462 ASSERT(num
!= *numlist
|| mp_list
== NULL
);
3465 mtype_stat_add(MT_DATA
, num
);
3466 mtype_stat_sub(MT_FREE
, num
);
3470 /* We've got them all; return to caller */
3471 if (num
== *numlist
)
3478 * Complex cases where elements are made up of one or more composite
3479 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3480 * be illustrated as follows:
3482 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3484 * Every composite mbuf + cluster element comes from the intermediate
3485 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3486 * the last composite element will come from the MC_MBUF_CL cache,
3487 * unless the residual data is larger than 2KB where we use the
3488 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3489 * data is defined as extra data beyond the first element that cannot
3490 * fit into the previous element, i.e. there is no residual data if
3491 * the chain only has 1 segment.
3493 r_bufsize
= bufsize
;
3494 resid
= packetlen
> bufsize
? packetlen
% bufsize
: 0;
3496 /* There is residual data; figure out the cluster size */
3497 if (wantsize
== 0 && packetlen
> MINCLSIZE
) {
3499 * Caller didn't request that all of the segments
3500 * in the chain use the same cluster size; use the
3501 * smaller of the cluster sizes.
3503 if (njcl
> 0 && resid
> m_maxsize(MC_BIGCL
))
3504 r_bufsize
= m_maxsize(MC_16KCL
);
3505 else if (resid
> m_maxsize(MC_CL
))
3506 r_bufsize
= m_maxsize(MC_BIGCL
);
3508 r_bufsize
= m_maxsize(MC_CL
);
3510 /* Use the same cluster size as the other segments */
3518 * Attempt to allocate composite mbuf + cluster elements for
3519 * the residual data in each chain; record the number of such
3520 * elements that can be allocated so that we know how many
3521 * segment chains we can afford to create.
3523 if (r_bufsize
<= m_maxsize(MC_CL
))
3524 rcp
= m_cache(MC_MBUF_CL
);
3525 else if (r_bufsize
<= m_maxsize(MC_BIGCL
))
3526 rcp
= m_cache(MC_MBUF_BIGCL
);
3528 rcp
= m_cache(MC_MBUF_16KCL
);
3529 needed
= mcache_alloc_ext(rcp
, &rmp_list
, *numlist
, mcflags
);
3534 /* This is temporarily reduced for calculation */
3540 * Attempt to allocate the rest of the composite mbuf + cluster
3541 * elements for the number of segment chains that we need.
3543 if (bufsize
<= m_maxsize(MC_CL
))
3544 cp
= m_cache(MC_MBUF_CL
);
3545 else if (bufsize
<= m_maxsize(MC_BIGCL
))
3546 cp
= m_cache(MC_MBUF_BIGCL
);
3548 cp
= m_cache(MC_MBUF_16KCL
);
3549 needed
= mcache_alloc_ext(cp
, &mp_list
, needed
* nsegs
, mcflags
);
3551 /* Round it down to avoid creating a partial segment chain */
3552 needed
= (needed
/ nsegs
) * nsegs
;
3558 * We're about to construct the chain(s); take into account
3559 * the number of segments we have created above to hold the
3560 * residual data for each chain, as well as restore the
3561 * original count of segments per chain.
3564 needed
+= needed
/ nsegs
;
3571 struct ext_ref
*rfa
;
3576 if (nsegs
== 1 || (num
% nsegs
) != 0 || resid
== 0) {
3577 m
= (struct mbuf
*)mp_list
;
3578 mp_list
= mp_list
->obj_next
;
3580 m
= (struct mbuf
*)rmp_list
;
3581 rmp_list
= rmp_list
->obj_next
;
3584 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3585 VERIFY(m
->m_ext
.ext_free
== NULL
||
3586 m
->m_ext
.ext_free
== m_bigfree
||
3587 m
->m_ext
.ext_free
== m_16kfree
);
3589 cl
= m
->m_ext
.ext_buf
;
3592 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3593 VERIFY(MBUF_IS_COMPOSITE(m
));
3595 flag
= MEXT_FLAGS(m
);
3597 pkthdr
= (nsegs
== 1 || (num
% nsegs
) == 1);
3600 MBUF_INIT(m
, pkthdr
, MT_DATA
);
3601 if (m
->m_ext
.ext_free
== m_16kfree
) {
3602 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3603 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3604 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3606 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3609 if (pkthdr
&& mac_init_mbuf(m
, wait
) != 0) {
3614 #endif /* MAC_NET */
3617 if ((num
% nsegs
) == 0)
3618 np
= &first
->m_nextpkt
;
3627 mtype_stat_add(MT_DATA
, num
);
3628 mtype_stat_sub(MT_FREE
, num
);
3633 /* We've got them all; return to caller */
3634 if (num
== *numlist
) {
3635 ASSERT(mp_list
== NULL
&& rmp_list
== NULL
);
3640 /* Free up what's left of the above */
3641 if (mp_list
!= NULL
)
3642 mcache_free_ext(cp
, mp_list
);
3643 if (rmp_list
!= NULL
)
3644 mcache_free_ext(rcp
, rmp_list
);
3645 if (wantall
&& top
!= NULL
) {
3654 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3655 * packets on receive ring.
3657 __private_extern__
struct mbuf
*
3658 m_getpacket_how(int wait
)
3660 unsigned int num_needed
= 1;
3662 return (m_getpackets_internal(&num_needed
, 1, wait
, 1,
3667 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3668 * packets on receive ring.
3673 unsigned int num_needed
= 1;
3675 return (m_getpackets_internal(&num_needed
, 1, M_WAIT
, 1,
3680 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3681 * if this can't be met, return whatever number were available. Set up the
3682 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
3683 * are chained on the m_nextpkt field. Any packets requested beyond this are
3684 * chained onto the last packet header's m_next field.
3687 m_getpackets(int num_needed
, int num_with_pkthdrs
, int how
)
3689 unsigned int n
= num_needed
;
3691 return (m_getpackets_internal(&n
, num_with_pkthdrs
, how
, 0,
3696 * Return a list of mbuf hdrs set up as packet hdrs chained together
3697 * on the m_nextpkt field
3700 m_getpackethdrs(int num_needed
, int how
)
3703 struct mbuf
**np
, *top
;
3708 while (num_needed
--) {
3709 m
= _M_RETRYHDR(how
, MT_DATA
);
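/*
 * Editor's note: illustrative sketch, not part of the original file, of how
 * a driver might use m_getpackets() as documented above: ask for a batch of
 * cluster-backed packet headers chained on m_nextpkt, and accept fewer than
 * requested.  The ring-fill details and function name are hypothetical.
 */
#if 0	/* illustrative only */
static unsigned int
example_fill_rx_ring(unsigned int slots)
{
	struct mbuf *list, *m;
	unsigned int filled = 0;

	/* all packets get pkthdrs; may return a shorter list than asked */
	list = m_getpackets((int)slots, (int)slots, M_DONTWAIT);
	while (list != NULL) {
		m = list;
		list = list->m_nextpkt;
		m->m_nextpkt = NULL;
		/* ... hand m to a hardware receive descriptor ... */
		filled++;
	}
	return (filled);
}
#endif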
3721 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
3722 * for mbufs packets freed. Used by the drivers.
3725 m_freem_list(struct mbuf
*m
)
3727 struct mbuf
*nextpkt
;
3728 mcache_obj_t
*mp_list
= NULL
;
3729 mcache_obj_t
*mcl_list
= NULL
;
3730 mcache_obj_t
*mbc_list
= NULL
;
3731 mcache_obj_t
*m16k_list
= NULL
;
3732 mcache_obj_t
*m_mcl_list
= NULL
;
3733 mcache_obj_t
*m_mbc_list
= NULL
;
3734 mcache_obj_t
*m_m16k_list
= NULL
;
3735 mcache_obj_t
*ref_list
= NULL
;
3737 int mt_free
= 0, mt_data
= 0, mt_header
= 0, mt_soname
= 0, mt_tag
= 0;
3742 nextpkt
= m
->m_nextpkt
;
3743 m
->m_nextpkt
= NULL
;
3746 struct mbuf
*next
= m
->m_next
;
3747 mcache_obj_t
*o
, *rfa
;
3748 u_int32_t refcnt
, flags
;
3750 if (m
->m_type
== MT_FREE
)
3751 panic("m_free: freeing an already freed mbuf");
3753 if (m
->m_type
!= MT_FREE
)
3756 if (m
->m_flags
& M_PKTHDR
) {
3757 m_tag_delete_chain(m
, NULL
);
3760 if (!(m
->m_flags
& M_EXT
))
3763 o
= (mcache_obj_t
*)m
->m_ext
.ext_buf
;
3764 refcnt
= m_decref(m
);
3765 flags
= MEXT_FLAGS(m
);
3766 if (refcnt
== 0 && flags
== 0) {
3767 if (m
->m_ext
.ext_free
== NULL
) {
3768 o
->obj_next
= mcl_list
;
3770 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3771 o
->obj_next
= mbc_list
;
3773 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
3774 o
->obj_next
= m16k_list
;
3777 (*(m
->m_ext
.ext_free
))((caddr_t
)o
,
3781 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
3782 rfa
->obj_next
= ref_list
;
3785 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
3786 VERIFY(m
->m_type
!= MT_FREE
);
3788 * Amortize the costs of atomic operations
3789 * by doing them at the end, if possible.
3791 if (m
->m_type
== MT_DATA
)
3793 else if (m
->m_type
== MT_HEADER
)
3795 else if (m
->m_type
== MT_SONAME
)
3797 else if (m
->m_type
== MT_TAG
)
3800 mtype_stat_dec(m
->m_type
);
3802 m
->m_type
= MT_FREE
;
3805 m
->m_next
= m
->m_nextpkt
= NULL
;
3807 /* "Free" into the intermediate cache */
3808 o
= (mcache_obj_t
*)m
;
3809 if (m
->m_ext
.ext_free
== NULL
) {
3810 o
->obj_next
= m_mcl_list
;
3812 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3813 o
->obj_next
= m_mbc_list
;
3816 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3817 o
->obj_next
= m_m16k_list
;
3825 * Amortize the costs of atomic operations
3826 * by doing them at the end, if possible.
3828 if (m
->m_type
== MT_DATA
)
3830 else if (m
->m_type
== MT_HEADER
)
3832 else if (m
->m_type
== MT_SONAME
)
3834 else if (m
->m_type
== MT_TAG
)
3836 else if (m
->m_type
!= MT_FREE
)
3837 mtype_stat_dec(m
->m_type
);
3839 m
->m_type
= MT_FREE
;
3840 m
->m_flags
= m
->m_len
= 0;
3841 m
->m_next
= m
->m_nextpkt
= NULL
;
3843 ((mcache_obj_t
*)m
)->obj_next
= mp_list
;
3844 mp_list
= (mcache_obj_t
*)m
;
3853 mtype_stat_add(MT_FREE
, mt_free
);
3855 mtype_stat_sub(MT_DATA
, mt_data
);
3857 mtype_stat_sub(MT_HEADER
, mt_header
);
3859 mtype_stat_sub(MT_SONAME
, mt_soname
);
3861 mtype_stat_sub(MT_TAG
, mt_tag
);
3863 if (mp_list
!= NULL
)
3864 mcache_free_ext(m_cache(MC_MBUF
), mp_list
);
3865 if (mcl_list
!= NULL
)
3866 mcache_free_ext(m_cache(MC_CL
), mcl_list
);
3867 if (mbc_list
!= NULL
)
3868 mcache_free_ext(m_cache(MC_BIGCL
), mbc_list
);
3869 if (m16k_list
!= NULL
)
3870 mcache_free_ext(m_cache(MC_16KCL
), m16k_list
);
3871 if (m_mcl_list
!= NULL
)
3872 mcache_free_ext(m_cache(MC_MBUF_CL
), m_mcl_list
);
3873 if (m_mbc_list
!= NULL
)
3874 mcache_free_ext(m_cache(MC_MBUF_BIGCL
), m_mbc_list
);
3875 if (m_m16k_list
!= NULL
)
3876 mcache_free_ext(m_cache(MC_MBUF_16KCL
), m_m16k_list
);
3877 if (ref_list
!= NULL
)
3878 mcache_free_ext(ref_cache
, ref_list
);
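/*
 * Editor's note: illustrative sketch, not part of the original file.
 * m_freem_list() frees a batch of packets linked through m_nextpkt in one
 * pass, which is why drivers accumulate completed transmit packets into such
 * a list instead of calling m_freem() once per packet.  The function name
 * is hypothetical.
 */
#if 0	/* illustrative only */
static void
example_complete_tx(struct mbuf *done_pkts[], int count)
{
	struct mbuf *head = NULL, **tail = &head;
	int i;

	for (i = 0; i < count; i++) {
		*tail = done_pkts[i];		/* chain on m_nextpkt */
		tail = &done_pkts[i]->m_nextpkt;
	}
	*tail = NULL;
	if (head != NULL)
		(void) m_freem_list(head);	/* count of mbufs freed */
}
#endif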
3884 m_freem(struct mbuf *m)
3891 * Mbuffer utility routines.
3895 * Compute the amount of space available before the current start
3896 * of data in an mbuf.
3899 m_leadingspace(struct mbuf *m)
3901 if (m->m_flags & M_EXT) {
3902 if (MCLHASREFERENCE(m))
3904 return (m->m_data - m->m_ext.ext_buf);
3906 if (m->m_flags & M_PKTHDR)
3907 return (m->m_data - m->m_pktdat);
3908 return (m->m_data - m->m_dat);
3912 * Compute the amount of space available after the end of data in an mbuf.
3915 m_trailingspace(struct mbuf *m)
3917 if (m->m_flags & M_EXT) {
3918 if (MCLHASREFERENCE(m))
3920 return (m->m_ext.ext_buf + m->m_ext.ext_size -
3921 (m->m_data + m->m_len));
3923 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
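/*
 * Editor's note: illustrative sketch, not part of the original file, of how
 * m_trailingspace() gates in-place edits: append into existing room at the
 * end of the mbuf when there is enough, otherwise tell the caller to grow
 * the chain.  The function name is hypothetical.
 */
#if 0	/* illustrative only */
static int
example_append_bytes(struct mbuf *m, caddr_t src, int len)
{
	if (m_trailingspace(m) < len)
		return (0);		/* caller must grow the chain */
	bcopy(src, MTOD(m, caddr_t) + m->m_len, (unsigned)len);
	m->m_len += len;
	return (1);
}
#endif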
3927 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
3928 * copy junk along. Does not adjust packet header length.
3931 m_prepend(struct mbuf *m, int len, int how)
3935 _MGET(mn, how, m->m_type);
3940 if (m->m_flags & M_PKTHDR) {
3941 M_COPY_PKTHDR(mn, m);
3942 m->m_flags &= ~M_PKTHDR;
3953 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3954 * chain, copy junk along, and adjust length.
3957 m_prepend_2(struct mbuf *m, int len, int how)
3959 if (M_LEADINGSPACE(m) >= len) {
3963 m = m_prepend(m, len, how);
3965 if ((m) && (m->m_flags & M_PKTHDR))
3966 m->m_pkthdr.len += len;
3971 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
3972 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
3973 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
3978 m_copym(struct mbuf
*m
, int off0
, int len
, int wait
)
3980 struct mbuf
*n
, *mhdr
= NULL
, **np
;
3985 if (off
< 0 || len
< 0)
3986 panic("m_copym: invalid offset %d or len %d", off
, len
);
3988 if (off
== 0 && (m
->m_flags
& M_PKTHDR
)) {
3993 while (off
>= m
->m_len
) {
3994 if (m
->m_next
== NULL
)
3995 panic("m_copym: invalid mbuf chain");
4004 if (len
!= M_COPYALL
)
4005 panic("m_copym: len != M_COPYALL");
4009 n
= _M_RETRY(wait
, m
->m_type
);
4016 M_COPY_PKTHDR(n
, mhdr
);
4017 if (len
== M_COPYALL
)
4018 n
->m_pkthdr
.len
-= off0
;
4020 n
->m_pkthdr
.len
= len
;
4023 if (len
== M_COPYALL
) {
4024 if (MIN(len
, (m
->m_len
- off
)) == len
) {
4025 printf("m->m_len %d - off %d = %d, %d\n",
4026 m
->m_len
, off
, m
->m_len
- off
,
4027 MIN(len
, (m
->m_len
- off
)));
4030 n
->m_len
= MIN(len
, (m
->m_len
- off
));
4031 if (n
->m_len
== M_COPYALL
) {
4032 printf("n->m_len == M_COPYALL, fixing\n");
4035 if (m
->m_flags
& M_EXT
) {
4036 n
->m_ext
= m
->m_ext
;
4038 n
->m_data
= m
->m_data
+ off
;
4039 n
->m_flags
|= M_EXT
;
4041 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
4042 (unsigned)n
->m_len
);
4044 if (len
!= M_COPYALL
)
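/*
 * Editor's note: illustrative sketch, not part of the original file.
 * m_copym() produces a copy that shares cluster storage with the source
 * (note the M_EXT branch above), so it is the cheap way to retransmit or
 * inspect part of a chain without duplicating the data.  The function name
 * is hypothetical.
 */
#if 0	/* illustrative only */
static struct mbuf *
example_copy_prefix(struct mbuf *m, int len)
{
	/* copy the first len bytes; M_COPYALL would copy the whole chain */
	return (m_copym(m, 0, len, M_DONTWAIT));	/* NULL if out of mbufs */
}
#endif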
4063 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4064 * within this routine also, the last mbuf and offset accessed are passed
4065 * out and can be passed back in to avoid having to rescan the entire mbuf
4066 * list (normally hung off of the socket)
4069 m_copym_with_hdrs(struct mbuf
*m
, int off0
, int len0
, int wait
,
4070 struct mbuf
**m_last
, int *m_off
)
4072 struct mbuf
*n
, **np
= NULL
;
4073 int off
= off0
, len
= len0
;
4074 struct mbuf
*top
= NULL
;
4075 int mcflags
= MSLEEPF(wait
);
4078 mcache_obj_t
*list
= NULL
;
4081 if (off
== 0 && (m
->m_flags
& M_PKTHDR
))
4084 if (*m_last
!= NULL
) {
4088 while (off
>= m
->m_len
) {
4098 len
-= MIN(len
, (n
->m_len
- ((needed
== 1) ? off
: 0)));
4105 * If the caller doesn't want to be put to sleep, mark it with
4106 * MCR_TRYHARD so that we may reclaim buffers from other places
4109 if (mcflags
& MCR_NOSLEEP
)
4110 mcflags
|= MCR_TRYHARD
;
4112 if (mcache_alloc_ext(m_cache(MC_MBUF
), &list
, needed
,
4118 n
= (struct mbuf
*)list
;
4119 list
= list
->obj_next
;
4120 ASSERT(n
!= NULL
&& m
!= NULL
);
4122 type
= (top
== NULL
) ? MT_HEADER
: m
->m_type
;
4123 MBUF_INIT(n
, (top
== NULL
), type
);
4125 if (top
== NULL
&& mac_mbuf_label_init(n
, wait
) != 0) {
4126 mtype_stat_inc(MT_HEADER
);
4127 mtype_stat_dec(MT_FREE
);
4131 #endif /* MAC_NET */
4143 M_COPY_PKTHDR(n
, m
);
4144 n
->m_pkthdr
.len
= len
;
4147 n
->m_len
= MIN(len
, (m
->m_len
- off
));
4149 if (m
->m_flags
& M_EXT
) {
4150 n
->m_ext
= m
->m_ext
;
4152 n
->m_data
= m
->m_data
+ off
;
4153 n
->m_flags
|= M_EXT
;
4155 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
4156 (unsigned)n
->m_len
);
4161 if ((off
+ n
->m_len
) == m
->m_len
) {
4162 *m_last
= m
->m_next
;
4166 *m_off
= off
+ n
->m_len
;
4175 mtype_stat_inc(MT_HEADER
);
4176 mtype_stat_add(type
, needed
);
4177 mtype_stat_sub(MT_FREE
, needed
+ 1);
4179 ASSERT(list
== NULL
);
4184 mcache_free_ext(m_cache(MC_MBUF
), list
);
4192 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4193 * continuing for "len" bytes, into the indicated buffer.
4196 m_copydata(struct mbuf *m, int off, int len, void *vp)
4201 if (off < 0 || len < 0)
4202 panic("m_copydata: invalid offset %d or len %d", off, len);
4206 panic("m_copydata: invalid mbuf chain");
4214 panic("m_copydata: invalid mbuf chain");
4215 count = MIN(m->m_len - off, len);
4216 bcopy(MTOD(m, caddr_t) + off, cp, count);
4225 * Concatenate mbuf chain n to m. Both chains must be of the same type
4226 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4229 m_cat(struct mbuf *m, struct mbuf *n)
4234 if ((m->m_flags & M_EXT) ||
4235 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4236 /* just join the two chains */
4240 /* splat the data from one into the other */
4241 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4243 m->m_len += n->m_len;
m_adj(struct mbuf *mp, int req_len)
    if ((m = mp) == NULL)
        while (m != NULL && len > 0) {
            if (m->m_len <= len) {
        if (m->m_flags & M_PKTHDR)
            m->m_pkthdr.len -= (req_len - len);
        /*
         * Trim from tail.  Scan the mbuf chain,
         * calculating its length and finding the last mbuf.
         * If the adjustment only affects this mbuf, then just
         * adjust and return.  Otherwise, rescan and truncate
         * after the remaining size.
         */
        if (m->m_next == (struct mbuf *)0)
        if (m->m_len >= len) {
            if (m->m_flags & M_PKTHDR)
                m->m_pkthdr.len -= len;
        /*
         * Correct length for chain is "count".
         * Find the mbuf with last data, adjust its length,
         * and toss data from remaining mbufs on chain.
         */
        if (m->m_flags & M_PKTHDR)
            m->m_pkthdr.len = count;
        for (; m; m = m->m_next) {
            if (m->m_len >= count) {
        while ((m = m->m_next))
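/*
 * Illustrative sketch only (not part of the original file): strip a
 * hypothetical 14-byte link-layer header from the front of a packet and a
 * 4-byte trailer from the end using m_adj().  Positive lengths trim from
 * the head, negative lengths trim from the tail; m_pkthdr.len is kept in
 * sync by m_adj() itself when M_PKTHDR is set.
 */
#if 0
static void
adj_example(struct mbuf *m)
{
    m_adj(m, 14);       /* drop 14 bytes from the front */
    m_adj(m, -4);       /* drop 4 bytes from the back */
}
#endif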
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns null on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
m_pullup(struct mbuf *n, int len)
    /*
     * If first mbuf has no cluster, and has room for len bytes
     * without shifting current data, pullup into it,
     * otherwise allocate a new mbuf to prepend to the chain.
     */
    if ((n->m_flags & M_EXT) == 0 &&
        n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
        if (n->m_len >= len)
        _MGET(m, M_DONTWAIT, n->m_type);
        if (n->m_flags & M_PKTHDR) {
            M_COPY_PKTHDR(m, n);
            n->m_flags &= ~M_PKTHDR;
    space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
        count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
        bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
    } while (len > 0 && n);
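/*
 * Illustrative sketch only (not part of the original file): the classic
 * pullup-before-cast pattern.  "struct example_hdr" is a hypothetical
 * protocol header; the point is that mtod() may only be used to read a
 * structure when that many bytes are contiguous in the first mbuf.
 */
#if 0
struct example_hdr {
    u_int16_t eh_type;
    u_int16_t eh_len;
};

static struct mbuf *
pullup_example(struct mbuf *m)
{
    struct example_hdr *eh;

    if (m->m_len < (int)sizeof (*eh) &&
        (m = m_pullup(m, sizeof (*eh))) == NULL)
        return (NULL);  /* m_pullup() freed the chain on failure */

    eh = mtod(m, struct example_hdr *);
    /* ... use eh->eh_type, eh->eh_len ... */
    return (m);
}
#endif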
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
m_split(struct mbuf *m0, int len0, int wait)
    return (m_split0(m0, len0, wait, 1));

static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
    unsigned len = len0, remain;

    for (m = m0; m && len > m->m_len; m = m->m_next)
    remain = m->m_len - len;
    if (copyhdr && (m0->m_flags & M_PKTHDR)) {
        _MGETHDR(n, wait, m0->m_type);
        n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
        n->m_pkthdr.len = m0->m_pkthdr.len - len0;
        m0->m_pkthdr.len = len0;
        if (m->m_flags & M_EXT)
        if (remain > MHLEN) {
            /* m can't be the lead packet */
            n->m_next = m_split(m, len, wait);
            if (n->m_next == NULL) {
            MH_ALIGN(n, remain);
    } else if (remain == 0) {
        _MGET(n, wait, m->m_type);
    if (m->m_flags & M_EXT) {
        n->m_flags |= M_EXT;
        n->m_ext = m->m_ext;
        n->m_data = m->m_data + len;
        bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
    n->m_next = m->m_next;
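/*
 * Illustrative sketch only (not part of the original file): split a packet
 * into a 128-byte head and a tail holding the remainder.  On success the
 * original chain is truncated to 128 bytes (its m_pkthdr.len adjusted) and
 * the returned chain holds the rest; on failure NULL is returned and, per
 * the comment above, the routine attempts to restore the original chain,
 * so the caller still owns it.  The 128-byte split point is arbitrary.
 */
#if 0
static void
split_example(struct mbuf *m)
{
    struct mbuf *tail;

    tail = m_split(m, 128, M_DONTWAIT);
    if (tail == NULL) {
        m_freem(m);
        return;
    }
    /* ... m now holds bytes [0, 128), tail holds the remainder ... */
    m_freem(m);
    m_freem(tail);
}
#endif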
/*
 * Routine to copy from device local memory into mbufs.
 */
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
    struct mbuf *top = NULL, **mp = &top;
    int off = off0, len;

        /*
         * If 'off' is non-zero, packet is trailer-encapsulated,
         * so we have to skip the type and length fields.
         */
        cp += off + 2 * sizeof (u_int16_t);
        totlen -= 2 * sizeof (u_int16_t);
    _MGETHDR(m, M_DONTWAIT, MT_DATA);
    m->m_pkthdr.rcvif = ifp;
    m->m_pkthdr.len = totlen;
    while (totlen > 0) {
            _MGET(m, M_DONTWAIT, MT_DATA);
        len = MIN(totlen, epkt - cp);
        if (len >= MINCLSIZE) {
            MCLGET(m, M_DONTWAIT);
            if (m->m_flags & M_EXT) {
                m->m_len = len = MIN(len, m_maxsize(MC_CL));
                /* give up when it's out of cluster mbufs */
            /*
             * Place initial small packet/header at end of mbuf.
             */
            if (len < m->m_len) {
                    len + max_linkhdr <= m->m_len)
                    m->m_data += max_linkhdr;
            copy(cp, MTOD(m, caddr_t), (unsigned)len);
            bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
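/*
 * Illustrative sketch only (not part of the original file): a hypothetical
 * driver receive path that uses m_devget() to copy a just-received frame
 * out of device-local memory into a fresh mbuf chain.  "rxbuf", "rxlen"
 * and "ifp" are assumed names; passing a NULL copy routine makes
 * m_devget() fall back to bcopy(), per the branches above.
 */
#if 0
static struct mbuf *
devget_example(char *rxbuf, int rxlen, struct ifnet *ifp)
{
    /* off0 == 0: the frame is not trailer-encapsulated */
    return (m_devget(rxbuf, rxlen, 0, ifp, NULL));
}
#endif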
mbuf_growth_aggressive(void)
    lck_mtx_lock(mbuf_mlock);
    /*
     * Don't start to grow the pool until at least 1/2 (50%)
     * of the current total capacity is in use.
     */
    mbuf_gscale = MB_GROWTH_AGGRESSIVE;
    lck_mtx_unlock(mbuf_mlock);

mbuf_growth_normal(void)
    lck_mtx_lock(mbuf_mlock);
    /*
     * Don't start to grow the pool until at least 15/16 (93.75%)
     * of the current total capacity is in use.
     */
    mbuf_gscale = MB_GROWTH_NORMAL;
    lck_mtx_unlock(mbuf_mlock);
/*
 * Cluster freelist allocation check.
 */
m_howmany(int num, size_t bufsize)
    u_int32_t m_clusters, m_bigclusters, m_16kclusters;
    u_int32_t m_clfree, m_bigclfree, m_16kclfree;
    u_int32_t s = mbuf_gscale;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    m_clusters = m_total(MC_CL);
    m_bigclusters = m_total(MC_BIGCL);
    m_16kclusters = m_total(MC_16KCL);
    m_clfree = m_infree(MC_CL);
    m_bigclfree = m_infree(MC_BIGCL);
    m_16kclfree = m_infree(MC_16KCL);

    /* Bail if we've maxed out the mbuf memory map */
    if ((bufsize != m_maxsize(MC_16KCL) &&
        (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
        (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
        (m_16kclusters << 3) >= njcl)) {
        if (bufsize == MCLBYTES && num > m_clfree) {
            printf("m_howmany - out of small clusters, "
                "%d short\n", num - mbstat.m_clfree);
    if (bufsize == m_maxsize(MC_CL)) {
        if (m_clusters < MINCL)
            return (MINCL - m_clusters);
        /* Too few (free < threshold) and not over maximum */
        if (m_clusters < m_maxlimit(MC_CL)) {
            if (m_clfree >= MCL_LOWAT)
            if (num >= m_clfree)
            if (((m_clusters + num) >> s) > m_clfree)
                j = ((m_clusters + num) >> s) - m_clfree;
            if (i + m_clusters >= m_maxlimit(MC_CL))
                i = m_maxlimit(MC_CL) - m_clusters;
        VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
    } else if (bufsize == m_maxsize(MC_BIGCL)) {
        if (m_bigclusters < MINBIGCL)
            return (MINBIGCL - m_bigclusters);
        /* Too few (free < 1/16 total) and not over maximum */
        if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
            if (m_bigclfree >= MBIGCL_LOWAT)
            if (num >= m_bigclfree)
                i = num - m_bigclfree;
            if (((m_bigclusters + num) >> 4) > m_bigclfree)
                j = ((m_bigclusters + num) >> 4) - m_bigclfree;
            if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
                i = m_maxlimit(MC_BIGCL) - m_bigclusters;
        VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
        if (m_16kclusters < MIN16KCL)
            return (MIN16KCL - m_16kclusters);
        /* Too few (free < 1/16 total) and not over maximum */
        if (m_16kclusters < m_maxlimit(MC_16KCL)) {
            if (m_16kclfree >= M16KCL_LOWAT)
            if (num >= m_16kclfree)
                i = num - m_16kclfree;
            if (((m_16kclusters + num) >> 4) > m_16kclfree)
                j = ((m_16kclusters + num) >> 4) - m_16kclfree;
            if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
                i = m_maxlimit(MC_16KCL) - m_16kclusters;
        VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
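/*
 * Illustrative arithmetic only (not part of the original file), using
 * hypothetical numbers.  The big-cluster branch above grows the pool when
 * ((m_bigclusters + num) >> 4) > m_bigclfree: with 1024 big clusters in
 * total and a request for 32 more, the threshold is (1024 + 32) >> 4 = 66,
 * so additional clusters are requested only if fewer than 66 are free, and
 * then by roughly the shortfall (clamped to m_maxlimit(MC_BIGCL)).
 */
#if 0
static u_int32_t
howmany_threshold_example(u_int32_t total, u_int32_t num)
{
    /* e.g. (1024 + 32) >> 4 == 66 */
    return ((total + num) >> 4);
}
#endif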
/*
 * Return the number of bytes in the mbuf chain, m.
 */
m_length(struct mbuf *m)
    unsigned int pktlen;

    if (m->m_flags & M_PKTHDR)
        return (m->m_pkthdr.len);
    for (m0 = m; m0 != NULL; m0 = m0->m_next)
        pktlen += m0->m_len;
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
    struct mbuf *origm = m0;

    m_copyback0(&m0, off, len, cp,
        M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
    if (error != 0 || (m0 != NULL && origm != m0))
        panic("m_copyback");

m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
    /* don't support chain expansion */
    VERIFY(off + len <= m_length(m0));

    error = m_copyback0(&m0, off, len, cp,
        M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
        /*
         * no way to recover from partial success.
         * just free the chain.
         */
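/*
 * Illustrative sketch only (not part of the original file): overwrite two
 * bytes at a fixed offset in a chain with m_copyback(), which walks the
 * chain and extends it if the range lies past the current end.  The offset
 * (20) and the value are arbitrary example numbers.
 */
#if 0
static void
copyback_example(struct mbuf *m)
{
    u_int16_t val = 0;

    m_copyback(m, 20, sizeof (val), &val);
}
#endif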
/*
 * m_makewritable: ensure the specified range is writable.
 */
m_makewritable(struct mbuf **mp, int off, int len, int how)
    int origlen, reslen;

    origlen = m_length(*mp);

#if 0 /* M_COPYALL is large enough */
    if (len == M_COPYALL)
        len = m_length(*mp) - off; /* XXX */

    error = m_copyback0(mp, off, len, NULL,
        M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

    for (n = *mp; n; n = n->m_next)
    if (origlen != reslen)
        panic("m_makewritable: length changed");
    if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
        panic("m_makewritable: inconsist");
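/*
 * Illustrative sketch only (not part of the original file): make the first
 * 40 bytes of a chain safe to modify in place.  If any of those bytes live
 * in shared (read-only) cluster storage, m_makewritable() replaces them
 * with writable storage via the copy-on-write path in m_copyback0(); the
 * head of the chain may change, hence the struct mbuf ** argument.  The
 * 40-byte length is an arbitrary example value.
 */
#if 0
static int
makewritable_example(struct mbuf **mp)
{
    int error;

    error = m_makewritable(mp, 0, 40, M_DONTWAIT);
    if (error != 0)
        return (error);     /* propagate failure to the caller */
    /* ... safe to scribble on the first 40 bytes of *mp ... */
    return (0);
}
#endif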
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    const char *cp = vp;

    VERIFY(mp0 != NULL);
    VERIFY(*mp0 != NULL);
    VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
    VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

    /*
     * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
     * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
     */
    VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);

    while (off > (mlen = m->m_len)) {
        if (m->m_next == NULL) {
            if (!(flags & M_COPYBACK0_EXTEND))
            /*
             * try to make some space at the end of "m".
             */
            if (off + len >= MINCLSIZE &&
                !(m->m_flags & M_EXT) && m->m_len == 0) {
            tspace = M_TRAILINGSPACE(m);
                tspace = MIN(tspace, off + len);
                bzero(mtod(m, char *) + m->m_len,
            /*
             * need to allocate an mbuf.
             */
            if (off + len >= MINCLSIZE) {
                n = m_getcl(how, m->m_type, 0);
                n = _M_GET(how, m->m_type);
            n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
            bzero(mtod(n, char *), MIN(n->m_len, off));
        mlen = m->m_len - off;
        if (mlen != 0 && m_mclhasreference(m)) {
            /*
             * this mbuf is read-only.
             * allocate a new writable mbuf and try again.
             */
#if defined(DIAGNOSTIC)
            if (!(flags & M_COPYBACK0_COW))
                panic("m_copyback0: read-only");
#endif /* defined(DIAGNOSTIC) */
            /*
             * if we're going to write into the middle of
             * an mbuf, split it first.
             */
            if (off > 0 && len < mlen) {
                n = m_split0(m, off, how, 0);
            /*
             * XXX TODO coalesce into the trailingspace of
             * the previous mbuf when possible.
             */
            /*
             * allocate a new mbuf.  copy packet header if needed.
             */
            n = _M_GET(how, m->m_type);
            if (off == 0 && (m->m_flags & M_PKTHDR)) {
                M_COPY_PKTHDR(n, m);
            if (len >= MINCLSIZE)
                MCLGET(n, M_DONTWAIT);
                (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
            /*
             * free the region which has been overwritten,
             * copying data from the old mbufs if requested.
             */
            if (flags & M_COPYBACK0_PRESERVE)
                datap = mtod(n, char *);
            VERIFY(off == 0 || eatlen >= mlen);
                VERIFY(len >= mlen);
                m_copydata(m, off, mlen, datap);
            while (m != NULL && m_mclhasreference(m) &&
                n->m_type == m->m_type && eatlen > 0) {
                mlen = MIN(eatlen, m->m_len);
                    m_copydata(m, 0, mlen, datap);
                *mp = m = m_free(m);
        mlen = MIN(mlen, len);
        if (flags & M_COPYBACK0_COPYBACK) {
            bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
        if (m->m_next == NULL) {
    if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
        VERIFY(flags & M_COPYBACK0_EXTEND);
        m->m_pkthdr.len = totlen;
mcl_to_paddr(char *addr)
    vm_offset_t base_phys;

    if (!MBUF_IN_MAP(addr))
    base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
    return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
/*
 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
 * And really copy the thing.  That way, we don't "precompute" checksums
 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
 * don't take up too much room in the sockbuf (cf. sbspace()).
 */
m_dup(struct mbuf *m, int how)
    struct mbuf *n, **np;

    if (m->m_flags & M_PKTHDR)
    /*
     * Quick check: if we have one mbuf and its data fits in an
     * mbuf with packet header, just copy and go.
     */
    if (m->m_next == NULL) {
        /* Then just move the data into an mbuf and be done... */
        if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
            if ((n = _M_GETHDR(how, m->m_type)) == NULL)
            n->m_len = m->m_len;
            m_dup_pkthdr(n, m, how);
            bcopy(m->m_data, n->m_data, m->m_len);
        } else if (m->m_len <= MLEN) {
            if ((n = _M_GET(how, m->m_type)) == NULL)
            bcopy(m->m_data, n->m_data, m->m_len);
            n->m_len = m->m_len;
        kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
            n = _M_GETHDR(how, m->m_type);
            n = _M_GET(how, m->m_type);
        if (m->m_flags & M_EXT) {
            if (m->m_len <= m_maxsize(MC_CL))
            else if (m->m_len <= m_maxsize(MC_BIGCL))
                n = m_mbigget(n, how);
            else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
                n = m_m16kget(n, how);
            if (!(n->m_flags & M_EXT)) {
            /* Don't use M_COPY_PKTHDR: preserve m_data */
            m_dup_pkthdr(n, m, how);
            if (!(n->m_flags & M_EXT))
                n->m_data = n->m_pktdat;
        n->m_len = m->m_len;
        /*
         * Get the dup on the same bdry as the original.
         * Assume that the two mbufs have the same offset to data area
         * (up to word boundaries).
         */
        bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
        kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
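/*
 * Illustrative sketch only (not part of the original file): take a private,
 * fully deep copy of a packet with m_dup() so it can be modified without
 * touching cluster storage shared with the original (as opposed to
 * m_copym(), which, as shown earlier in this file, shares clusters by
 * reference via n->m_ext = m->m_ext).
 */
#if 0
static struct mbuf *
dup_example(struct mbuf *m)
{
    struct mbuf *copy;

    copy = m_dup(m, M_DONTWAIT);
    if (copy == NULL)
        return (NULL);      /* allocation failed; original untouched */
    return (copy);
}
#endif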
#define MBUF_MULTIPAGES(m)                                              \
    (((m)->m_flags & M_EXT) &&                                          \
    ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||          \
    (!IS_P2ALIGNED((m)->m_data, NBPG) &&                                \
    P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))

static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
    struct mbuf *top = NULL;
    struct mbuf **nm = &top;
    uintptr_t data0, data;
    unsigned int len0, len;

    VERIFY(MBUF_MULTIPAGES(m));
    VERIFY(m->m_next == NULL);
    data0 = (uintptr_t)m->m_data;

    if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
    else if (!IS_P2ALIGNED(data, NBPG) &&
        P2ROUNDUP(data, NBPG) < (data + len0))
        len = P2ROUNDUP(data, NBPG) - data;

    VERIFY(m->m_flags & M_EXT);
    m->m_data = (void *)data;

    n = _M_RETRY(M_DONTWAIT, MT_DATA);

    n->m_ext = m->m_ext;
    n->m_flags |= M_EXT;
m_normalize(struct mbuf *m)
    struct mbuf *top = NULL;
    struct mbuf **nm = &top;
    boolean_t expanded = FALSE;

        /* Does the data cross one or more page boundaries? */
        if (MBUF_MULTIPAGES(m)) {
            if ((m = m_expand(m, &last)) == NULL) {
        atomic_add_32(&mb_normalized, 1);
m_mchtype(struct mbuf *m, int t)
    mtype_stat_dec(m->m_type);

m_mtod(struct mbuf *m)
    return (MTOD(m, void *));

    return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));

m_mcheck(struct mbuf *m)
/*
 * Inform the corresponding mcache(s) that there's a waiter below.
 */
mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
    mcache_waiter_inc(m_cache(class));
        if (class == MC_CL) {
            mcache_waiter_inc(m_cache(MC_MBUF_CL));
        } else if (class == MC_BIGCL) {
            mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
        } else if (class == MC_16KCL) {
            mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
            mcache_waiter_inc(m_cache(MC_MBUF_CL));
            mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));

/*
 * Inform the corresponding mcache(s) that there's no more waiter below.
 */
mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
    mcache_waiter_dec(m_cache(class));
        if (class == MC_CL) {
            mcache_waiter_dec(m_cache(MC_MBUF_CL));
        } else if (class == MC_BIGCL) {
            mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
        } else if (class == MC_16KCL) {
            mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
            mcache_waiter_dec(m_cache(MC_MBUF_CL));
            mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU cache layer and the allocation should be
 * retried at that level.
 */
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
    boolean_t mcache_retry = FALSE;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    /* Check if there's anything at the cache layer */
    if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;

    /* Nothing?  Then try hard to get it from somewhere */
    m_reclaim(class, num, (wait & MCR_COMP));

    /* We tried hard and got something? */
    if (m_infree(class) > 0) {
    } else if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;
    } else if (wait & MCR_TRYHARD) {
        mcache_retry = TRUE;

    /*
     * There's really nothing for us right now; inform the
     * cache(s) that there is a waiter below and go to sleep.
     */
    mbuf_waiter_inc(class, (wait & MCR_COMP));

    VERIFY(!(wait & MCR_NOSLEEP));
    (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);

    /* We are now up; stop getting notified until next round */
    mbuf_waiter_dec(class, (wait & MCR_COMP));

    /* We waited and got something */
    if (m_infree(class) > 0) {
    } else if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;

    return (mcache_retry);
mbuf_worker_thread(void)
    lck_mtx_lock(mbuf_mlock);

    if (mbuf_expand_mcl) {
        /* Adjust to the current number of clusters in use */
        n = mbuf_expand_mcl -
            (m_total(MC_CL) - m_infree(MC_CL));
        if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
            n = m_maxlimit(MC_CL) - m_total(MC_CL);
        mbuf_expand_mcl = 0;
        if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
    if (mbuf_expand_big) {
        /* Adjust to the current number of 4 KB clusters in use */
        n = mbuf_expand_big -
            (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
        if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
            n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
        mbuf_expand_big = 0;
        if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
    if (mbuf_expand_16k) {
        /* Adjust to the current number of 16 KB clusters in use */
        n = mbuf_expand_16k -
            (m_total(MC_16KCL) - m_infree(MC_16KCL));
        if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
            n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
        mbuf_expand_16k = 0;
        (void) freelist_populate(MC_16KCL, n, M_WAIT);

    /*
     * Because we can run out of memory before filling the mbuf
     * map, we should not allocate more clusters than there are
     * mbufs -- otherwise we could have a large number of useless
     * clusters allocated.
     */
    while (m_total(MC_MBUF) <
        (m_total(MC_BIGCL) + m_total(MC_CL))) {
        if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)

    lck_mtx_unlock(mbuf_mlock);

    assert_wait(&mbuf_worker_run, THREAD_UNINT);
    (void) thread_block((thread_continue_t)mbuf_worker_thread);

mbuf_worker_thread_init(void)
    mbuf_worker_ready++;
    mbuf_worker_thread();
    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    VERIFY(MBUF_IN_MAP(buf));
    ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
    VERIFY(ix < maxslabgrp);

    if ((slg = slabstbl[ix]) == NULL) {
        /*
         * In the current implementation, we never shrink the memory
         * pool (hence the cluster map); if we attempt to reallocate
         * a cluster group when it's already allocated, panic since
         * this is a sign of memory corruption (slabstbl[ix] got
         * nullified).  This also means that there shouldn't be any
         * hole in the kernel sub-map for the mbuf pool.
         */
        VERIFY(ix < slabgrp);
        /*
         * Slab expansion can only be done single threaded; when
         * we get here, it must be as a result of m_clalloc() which
         * is serialized and therefore mb_clalloc_busy must be set.
         */
        VERIFY(mb_clalloc_busy);
        lck_mtx_unlock(mbuf_mlock);

        /* This is a new buffer; create the slabs group for it */
        MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
        VERIFY(slg != NULL);

        lck_mtx_lock(mbuf_mlock);
        /*
         * No other thread could have gone into m_clalloc() after
         * we dropped the lock above, so verify that it's true.
         */
        VERIFY(mb_clalloc_busy);

        /* Chain each slab in the group to its forward neighbor */
        for (k = 1; k < NSLABSPMB; k++)
            slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
        VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

        /* And chain the last slab in the previous group to this */
            VERIFY(slabstbl[ix - 1]->
                slg_slab[NSLABSPMB - 1].sl_next == NULL);
            slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =

    ix = MTOCL(buf) % NSLABSPMB;
    VERIFY(ix < NSLABSPMB);

    return (&slg->slg_slab[ix]);
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
    sp->sl_class = class;
    sp->sl_flags = flags;
    sp->sl_refcnt = refcnt;
    sp->sl_chunks = chunks;

slab_insert(mcl_slab_t *sp, mbuf_class_t class)
    VERIFY(slab_is_detached(sp));
    m_slab_cnt(class)++;
    TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
    sp->sl_flags &= ~SLF_DETACHED;
    if (class == MC_BIGCL) {
        /* Next slab must already be present */
        VERIFY(slab_is_detached(sp));
        sp->sl_flags &= ~SLF_DETACHED;
    } else if (class == MC_16KCL) {
        for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
            /* Next slab must already be present */
            VERIFY(slab_is_detached(sp));
            sp->sl_flags &= ~SLF_DETACHED;

slab_remove(mcl_slab_t *sp, mbuf_class_t class)
    VERIFY(!slab_is_detached(sp));
    VERIFY(m_slab_cnt(class) > 0);
    m_slab_cnt(class)--;
    TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
    if (class == MC_BIGCL) {
        /* Next slab must already be present */
        VERIFY(!slab_is_detached(sp));
    } else if (class == MC_16KCL) {
        for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
            /* Next slab must already be present */
            VERIFY(!slab_is_detached(sp));
slab_inrange(mcl_slab_t *sp, void *buf)
    return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
        (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));

slab_nextptr_panic(mcl_slab_t *sp, void *addr)
    unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
    uintptr_t buf = (uintptr_t)sp->sl_base;

    for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
        void *next = ((mcache_obj_t *)buf)->obj_next;
        if (mclaudit == NULL) {
            if (next != NULL && !MBUF_IN_MAP(next)) {
                mcache_t *cp = m_cache(sp->sl_class);
                panic("%s: %s buffer %p in slab %p modified "
                    "after free at offset 0: %p out of range "
                    "[%p-%p)\n", __func__, cp->mc_name,
                    (void *)buf, sp, next, mbutl, embutl);
            mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
                (mcache_obj_t *)buf);
            mcl_audit_verify_nextptr(next, mca);

slab_detach(mcl_slab_t *sp)
    sp->sl_link.tqe_next = (mcl_slab_t *)-1;
    sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
    sp->sl_flags |= SLF_DETACHED;

slab_is_detached(mcl_slab_t *sp)
    return ((intptr_t)sp->sl_link.tqe_next == -1 &&
        (intptr_t)sp->sl_link.tqe_prev == -1 &&
        (sp->sl_flags & SLF_DETACHED));
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
    mcache_audit_t *mca, *mca_tail;
    mcache_obj_t *con = NULL;
    boolean_t save_contents = (con_list != NULL);

    ASSERT(num <= NMBPCL);
    ASSERT(con_list == NULL || con_size != 0);

    /* Make sure we haven't been here before */
    for (i = 0; i < NMBPCL; i++)
        VERIFY(mclaudit[ix].cl_audit[i] == NULL);

    mca = mca_tail = *mca_list;

    for (i = 0; i < num; i++) {
        mcache_audit_t *next;

        next = mca->mca_next;
        bzero(mca, sizeof (*mca));
        mca->mca_next = next;
        mclaudit[ix].cl_audit[i] = mca;

        /* Attach the contents buffer if requested */
        if (save_contents) {
            VERIFY(con != NULL);
            mca->mca_contents_size = con_size;
            mca->mca_contents = con;
            con = con->obj_next;
            bzero(mca->mca_contents, mca->mca_contents_size);

        mca = mca->mca_next;

    *mca_list = mca_tail->mca_next;
    mca_tail->mca_next = NULL;
/*
 * Given an address of a buffer (mbuf/cluster/big cluster), return
 * the corresponding audit structure for that buffer.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
    mcache_audit_t *mca = NULL;

    VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));

        /*
         * For the mbuf case, find the index of the cluster
         * used by the mbuf and use that index to locate the
         * base address of the cluster.  Then find out the
         * mbuf index relative to the cluster base and use
         * it to locate the audit structure.
         */
        VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
        mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];

        /*
         * Same as above, but only return the first element.
         */
        mca = mclaudit[ix].cl_audit[0];
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    struct mbuf *m = addr;
    mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

    VERIFY(mca->mca_contents != NULL &&
        mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

    mcl_audit_verify_nextptr(next, mca);

        /* Save constructed mbuf fields */
        mcl_audit_save_mbuf(m, mca);
        mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
        ((mcache_obj_t *)m)->obj_next = next;

    /* Check if the buffer has been corrupted while in freelist */
    mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));

    /* Restore constructed mbuf fields */
    mcl_audit_restore_mbuf(m, mca, composite);

mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
    struct mbuf *ms = (struct mbuf *)mca->mca_contents;

        struct mbuf *next = m->m_next;
        VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
            MBUF_IS_COMPOSITE(ms));
        /*
         * We could have hand-picked the mbuf fields and restored
         * them individually, but that would be a maintenance
         * headache.  Instead, restore everything that was saved;
         * the mbuf layer will recheck and reinitialize anyway.
         */
        bcopy(ms, m, mca->mca_contents_size);

        /*
         * For a regular mbuf (no cluster attached) there's nothing
         * to restore other than the type field, which is expected
         */
        m->m_type = ms->m_type;

mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
    bcopy(m, mca->mca_contents, mca->mca_contents_size);
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
    mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

        mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
            mcl_audit_verify_nextptr(next, mca);
            ((mcache_obj_t *)addr)->obj_next = next;

        /* Check if the buffer has been corrupted while in freelist */
        mcl_audit_verify_nextptr(next, mca);
        mcache_audit_free_verify_set(mca, addr, 0, size);

mcl_audit_mcheck_panic(struct mbuf *m)
    mcache_audit_t *mca;

    mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

    panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
        m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));

mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
    if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
        !MBUF_IN_MAP(next)) {
        panic("mcl_audit: buffer %p modified after free at offset 0: "
            "%p out of range [%p-%p)\n%s\n",
            mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
SYSCTL_DECL(_kern_ipc);
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mb_normalized, 0, "");