/*
 * Copyright (c) 2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>

#include <libkern/OSAtomic.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#include <security/mac_framework.h>

#include <sys/mcache.h>
/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents an mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of NBPG in size; each
 *	object represents an mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *		|	^	+-----------------------+
 *		v	|				|
 *	mcache_alloc/mcache_alloc_ext()	     mbuf_slab_audit()
 *		|					|
 *		v					|
 *	[CPU cache] -------> (found?) -------+
 *		|					|
 *		v					|
 *	mbuf_slab_alloc()				|
 *		|					|
 *		v					|
 *	+---------> [freelist] -------> (found?) -------+
 *	|		|
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *		|	^	+------ (done) ---------+
 *		v	|				|
 *	mcache_alloc/mcache_alloc_ext()	     mbuf_cslab_audit()
 *		|					|
 *		v					|
 *	[CPU cache] -------> (found?) -------+
 *		|					|
 *		v					|
 *	mbuf_cslab_alloc()				|
 *		|					|
 *		v					|
 *	[freelist] -------> (found?) -------+
 *		|					|
 *		v					|
 *	(rudimentary object)				|
 *	mcache_alloc/mcache_alloc_ext() ------>>-----+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to the
 * caller.  As part of this step, the routine will also record the transaction
 * and pattern-fill the buffers with the BADDCAFE (uninitialized) pattern.  It
 * will also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *		|	^	+------ (done) ---------+
 *		v	|				|
 *	mcache_free/mcache_free_ext()			|
 *		|					|
 *		v					|
 *	mbuf_slab_audit()				|
 *		|					|
 *		v					|
 *	[CPU cache] ---> (not purging?) -----+
 *		|				     |
 *		v				     |
 *	mbuf_slab_free()			     |
 *		|				     |
 *		v				     |
 *	[freelist] ----------->>------------+
 *	(objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *		|	^	+------ (done) ---------+
 *		v	|				|
 *	mcache_free/mcache_free_ext()			|
 *		|					|
 *		v					|
 *	mbuf_cslab_audit()				|
 *		|					|
 *		v					|
 *	[CPU cache] ---> (not purging?) -----+
 *		|				     |
 *		v				     |
 *	mbuf_cslab_free()			     |
 *		|				     |
 *		v				     |
 *	[freelist] ---> (not purging?) -----+
 *		|				     |
 *		v				     |
 *	(rudimentary object)			     |
 *	mcache_free/mcache_free_ext() ------->>------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with the DEADBEEF (free) pattern and recording the
 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 * expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Note
 * that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOCL(addr)			+-------------+
 *	      |		+----------->	| cl_audit[1] | -----> mcache_audit_t
 *	b = CLTOM(i)	|		+-------------+
 *	      |		|		|     ...     |
 *	x = MCLIDX(b, addr)		+-------------+
 *	      |		|		| cl_audit[7] |
 *	      +---------+		+-------------+
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a cluster
 * can be turned into NMBPCL mbufs, we preserve enough space for the mbufs so
 * that there is a 1-to-1 mapping between them.  A cluster that never gets
 * (or has not yet been) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For big clusters, only one entry is allocated
 * and used for the entire cluster pair.
 */
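/*
 * Illustrative sketch only (not used by the code below): assuming "addr"
 * lies within the mapped cluster range, the lookup described above amounts
 * to roughly the following, using the MTOCL(), CLTOM() and MCLIDX() macros
 * defined later in this file:
 *
 *	int i = MTOCL(addr);			// index of the owning cluster
 *	union mcluster *b = CLTOM(i);		// base address of that cluster
 *	mcache_audit_t *mca = mclaudit[i].cl_audit[MCLIDX(b, addr)];
 */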
/* TODO: should be in header file */
/* kernel translater */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

static lck_mtx_t *mbuf_mlock;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;
/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16K cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache;	/* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized;	/* number of packets "normalized" */
static unsigned int mbuf_gscale;	/* Power-of-two growth scale for m_howmany */

#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	4	/* Threshold: 15/16 of total */
typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Regular (2K) cluster */
	MC_BIGCL,	/* Large (4K) cluster */
	MC_16KCL,	/* Jumbo (16K) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4K) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16K) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
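/*
 * For example, MBUF_CLASS_COMPOSITE(MC_MBUF_CL) and
 * MBUF_CLASS_COMPOSITE(MC_MBUF_16KCL) are both true, since those classes
 * follow MBUF_CLASS_LAST (MC_16KCL) in the enumeration above, while
 * MBUF_CLASS_COMPOSITE(MC_CL) is false.
 */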
/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1	/* for MC_MBUF_{CL,BIGCL,16KCL} caches */
/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back into the class's slab list,
 * if it's not already done.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */
/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> MCLSHIFT)	/* 512 slabs/grp */

typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;
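/*
 * Worked example (assuming the usual 2K clusters, i.e. MCLSHIFT == 11 and
 * MBSHIFT == 20): NSLABSPMB == (1 << 20) >> 11 == 512, i.e. one slab group
 * covers 512 cluster-sized slabs, or exactly 1MB of mapped memory, which is
 * what the "512 slabs/grp" annotation above refers to.
 */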
/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPCL];	/* array of audits */
} mcl_audit_t;

#if CONFIG_MBUF_NOEXPAND
static unsigned int maxmbufcl;
#endif /* CONFIG_MBUF_NOEXPAND */
/*
 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
 * and m_ext structures.  If auditing is enabled, we allocate a shadow
 * mbuf structure of this size inside each audit structure, and the
 * contents of the real mbuf get copied into it when the mbuf is freed.
 * This allows us to pattern-fill the mbuf for integrity check, and to
 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
 * Note that we don't save the contents of clusters when they are freed;
 * we simply pattern-fill them.
 */
#define	AUDIT_CONTENTS_SIZE	((MSIZE - MHLEN) + sizeof (_m_ext_t))
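/*
 * A note on the arithmetic (sketch): MHLEN is the data space left in an
 * mbuf after the m_hdr and pkthdr, so (MSIZE - MHLEN) is exactly the size
 * of those two headers; adding sizeof (_m_ext_t) makes room for the m_ext
 * portion, giving the shadow area described above.
 */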
/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */
/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mcluster *mbutl;		/* first mapped cluster address */
union mcluster *embutl;		/* ending virtual address of mclusters */
int max_linkhdr;		/* largest link-level header */
int max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

extern u_int32_t high_sb_max;
/* TODO: should be in header file */

/* The minimum number of objects that are allocated, to start. */
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MCL_LOWAT	MINCL
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL
typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist;	/* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
} mbuf_table_t;
#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of sleepers */

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static unsigned int m_length(struct mbuf *);
static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */

/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m)	\
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
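/*
 * Illustrative sketch only: at free time, the effect of the flag is roughly
 *
 *	if (MBUF_IS_COMPOSITE(m))
 *		;	// return the intact mbuf + cluster pair to its
 *			// composite cache
 *	else
 *		;	// tear the pair apart and free each piece to its
 *			// own cache
 *
 * The actual logic lives in m_free()/m_freem_list() and in the cslab
 * routines below.
 */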
/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain cluster index and base cluster address.
 */
#define	MTOCL(x)	(((char *)(x) - (char *)mbutl) >> MCLSHIFT)
#define	CLTOM(x)	((union mcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to the cluster base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> 8)
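/*
 * Worked example (assuming MSIZE == 256, which is what the ">> 8" above
 * encodes): an mbuf carved out at byte offset 512 within its 2K cluster
 * yields MCLIDX(cl, m) == 2, i.e. its audit information is found in
 * cl_audit[2] of the owning mclaudit[] entry.
 */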
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT(m, pkthdr, type) {					\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		(m)->m_pkthdr.rcvif = NULL;				\
		(m)->m_pkthdr.len = 0;					\
		(m)->m_pkthdr.header = NULL;				\
		(m)->m_pkthdr.csum_flags = 0;				\
		(m)->m_pkthdr.csum_data = 0;				\
		(m)->m_pkthdr.tso_segsz = 0;				\
		(m)->m_pkthdr.vlan_tag = 0;				\
		(m)->m_pkthdr.socket_id = 0;				\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
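/*
 * Illustrative sketch only: an mbuf sitting on the MC_MBUF_CL composite
 * freelist (reference count 0, EXTF_COMPOSITE set, see MBUF_IS_COMPOSITE)
 * is essentially the result of
 *
 *	MBUF_INIT(m, 0, MT_FREE);
 *	MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
 *
 * where "cl" is the attached 2K cluster and "rfa" its reference structure
 * obtained from ref_cache.
 */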
/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
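/*
 * Both size macros compute the byte offset of mbs_class[n] within the
 * respective structure, i.e. the fixed header plus n per-class entries;
 * for example, MB_STAT_SIZE(NELEM(mbuf_table)) is the exact number of
 * bytes needed to export one mb_class_stat_t per mbuf class.
 */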
/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater than
 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
/* This should be in a header file */
#define	atomic_add_16(a, n)	((void) OSAddAtomic16(n, a))
#define	atomic_add_32(a, n)	((void) OSAddAtomic(n, a))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
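/*
 * Usage sketch: a routine that converts a free mbuf into an MT_DATA mbuf
 * would account for it with
 *
 *	mtype_stat_inc(MT_DATA);
 *	mtype_stat_dec(MT_FREE);
 *
 * Both updates go to the per-CPU cpu_mtypes[] counters (or atomically to
 * mbstat.m_mtypes[] for types >= MT_MAX) and are folded back into
 * mbstat.m_mtypes[] by mbstat_sysctl() below.
 */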
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int m, n;
	mtypes_cpu_t mtc;

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache and mbufs */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}
static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
	} while (!OSCompareAndSwap(old, new, addr));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
static void
mbuf_table_init(void)
{
	unsigned int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters is going to be split in 2 to hold both the 2K
	 * and the 4K pools, so make sure each half is even.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);

	/*
	 * Each jumbo cluster takes 8 2K clusters, so make
	 * sure that the pool size is evenly divisible by 8.
	 */
	njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);

#if CONFIG_MBUF_NOEXPAND
	/* Only use 4k clusters if we're setting aside more than 256k */
	if (nmbclusters <= 128) {
		maxmbufcl = nmbclusters / 4;
	} else {
		/* Half to big clusters, half to small */
		maxmbufcl = (nmbclusters / 4) * 3;
	}
#endif /* CONFIG_MBUF_NOEXPAND */
	/*
	 * 1/2 of the map is reserved for 2K clusters.  Out of this, 1/16th
	 * of the total number of 2K clusters allocated is reserved and cannot
	 * be turned into mbufs.  It can only be used for pure cluster objects.
	 */
	m_minlimit(MC_CL) = (nclusters >> 5);
	m_maxlimit(MC_CL) = (nclusters >> 1);
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * The remaining (15/16th) can be turned into mbufs.
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * The other 1/2 of the map is reserved for 4K clusters.
	 */
	m_minlimit(MC_BIGCL) = 0;
	m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = NBPG;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> 3);
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
__private_extern__ unsigned int
mbuf_default_ncl(int srv, uint64_t mem)
{
	unsigned int n;
#if !defined(__LP64__)
#pragma unused(srv)
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int i;
	ncl_tbl_t *tbl = (srv ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}
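/*
 * Worked example for the 64-bit path: a non-server configuration with 4 GB
 * of memory falls between the 1 GB and 8 GB rows of ncl_table[], so the
 * loop above stops after taking the 1 GB row's value and the mbuf pool
 * defaults to 64 MB; the same machine in server mode would get 128 MB from
 * ncl_table_srv[].
 */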
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	int initmcl = MINCL;
	void *buf;
	thread_t thread = THREAD_NULL;

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);

	/* Allocate cluster slabs table */
	maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/* Allocate audit structures if needed */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_AUDIT) {
		MALLOC(mclaudit, mcl_audit_t *,
		    nmbclusters * sizeof (*mclaudit), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
	    M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));

	embutl = (union mcluster *)
	    ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));

	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));

	lck_mtx_lock(mbuf_mlock);

	if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
		panic("mbinit: m_clalloc failed\n");

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc;
		u_int32_t flags = mbuf_debug;

		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	mbuf_gscale = MB_GROWTH_NORMAL;

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbuf pool, cap the size
			 * of max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	printf("mbinit: done (%d MB memory set for mbuf pool)\n",
	    (nmbclusters << MCLSHIFT) >> MBSHIFT);
}
/*
 * Obtain a slab of object(s) from the class's freelist.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in the hope that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunk (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if (class == MC_MBUF && (wait & MCR_COMP))
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	else
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return (NULL);
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));

	if (class == MC_MBUF) {
		sp->sl_head = buf->obj_next;
		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
	} else {
		sp->sl_head = NULL;
	}
	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	/* Increment slab reference */
	sp->sl_refcnt++;

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF)
			mca->mca_uflags |= MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mcl_slab_t *nsp = sp->sl_next;
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * Increment 2nd slab.  A 4K big cluster takes
		 * 2 slabs, each having at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
		/* Next slab must already be present */
		VERIFY(nsp != NULL);
		nsp->sl_refcnt++;
		VERIFY(!slab_is_detached(nsp));
		VERIFY(nsp->sl_class == MC_BIGCL &&
		    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
		    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
		    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
		    nsp->sl_head == NULL);
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-8th slab.  A 16K big cluster takes
		 * 8 cluster slabs, each having at most 1 reference.
		 */
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		ASSERT(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL)
			_MCHECK((struct mbuf *)buf);
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 2K cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPCL at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 &&
		    (unsigned short)sp->sl_refcnt <= NMBPCL &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
		    sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
		slab_remove(sp, class);
	}

	return (buf);
}
/*
 * Place a slab of object(s) back into a class's slab list.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);
	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	if (class == MC_CL || class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A 2K cluster slab can have at most 1 reference
		 * which must be 0 at this point.
		 */
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		if (class == MC_BIGCL) {
			mcl_slab_t *nsp = sp->sl_next;
			VERIFY(IS_P2ALIGNED(buf, NBPG));
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			/* Decrement 2nd slab reference */
			nsp->sl_refcnt--;
			/*
			 * A 4K big cluster takes 2 slabs, both
			 * must now have 0 reference.
			 */
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_BIGCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16K cluster takes 8 cluster slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, NBPG));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * An mbuf slab has a total of NMBPCL reference counts.
		 * Since we have decremented the reference above, it
		 * must now be between 0 and NMBPCL-1.
		 */
		VERIFY(sp->sl_refcnt >= 0 &&
		    (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
		mca->mca_uflags &= ~MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {	/* MC_MBUF */
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/* All mbufs are freed; return the cluster that we stole earlier */
	if (sp->sl_refcnt == 0 && class == MC_MBUF) {
		m_total(MC_MBUF) -= NMBPCL;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPCL;
		mtype_stat_add(MT_FREE, -((unsigned)NMBPCL));

		/* Unchain the mbufs from the slab's freelist */
		while (sp->sl_head != NULL) {
			struct mbuf *m = sp->sl_head;
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		VERIFY(sp->sl_head == NULL);

		/* Remove the slab from the mbuf class's slab list */
		slab_remove(sp, class);

		/* Reinitialize it as a 2K cluster slab */
		slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
		    sp->sl_len, 0, 1);

		if (mclaudit != NULL)
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_head, m_maxsize(MC_CL));

		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);

		VERIFY(slab_is_detached(sp));
		/* And finally switch class */
		class = MC_CL;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp))
		slab_insert(sp, class);
}
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			/*
			 * If the number of elements in freelist has
			 * dropped below low watermark, asynchronously
			 * populate the freelist now rather than doing
			 * it later when we run out of elements.
			 */
			if (!mbuf_cached_above(class, wait) &&
			    m_infree(class) < m_total(class) >> 5) {
				(void) freelist_populate(class, 1,
				    M_DONTWAIT);
			}

			if (--need == 0)
				break;
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0)
				continue;

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait))
				break;

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait))
				break;

			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return (num - need);
}
/*
 * Common de-allocator for rudimentary objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_obj_t *nlist;
	unsigned int num = 0;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;
		slab_free(class, list);
		++num;
		if ((list = nlist) == NULL)
			break;
	}
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for rudimentary objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	while (list != NULL) {
		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(class, list);

		/* Do the sanity checks */
		if (class == MC_MBUF) {
			mcl_audit_mbuf(mca, list, FALSE, alloc);
			ASSERT(mca->mca_uflags & MB_SCVALID);
		} else {
			mcl_audit_cluster(mca, list, m_maxsize(class),
			    alloc, TRUE);
			ASSERT(!(mca->mca_uflags & MB_SCVALID));
		}
		/* Record this transaction */
		mcache_buffer_log(mca, list, m_cache(class));
		if (alloc)
			mca->mca_uflags |= MB_INUSE;
		else
			mca->mca_uflags &= ~MB_INUSE;
		/* Unpair the object (unconditionally) */
		mca->mca_uptr = NULL;
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
/*
 * Common notify routine for all caches.  It is called by mcache when
 * one or more objects get freed.  We use this indication to trigger
 * the wakeup of any sleeping threads so that they can retry their
 * allocation requests.
 */
static void
mbuf_slab_notify(void *arg, u_int32_t reason)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	int w;

	ASSERT(MBUF_CLASS_VALID(class));

	if (reason != MCN_RETRYALLOC)
		return;

	lck_mtx_lock(mbuf_mlock);
	if ((w = mb_waiters) > 0) {
		m_notified(class)++;
		mb_waiters = 0;
	}
	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Obtain object(s) from the composite class's freelist.
 */
static unsigned int
cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
{
	unsigned int need = num;
	mcl_slab_t *sp, *clsp, *nsp;
	struct mbuf *m;
	mcache_obj_t **list = *plist;
	void *cl;

	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Get what we can from the freelist */
	while ((*list = m_cobjlist(class)) != NULL) {
		m = (struct mbuf *)*list;
		cl = m->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(m->m_flags == M_EXT && cl != NULL);
		VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
		VERIFY(clsp->sl_refcnt == 1);
		if (class == MC_MBUF_BIGCL) {
			nsp = clsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			VERIFY(nsp->sl_refcnt == 1);
		} else if (class == MC_MBUF_16KCL) {
			int k;
			for (nsp = clsp, k = 1;
			    k < (M16KCLBYTES / MCLBYTES); k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
		    !MBUF_IN_MAP(m_cobjlist(class))) {
			slab_nextptr_panic(sp, m_cobjlist(class));
			/* NOTREACHED */
		}
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		if (--need == 0)
			break;
	}
	m_infree(class) -= (num - need);

	return (num - need);
}
/*
 * Place object(s) back into a composite class's freelist.
 */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;
	mcl_slab_t *clsp, *nsp;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1811 while ((m
= ms
= (struct mbuf
*)o
) != NULL
) {
1812 mcache_obj_t
*rfa
, *nexto
= o
->obj_next
;
1814 /* Do the mbuf sanity checks */
1815 if (mclaudit
!= NULL
) {
1816 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
1817 mcache_audit_free_verify(mca
, m
, 0, m_maxsize(MC_MBUF
));
1818 ms
= (struct mbuf
*)mca
->mca_contents
;
1821 /* Do the cluster sanity checks */
1822 cl
= ms
->m_ext
.ext_buf
;
1823 clsp
= slab_get(cl
);
1824 if (mclaudit
!= NULL
) {
1826 if (class == MC_MBUF_CL
)
1827 size
= m_maxsize(MC_CL
);
1828 else if (class == MC_MBUF_BIGCL
)
1829 size
= m_maxsize(MC_BIGCL
);
1831 size
= m_maxsize(MC_16KCL
);
1832 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL
,
1833 (mcache_obj_t
*)cl
), cl
, 0, size
);
1835 VERIFY(ms
->m_type
== MT_FREE
);
1836 VERIFY(ms
->m_flags
== M_EXT
);
1837 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
1838 VERIFY(clsp
->sl_refcnt
== 1);
1839 if (class == MC_MBUF_BIGCL
) {
1840 nsp
= clsp
->sl_next
;
1841 /* Next slab must already be present */
1842 VERIFY(nsp
!= NULL
);
1843 VERIFY(nsp
->sl_refcnt
== 1);
1844 } else if (class == MC_MBUF_16KCL
) {
1846 for (nsp
= clsp
, k
= 1;
1847 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
1849 /* Next slab must already be present */
1850 VERIFY(nsp
!= NULL
);
1851 VERIFY(nsp
->sl_refcnt
== 1);
		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL)
				mcl_audit_restore_mbuf(m, mca, TRUE);
1869 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
1870 rfa
->obj_next
= ref_list
;
1874 m
->m_type
= MT_FREE
;
1875 m
->m_flags
= m
->m_len
= 0;
1876 m
->m_next
= m
->m_nextpkt
= NULL
;
1878 /* Save mbuf fields and make auditing happy */
1879 if (mclaudit
!= NULL
)
1880 mcl_audit_mbuf(mca
, o
, FALSE
, FALSE
);
1882 VERIFY(m_total(class) > 0);
1887 slab_free(MC_MBUF
, o
);
1889 /* And free the cluster */
1890 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
1891 if (class == MC_MBUF_CL
)
1892 slab_free(MC_CL
, cl
);
1893 else if (class == MC_MBUF_BIGCL
)
1894 slab_free(MC_BIGCL
, cl
);
1896 slab_free(MC_16KCL
, cl
);
1905 tail
->obj_next
= m_cobjlist(class);
1906 m_cobjlist(class) = list
;
1907 m_infree(class) += num
;
1908 } else if (ref_list
!= NULL
) {
1909 mcache_free_ext(ref_cache
, ref_list
);
/*
 * Common allocator for composite objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in
 * the bucket layer.  It returns one or more composite elements from the
 * appropriate global freelist.  If the freelist is empty, it will attempt
 * to obtain the rudimentary objects from their caches and construct them
 * into composite mbuf + cluster objects.
 */
static unsigned int
mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
    int wait)
{
1927 mbuf_class_t
class = (mbuf_class_t
)arg
;
1928 mcache_t
*cp
= NULL
;
1929 unsigned int num
= 0, cnum
= 0, want
= needed
;
1930 mcache_obj_t
*ref_list
= NULL
;
1931 mcache_obj_t
*mp_list
= NULL
;
1932 mcache_obj_t
*clp_list
= NULL
;
1933 mcache_obj_t
**list
;
1934 struct ext_ref
*rfa
;
1938 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1941 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
1943 /* There should not be any slab for this class */
1944 VERIFY(m_slab_cnt(class) == 0 &&
1945 m_slablist(class).tqh_first
== NULL
&&
1946 m_slablist(class).tqh_last
== NULL
);
1948 lck_mtx_lock(mbuf_mlock
);
1950 /* Try using the freelist first */
1951 num
= cslab_alloc(class, plist
, needed
);
1953 if (num
== needed
) {
1954 m_alloc_cnt(class) += num
;
1955 lck_mtx_unlock(mbuf_mlock
);
1959 lck_mtx_unlock(mbuf_mlock
);
	/*
	 * We could not satisfy the request using the freelist alone;
	 * allocate from the appropriate rudimentary caches and use
	 * whatever we can get to construct the composite objects.
	 */

	/*
	 * Mark these allocation requests as coming from a composite cache.
	 * Also, if the caller is willing to be blocked, mark the request
	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
	 * slab layer waiting for the individual object when one or more
	 * of the already-constructed composite objects are available.
	 */
1976 if (!(wait
& MCR_NOSLEEP
))
1979 needed
= mcache_alloc_ext(m_cache(MC_MBUF
), &mp_list
, needed
, wait
);
1981 ASSERT(mp_list
== NULL
);
1984 if (class == MC_MBUF_CL
)
1985 cp
= m_cache(MC_CL
);
1986 else if (class == MC_MBUF_BIGCL
)
1987 cp
= m_cache(MC_BIGCL
);
1989 cp
= m_cache(MC_16KCL
);
1990 needed
= mcache_alloc_ext(cp
, &clp_list
, needed
, wait
);
1992 ASSERT(clp_list
== NULL
);
1995 needed
= mcache_alloc_ext(ref_cache
, &ref_list
, needed
, wait
);
1997 ASSERT(ref_list
== NULL
);
	/*
	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
	 * overs will get freed accordingly before we return to caller.
	 */
	for (cnum = 0; cnum < needed; cnum++) {
2008 m
= ms
= (struct mbuf
*)mp_list
;
2009 mp_list
= mp_list
->obj_next
;
2012 clp_list
= clp_list
->obj_next
;
2013 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
2015 rfa
= (struct ext_ref
*)ref_list
;
2016 ref_list
= ref_list
->obj_next
;
2017 ((mcache_obj_t
*)rfa
)->obj_next
= NULL
;
		/*
		 * If auditing is enabled, construct the shadow mbuf
		 * in the audit structure instead of in the actual one.
		 * mbuf_cslab_audit() will take care of restoring the
		 * contents after the integrity check.
		 */
		if (mclaudit != NULL) {
			mcache_audit_t *mca, *cl_mca;
2029 lck_mtx_lock(mbuf_mlock
);
2030 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2031 ms
= ((struct mbuf
*)mca
->mca_contents
);
2032 cl_mca
= mcl_audit_buf2mca(MC_CL
, (mcache_obj_t
*)cl
);
			/*
			 * Pair them up.  Note that this is done at the time
			 * the mbuf+cluster objects are constructed.  This
			 * information should be treated as "best effort"
			 * debugging hint since more than one mbufs can refer
			 * to a cluster.  In that case, the cluster might not
			 * be freed along with the mbuf it was paired with.
			 */
			mca->mca_uptr = cl_mca;
			cl_mca->mca_uptr = mca;
2045 ASSERT(mca
->mca_uflags
& MB_SCVALID
);
2046 ASSERT(!(cl_mca
->mca_uflags
& MB_SCVALID
));
2047 lck_mtx_unlock(mbuf_mlock
);
2049 /* Technically, they are in the freelist */
2050 mcache_set_pattern(MCACHE_FREE_PATTERN
, m
,
2051 m_maxsize(MC_MBUF
));
2052 if (class == MC_MBUF_CL
)
2053 size
= m_maxsize(MC_CL
);
2054 else if (class == MC_MBUF_BIGCL
)
2055 size
= m_maxsize(MC_BIGCL
);
2057 size
= m_maxsize(MC_16KCL
);
2058 mcache_set_pattern(MCACHE_FREE_PATTERN
, cl
, size
);
2061 MBUF_INIT(ms
, 0, MT_FREE
);
2062 if (class == MC_MBUF_16KCL
) {
2063 MBUF_16KCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2064 } else if (class == MC_MBUF_BIGCL
) {
2065 MBUF_BIGCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2067 MBUF_CL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2069 VERIFY(ms
->m_flags
== M_EXT
);
2070 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2072 *list
= (mcache_obj_t
*)m
;
2073 (*list
)->obj_next
= NULL
;
2074 list
= *plist
= &(*list
)->obj_next
;
	/*
	 * Free up what's left of the above.
	 */
	if (mp_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	if (clp_list != NULL)
		mcache_free_ext(cp, clp_list);
	if (ref_list != NULL)
		mcache_free_ext(ref_cache, ref_list);
2088 lck_mtx_lock(mbuf_mlock
);
2089 if (num
> 0 || cnum
> 0) {
2090 m_total(class) += cnum
;
2091 VERIFY(m_total(class) <= m_maxlimit(class));
2092 m_alloc_cnt(class) += num
+ cnum
;
2094 if ((num
+ cnum
) < want
)
2095 m_fail_cnt(class) += (want
- (num
+ cnum
));
2096 lck_mtx_unlock(mbuf_mlock
);
2098 return (num
+ cnum
);
/*
 * Common de-allocator for composite objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int num;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	num = cslab_free(class, list, purged);
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for composite objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;
	struct mbuf *m, *ms;
	mcl_slab_t *clsp, *nsp;
	size_t size;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2149 while ((m
= ms
= (struct mbuf
*)list
) != NULL
) {
2150 lck_mtx_lock(mbuf_mlock
);
2151 /* Do the mbuf sanity checks and record its transaction */
2152 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2153 mcl_audit_mbuf(mca
, m
, TRUE
, alloc
);
2154 mcache_buffer_log(mca
, m
, m_cache(class));
2156 mca
->mca_uflags
|= MB_COMP_INUSE
;
2158 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
		/*
		 * Use the shadow mbuf in the audit structure if we are
		 * freeing, since the contents of the actual mbuf has been
		 * pattern-filled by the above call to mcl_audit_mbuf().
		 */
		if (!alloc)
			ms = (struct mbuf *)mca->mca_contents;
2168 /* Do the cluster sanity checks and record its transaction */
2169 cl
= ms
->m_ext
.ext_buf
;
2170 clsp
= slab_get(cl
);
2171 VERIFY(ms
->m_flags
== M_EXT
&& cl
!= NULL
);
2172 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2173 VERIFY(clsp
->sl_refcnt
== 1);
2174 if (class == MC_MBUF_BIGCL
) {
2175 nsp
= clsp
->sl_next
;
2176 /* Next slab must already be present */
2177 VERIFY(nsp
!= NULL
);
2178 VERIFY(nsp
->sl_refcnt
== 1);
2179 } else if (class == MC_MBUF_16KCL
) {
2181 for (nsp
= clsp
, k
= 1;
2182 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2184 /* Next slab must already be present */
2185 VERIFY(nsp
!= NULL
);
2186 VERIFY(nsp
->sl_refcnt
== 1);
2190 mca
= mcl_audit_buf2mca(MC_CL
, cl
);
2191 if (class == MC_MBUF_CL
)
2192 size
= m_maxsize(MC_CL
);
2193 else if (class == MC_MBUF_BIGCL
)
2194 size
= m_maxsize(MC_BIGCL
);
2196 size
= m_maxsize(MC_16KCL
);
2197 mcl_audit_cluster(mca
, cl
, size
, alloc
, FALSE
);
2198 mcache_buffer_log(mca
, cl
, m_cache(class));
2200 mca
->mca_uflags
|= MB_COMP_INUSE
;
2202 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2203 lck_mtx_unlock(mbuf_mlock
);
2205 list
= list
->obj_next
;
/*
 * Allocate some number of mbuf clusters and place on cluster freelist.
 */
static int
m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
{
	int i;
	int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
	vm_offset_t page = 0;
	mcache_audit_t *mca_list = NULL;
	mcache_obj_t *con_list = NULL;
	mcl_slab_t *sp;

	VERIFY(bufsize == m_maxsize(MC_CL) ||
	    bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Multiple threads may attempt to populate the cluster map one
	 * after another.  Since we drop the lock below prior to acquiring
	 * the physical page(s), our view of the cluster map may no longer
	 * be accurate, and we could end up over-committing the pages beyond
	 * the maximum allowed for each class.  To prevent it, this entire
	 * operation (including the page mapping) is serialized.
	 */
	while (mb_clalloc_busy) {
		mb_clalloc_waiters++;
		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
		    (PZERO-1), "m_clalloc", NULL);
		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* We are busy now; tell everyone else to go away */
	mb_clalloc_busy = TRUE;
2247 * Honor the caller's wish to block or not block. We have a way
2248 * to grow the pool asynchronously using the mbuf worker thread.
2250 i
= m_howmany(num
, bufsize
);
2251 if (i
== 0 || (wait
& M_DONTWAIT
))
2254 lck_mtx_unlock(mbuf_mlock
);
2256 size
= round_page(i
* bufsize
);
2257 page
= kmem_mb_alloc(mb_map
, size
, large_buffer
);
2260 * If we did ask for "n" 16K physically contiguous chunks
2261 * and didn't get them, then please try again without this
2264 if (large_buffer
&& page
== 0)
2265 page
= kmem_mb_alloc(mb_map
, size
, 0);
2268 if (bufsize
<= m_maxsize(MC_BIGCL
)) {
2269 /* Try for 1 page if failed, only for 2KB/4KB request */
2271 page
= kmem_mb_alloc(mb_map
, size
, 0);
2275 lck_mtx_lock(mbuf_mlock
);
2280 VERIFY(IS_P2ALIGNED(page
, NBPG
));
2281 numpages
= size
/ NBPG
;
	/* If auditing is enabled, allocate the audit structures now */
	if (mclaudit != NULL) {
		int needed;

		/*
		 * Yes, I realize this is a waste of memory for clusters
		 * that never get transformed into mbufs, as we may end
		 * up with NMBPCL-1 unused audit structures per cluster.
		 * But doing so tremendously simplifies the allocation
		 * strategy, since at this point we are not holding the
		 * mbuf lock and the caller is okay to be blocked.  For
		 * the case of big clusters, we allocate one structure
		 * for each as we never turn them into mbufs.
		 */
2297 if (bufsize
== m_maxsize(MC_CL
)) {
2298 needed
= numpages
* 2 * NMBPCL
;
2300 i
= mcache_alloc_ext(mcl_audit_con_cache
,
2301 &con_list
, needed
, MCR_SLEEP
);
2303 VERIFY(con_list
!= NULL
&& i
== needed
);
2304 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2307 needed
= numpages
/ (M16KCLBYTES
/ NBPG
);
2310 i
= mcache_alloc_ext(mcache_audit_cache
,
2311 (mcache_obj_t
**)&mca_list
, needed
, MCR_SLEEP
);
2313 VERIFY(mca_list
!= NULL
&& i
== needed
);
2316 lck_mtx_lock(mbuf_mlock
);
2318 for (i
= 0; i
< numpages
; i
++, page
+= NBPG
) {
2319 ppnum_t offset
= ((char *)page
- (char *)mbutl
) / NBPG
;
2320 ppnum_t new_page
= pmap_find_phys(kernel_pmap
,
		/*
		 * In the case of no mapper being available the following
		 * code noops and returns the input page; if there is a
		 * mapper the appropriate I/O page is returned.
		 */
		VERIFY(offset < mcl_pages);
2329 new_page
= IOMapperInsertPage(mcl_paddr_base
, offset
, new_page
);
2330 mcl_paddr
[offset
] = new_page
<< PGSHIFT
;
2332 /* Pattern-fill this fresh page */
2333 if (mclaudit
!= NULL
)
2334 mcache_set_pattern(MCACHE_FREE_PATTERN
,
2335 (caddr_t
)page
, NBPG
);
2337 if (bufsize
== m_maxsize(MC_CL
)) {
2338 union mcluster
*mcl
= (union mcluster
*)page
;
2340 /* 1st cluster in the page */
2342 if (mclaudit
!= NULL
)
2343 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2344 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2346 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2347 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2348 mcl
, mcl
, bufsize
, 0, 1);
2350 /* Insert this slab */
2351 slab_insert(sp
, MC_CL
);
2353 /* Update stats now since slab_get() drops the lock */
2354 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2355 m_infree(MC_MBUF_CL
);
2356 mbstat
.m_clusters
= ++m_total(MC_CL
);
2357 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2359 /* 2nd cluster in the page */
2360 sp
= slab_get(++mcl
);
2361 if (mclaudit
!= NULL
)
2362 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2363 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2365 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2366 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2367 mcl
, mcl
, bufsize
, 0, 1);
2369 /* Insert this slab */
2370 slab_insert(sp
, MC_CL
);
2372 /* Update stats now since slab_get() drops the lock */
2373 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2374 m_infree(MC_MBUF_CL
);
2375 mbstat
.m_clusters
= ++m_total(MC_CL
);
2376 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2377 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2378 union mbigcluster
*mbc
= (union mbigcluster
*)page
;
2381 /* One for the entire page */
2383 if (mclaudit
!= NULL
)
2384 mcl_audit_init(mbc
, &mca_list
, NULL
, 0, 1);
2386 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2387 slab_init(sp
, MC_BIGCL
, SLF_MAPPED
,
2388 mbc
, mbc
, bufsize
, 0, 1);
2390 /* 2nd cluster's slab is part of the previous one */
2391 nsp
= slab_get(((union mcluster
*)page
) + 1);
2392 slab_init(nsp
, MC_BIGCL
, SLF_MAPPED
| SLF_PARTIAL
,
2393 mbc
, NULL
, 0, 0, 0);
2395 /* Insert this slab */
2396 slab_insert(sp
, MC_BIGCL
);
2398 /* Update stats now since slab_get() drops the lock */
2399 mbstat
.m_bigclfree
= ++m_infree(MC_BIGCL
) +
2400 m_infree(MC_MBUF_BIGCL
);
2401 mbstat
.m_bigclusters
= ++m_total(MC_BIGCL
);
2402 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2403 } else if ((i
% (M16KCLBYTES
/ NBPG
)) == 0) {
2404 union m16kcluster
*m16kcl
= (union m16kcluster
*)page
;
2409 /* One for the entire 16KB */
2410 sp
= slab_get(m16kcl
);
2411 if (mclaudit
!= NULL
)
2412 mcl_audit_init(m16kcl
, &mca_list
, NULL
, 0, 1);
2414 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2415 slab_init(sp
, MC_16KCL
, SLF_MAPPED
,
2416 m16kcl
, m16kcl
, bufsize
, 0, 1);
2418 /* 2nd-8th cluster's slab is part of the first one */
2419 for (k
= 1; k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2420 nsp
= slab_get(((union mcluster
*)page
) + k
);
2421 VERIFY(nsp
->sl_refcnt
== 0 &&
2422 nsp
->sl_flags
== 0);
2423 slab_init(nsp
, MC_16KCL
,
2424 SLF_MAPPED
| SLF_PARTIAL
,
2425 m16kcl
, NULL
, 0, 0, 0);
2428 /* Insert this slab */
2429 slab_insert(sp
, MC_16KCL
);
2431 /* Update stats now since slab_get() drops the lock */
2432 m_infree(MC_16KCL
)++;
2433 m_total(MC_16KCL
)++;
2434 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
2437 VERIFY(mca_list
== NULL
&& con_list
== NULL
);
	/* We're done; let others enter */
	mb_clalloc_busy = FALSE;
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}
2446 if (bufsize
== m_maxsize(MC_CL
))
2447 return (numpages
<< 1);
2448 else if (bufsize
== m_maxsize(MC_BIGCL
))
2451 VERIFY(bufsize
== m_maxsize(MC_16KCL
));
2452 return (numpages
/ (M16KCLBYTES
/ NBPG
));
2455 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2457 /* We're done; let others enter */
2458 mb_clalloc_busy
= FALSE
;
2459 if (mb_clalloc_waiters
> 0) {
2460 mb_clalloc_waiters
= 0;
2461 wakeup(mb_clalloc_waitchan
);
	/*
	 * When non-blocking we kick a thread if we have to grow the
	 * pool or if the number of free clusters is less than requested.
	 */
	if (bufsize == m_maxsize(MC_CL)) {
2471 * Remember total number of clusters needed
2474 i
+= m_total(MC_CL
);
2475 if (i
> mbuf_expand_mcl
) {
2476 mbuf_expand_mcl
= i
;
2477 if (mbuf_worker_ready
)
2478 wakeup((caddr_t
)&mbuf_worker_run
);
2482 if (m_infree(MC_CL
) >= num
)
2484 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2487 * Remember total number of 4KB clusters needed
2490 i
+= m_total(MC_BIGCL
);
2491 if (i
> mbuf_expand_big
) {
2492 mbuf_expand_big
= i
;
2493 if (mbuf_worker_ready
)
2494 wakeup((caddr_t
)&mbuf_worker_run
);
2498 if (m_infree(MC_BIGCL
) >= num
)
2503 * Remember total number of 16KB clusters needed
2506 i
+= m_total(MC_16KCL
);
2507 if (i
> mbuf_expand_16k
) {
2508 mbuf_expand_16k
= i
;
2509 if (mbuf_worker_ready
)
2510 wakeup((caddr_t
)&mbuf_worker_run
);
2514 if (m_infree(MC_16KCL
) >= num
)
/*
 * Populate the global freelist of the corresponding buffer class.
 */
static int
freelist_populate(mbuf_class_t class, unsigned int num, int wait)
{
2526 mcache_obj_t
*o
= NULL
;
2529 VERIFY(class == MC_MBUF
|| class == MC_CL
|| class == MC_BIGCL
||
2532 #if CONFIG_MBUF_NOEXPAND
2533 if ((mbstat
.m_mbufs
/ NMBPCL
) >= maxmbufcl
) {
2535 static int printonce
= 1;
2536 if (printonce
== 1) {
2538 printf("m_expand failed, allocated %ld out of %d "
2539 "clusters\n", mbstat
.m_mbufs
/ NMBPCL
,
2545 #endif /* CONFIG_MBUF_NOEXPAND */
2547 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2552 i
= m_clalloc(num
, wait
, m_maxsize(MC_CL
));
2554 /* Respect the 2K clusters minimum limit */
2555 if (m_total(MC_CL
) == m_maxlimit(MC_CL
) &&
2556 m_infree(MC_CL
) <= m_minlimit(MC_CL
)) {
2557 if (class != MC_CL
|| (wait
& MCR_COMP
))
2566 return (m_clalloc(num
, wait
, m_maxsize(class)) != 0);
2574 /* Steal a cluster and cut it up to create NMBPCL mbufs */
2575 if ((o
= slab_alloc(MC_CL
, wait
)) != NULL
) {
2576 struct mbuf
*m
= (struct mbuf
*)o
;
2577 mcache_audit_t
*mca
= NULL
;
2578 mcl_slab_t
*sp
= slab_get(o
);
2580 VERIFY(slab_is_detached(sp
) &&
2581 (sp
->sl_flags
& (SLF_MAPPED
| SLF_PARTIAL
)) == SLF_MAPPED
);
2583 /* Make sure that the cluster is unmolested while in freelist */
2584 if (mclaudit
!= NULL
) {
2585 mca
= mcl_audit_buf2mca(MC_CL
, o
);
2586 mcache_audit_free_verify(mca
, o
, 0, m_maxsize(MC_CL
));
2589 /* Reinitialize it as an mbuf slab */
2590 slab_init(sp
, MC_MBUF
, sp
->sl_flags
, sp
->sl_base
, NULL
,
2591 sp
->sl_len
, 0, NMBPCL
);
2593 VERIFY(m
== (struct mbuf
*)sp
->sl_base
);
2594 VERIFY(sp
->sl_head
== NULL
);
2596 m_total(MC_MBUF
) += NMBPCL
;
2597 mbstat
.m_mbufs
= m_total(MC_MBUF
);
2598 m_infree(MC_MBUF
) += NMBPCL
;
2599 mtype_stat_add(MT_FREE
, NMBPCL
);
2604 * If auditing is enabled, construct the shadow mbuf
2605 * in the audit structure instead of the actual one.
2606 * mbuf_slab_audit() will take care of restoring the
2607 * contents after the integrity check.
2609 if (mclaudit
!= NULL
) {
2611 mca
= mcl_audit_buf2mca(MC_MBUF
,
2613 ms
= ((struct mbuf
*)mca
->mca_contents
);
2614 ms
->m_type
= MT_FREE
;
2616 m
->m_type
= MT_FREE
;
2618 m
->m_next
= sp
->sl_head
;
2619 sp
->sl_head
= (void *)m
++;
2622 /* Insert it into the mbuf class's slab list */
2623 slab_insert(sp
, MC_MBUF
);
2625 if ((i
= mb_waiters
) > 0)
2628 wakeup(mb_waitchan
);
/*
 * (Inaccurately) check if it might be worth a trip back to the
 * mcache layer due to the availability of objects there.  We'll
 * end up back here if there's nothing up there.
 */
static boolean_t
mbuf_cached_above(mbuf_class_t class, int wait)
{
2646 if (wait
& MCR_COMP
)
2647 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)) ||
2648 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2652 if (wait
& MCR_COMP
)
2653 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)));
2657 if (wait
& MCR_COMP
)
2658 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2662 if (wait
& MCR_COMP
)
2663 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL
)));
2676 return (!mcache_bkt_isempty(m_cache(class)));
/*
 * If possible, convert constructed objects to raw ones.
 */
static boolean_t
mbuf_steal(mbuf_class_t class, unsigned int num)
{
	mcache_obj_t *top = NULL;
	mcache_obj_t **list = &top;
	unsigned int tot = 0;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2701 /* Get the required number of constructed objects if possible */
2702 if (m_infree(class) > m_minlimit(class)) {
2703 tot
= cslab_alloc(class, &list
,
2704 MIN(num
, m_infree(class)));
2707 /* And destroy them to get back the raw objects */
2709 (void) cslab_free(class, top
, 1);
2717 return (tot
== num
);
2721 m_reclaim(mbuf_class_t
class, unsigned int num
, boolean_t comp
)
2725 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2727 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2728 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2729 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
2737 m_wantpurge(MC_CL
)++;
2738 m_wantpurge(MC_MBUF_CL
)++;
2739 m_wantpurge(MC_MBUF_BIGCL
)++;
2743 m_wantpurge(MC_MBUF
)++;
2745 m_wantpurge(MC_MBUF_CL
)++;
2750 m_wantpurge(MC_MBUF_BIGCL
)++;
2755 m_wantpurge(MC_MBUF_16KCL
)++;
	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class.  If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		if (m_wantpurge(m) > 0) {
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes.  Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num))
				bmap |= (1 << m);
2783 lck_mtx_unlock(mbuf_mlock
);
2786 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2789 /* Sigh; we have no other choices but to ask mcache to purge */
2790 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
2791 if ((bmap
& (1 << m
)) &&
2792 mcache_purge_cache(m_cache(m
))) {
2793 lck_mtx_lock(mbuf_mlock
);
2796 lck_mtx_unlock(mbuf_mlock
);
	/*
	 * Request mcache to reap extra elements from all of its caches;
	 * note that all reaps are serialized and happen only at a fixed
	 * interval.
	 */

	lck_mtx_lock(mbuf_mlock);
static inline struct mbuf *
m_get_common(int wait, short type, int hdr)
{
	struct mbuf *m;
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
	if (m != NULL) {
		MBUF_INIT(m, hdr, type);
		mtype_stat_inc(type);
		mtype_stat_dec(MT_FREE);
#if CONFIG_MACF_NET
		if (hdr && mac_init_mbuf(m, wait) != 0) {
			m_free(m);
			return (NULL);
		}
#endif /* MAC_NET */
	}
	return (m);
}
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 */
#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
#define	_M_RETRY(wait, type)	_M_GET(wait, type)
#define	_M_RETRYHDR(wait, type)	_M_GETHDR(wait, type)
#define	_MGET(m, how, type)	((m) = _M_GET(how, type))
#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR(how, type))
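
/*
 * Usage note (illustrative; the expansion simply follows the macro
 * definitions above): a caller that writes
 *
 *	_MGETHDR(m, M_DONTWAIT, MT_DATA);
 *
 * ends up with
 *
 *	(m) = m_get_common(M_DONTWAIT, MT_DATA, 1);
 *
 * i.e. a packet-header mbuf allocated from the MC_MBUF cache without
 * blocking.
 */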
struct mbuf *
m_get(int wait, int type)
{
	return (_M_GET(wait, type));
}

struct mbuf *
m_gethdr(int wait, int type)
{
	return (_M_GETHDR(wait, type));
}

struct mbuf *
m_retry(int wait, int type)
{
	return (_M_RETRY(wait, type));
}

struct mbuf *
m_retryhdr(int wait, int type)
{
	return (_M_RETRYHDR(wait, type));
}

struct mbuf *
m_getclr(int wait, int type)
{
	struct mbuf *m;

	_MGET(m, wait, type);
	if (m != NULL)
		bzero(MTOD(m, caddr_t), MLEN);
	return (m);
}
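
/*
 * Illustrative caller (hypothetical; only m_gethdr() and m_freem() are
 * defined in this file):
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m->m_len = m->m_pkthdr.len = 0;
 *		... fill in data or hand off, then eventually ...
 *		m_freem(m);
 *	}
 */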
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE)
		panic("m_free: freeing an already freed mbuf");

	/* Free the aux data and tags if there is any */
	if (m->m_flags & M_PKTHDR) {
		m_tag_delete_chain(m, NULL);
	}
2894 if (m
->m_flags
& M_EXT
) {
2898 refcnt
= m_decref(m
);
2899 flags
= MEXT_FLAGS(m
);
2900 if (refcnt
== 0 && flags
== 0) {
2901 if (m
->m_ext
.ext_free
== NULL
) {
2902 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
2903 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2904 mcache_free(m_cache(MC_BIGCL
),
2906 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
2907 mcache_free(m_cache(MC_16KCL
),
2910 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
2911 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
2913 mcache_free(ref_cache
, MEXT_RFA(m
));
2915 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
2916 VERIFY(m
->m_type
!= MT_FREE
);
2918 mtype_stat_dec(m
->m_type
);
2919 mtype_stat_inc(MT_FREE
);
2921 m
->m_type
= MT_FREE
;
2924 m
->m_next
= m
->m_nextpkt
= NULL
;
2926 /* "Free" into the intermediate cache */
2927 if (m
->m_ext
.ext_free
== NULL
) {
2928 mcache_free(m_cache(MC_MBUF_CL
), m
);
2929 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2930 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
2932 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
2933 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
	if (m->m_type != MT_FREE) {
		mtype_stat_dec(m->m_type);
		mtype_stat_inc(MT_FREE);
	}

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mcache_free(m_cache(MC_MBUF), m);

	return (n);
}
2953 __private_extern__
struct mbuf
*
2954 m_clattach(struct mbuf
*m
, int type
, caddr_t extbuf
,
2955 void (*extfree
)(caddr_t
, u_int
, caddr_t
), u_int extsize
, caddr_t extarg
,
2958 struct ext_ref
*rfa
= NULL
;
2960 if (m
== NULL
&& (m
= _M_GETHDR(wait
, type
)) == NULL
)
2963 if (m
->m_flags
& M_EXT
) {
2967 refcnt
= m_decref(m
);
2968 flags
= MEXT_FLAGS(m
);
2969 if (refcnt
== 0 && flags
== 0) {
2970 if (m
->m_ext
.ext_free
== NULL
) {
2971 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
2972 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2973 mcache_free(m_cache(MC_BIGCL
),
2975 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
2976 mcache_free(m_cache(MC_16KCL
),
2979 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
2980 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
2982 /* Re-use the reference structure */
2984 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
2985 VERIFY(m
->m_type
!= MT_FREE
);
2987 mtype_stat_dec(m
->m_type
);
2988 mtype_stat_inc(MT_FREE
);
2990 m
->m_type
= MT_FREE
;
2993 m
->m_next
= m
->m_nextpkt
= NULL
;
2994 /* "Free" into the intermediate cache */
2995 if (m
->m_ext
.ext_free
== NULL
) {
2996 mcache_free(m_cache(MC_MBUF_CL
), m
);
2997 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2998 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
3000 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3001 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
3004 * Allocate a new mbuf, since we didn't divorce
3005 * the composite mbuf + cluster pair above.
3007 if ((m
= _M_GETHDR(wait
, type
)) == NULL
)
3013 (rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
) {
3018 MEXT_INIT(m
, extbuf
, extsize
, extfree
, extarg
, rfa
, 1, 0);
/*
 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
 * clusters.  (If the cache is empty, new clusters are allocated en-masse.)
 */
struct mbuf *
m_getcl(int wait, int type, int flags)
{
	struct mbuf *m;
	int mcflags = MSLEEPF(wait);
	int hdr = (flags & M_PKTHDR);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
	if (m != NULL) {
		MBUF_INIT(m, hdr, type);
		mtype_stat_inc(type);
		mtype_stat_dec(MT_FREE);
#if CONFIG_MACF_NET
		if (hdr && mac_init_mbuf(m, wait) != 0) {
			m_freem(m);
			return (NULL);
		}
#endif /* MAC_NET */
	}
	return (m);
}
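
/*
 * Illustrative driver-style caller (hypothetical): grab a cluster-backed
 * packet-header mbuf without blocking and bail out gracefully when the
 * caches are exhausted.
 *
 *	struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = 0;
 */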
/* m_mclget() add an mbuf cluster to a normal mbuf */
struct mbuf *
m_mclget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_mclalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
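
/*
 * Classic two-step allocation that this helper backs (caller code shown is
 * illustrative only; MGETHDR and MCLGET are the macro forms from
 * <sys/mbuf.h>):
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		MCLGET(m, M_DONTWAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);
 *			m = NULL;
 *		}
 *	}
 */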
/* Allocate an mbuf cluster */
caddr_t
m_mclalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_CL), mcflags));
}

/* Free an mbuf cluster */
void
m_mclfree(caddr_t p)
{
	mcache_free(m_cache(MC_CL), p);
}
/*
 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
 * another mbuf.
 */
int
m_mclhasreference(struct mbuf *m)
{
	if (!(m->m_flags & M_EXT))
		return (0);

	ASSERT(MEXT_RFA(m) != NULL);

	return (MEXT_REF(m) > 1);
}
__private_extern__ caddr_t
m_bigalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
}

__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_BIGCL), p);
}
/* m_mbigget() add a 4KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_mbigget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_bigalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
__private_extern__ caddr_t
m_16kalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_16KCL), mcflags));
}

__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_16KCL), p);
}
/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_m16kget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_16kalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
3179 * "Move" mbuf pkthdr from "from" to "to".
3180 * "from" must have M_PKTHDR set, and "to" must be empty.
3183 m_copy_pkthdr(struct mbuf
*to
, struct mbuf
*from
)
3185 /* We will be taking over the tags of 'to' */
3186 if (to
->m_flags
& M_PKTHDR
)
3187 m_tag_delete_chain(to
, NULL
);
3188 to
->m_pkthdr
= from
->m_pkthdr
; /* especially tags */
3189 m_tag_init(from
); /* purge tags from src */
3190 to
->m_flags
= (from
->m_flags
& M_COPYFLAGS
) | (to
->m_flags
& M_EXT
);
3191 if ((to
->m_flags
& M_EXT
) == 0)
3192 to
->m_data
= to
->m_pktdat
;
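
/*
 * Illustrative use (hypothetical caller): when a packet is re-headed onto
 * a freshly allocated mbuf, the packet header and its tags travel with it:
 *
 *	struct mbuf *n = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (n != NULL)
 *		m_copy_pkthdr(n, m);	// n now owns m's pkthdr and tags
 */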
/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 */
int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
	if (to->m_flags & M_PKTHDR)
		m_tag_delete_chain(to, NULL);
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0)
		to->m_data = to->m_pktdat;
	to->m_pkthdr = from->m_pkthdr;
	m_tag_init(to);
	return (m_tag_copy_chain(to, from, how));
}
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if wantall is not set, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
 * are chained on the m_nextpkt field.  Any packets requested beyond this
 * are chained onto the last packet header's m_next field.  The size of
 * the cluster is controlled by the parameter bufsize.
 */
__private_extern__ struct mbuf *
m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
    int wait, int wantall, size_t bufsize)
{
3226 struct mbuf
**np
, *top
;
3227 unsigned int pnum
, needed
= *num_needed
;
3228 mcache_obj_t
*mp_list
= NULL
;
3229 int mcflags
= MSLEEPF(wait
);
3231 struct ext_ref
*rfa
;
3235 ASSERT(bufsize
== m_maxsize(MC_CL
) ||
3236 bufsize
== m_maxsize(MC_BIGCL
) ||
3237 bufsize
== m_maxsize(MC_16KCL
));
	/*
	 * Caller must first check for njcl because this
	 * routine is internal and not exposed/used via KPI.
	 */
	VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);

	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (mcflags & MCR_NOSLEEP))
		mcflags |= MCR_TRYHARD;
	/* Allocate the composite mbuf + cluster elements from the cache */
	if (bufsize == m_maxsize(MC_CL))
		cp = m_cache(MC_MBUF_CL);
	else if (bufsize == m_maxsize(MC_BIGCL))
		cp = m_cache(MC_MBUF_BIGCL);
	else
		cp = m_cache(MC_MBUF_16KCL);
	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3267 for (pnum
= 0; pnum
< needed
; pnum
++) {
3268 m
= (struct mbuf
*)mp_list
;
3269 mp_list
= mp_list
->obj_next
;
3271 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3272 cl
= m
->m_ext
.ext_buf
;
3275 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3276 VERIFY(MBUF_IS_COMPOSITE(m
));
3278 flag
= MEXT_FLAGS(m
);
3280 MBUF_INIT(m
, num_with_pkthdrs
, MT_DATA
);
3281 if (bufsize
== m_maxsize(MC_16KCL
)) {
3282 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3283 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3284 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3286 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3289 if (num_with_pkthdrs
> 0) {
3292 if (mac_mbuf_label_init(m
, wait
) != 0) {
3296 #endif /* MAC_NET */
3300 if (num_with_pkthdrs
> 0)
3305 ASSERT(pnum
!= *num_needed
|| mp_list
== NULL
);
3306 if (mp_list
!= NULL
)
3307 mcache_free_ext(cp
, mp_list
);
3310 mtype_stat_add(MT_DATA
, pnum
);
3311 mtype_stat_sub(MT_FREE
, pnum
);
3314 if (wantall
&& (pnum
!= *num_needed
)) {
/*
 * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
 * in the chain is called a segment.  If maxsegments is not null and the
 * value pointed to is not null, this specifies the maximum number of
 * segments for a chain of mbufs.  If maxsegments is zero or the value
 * pointed to is zero, the caller does not have any restriction on the
 * number of segments.  The actual number of segments of a mbuf chain is
 * returned in the value pointed to by maxsegments.
 */
3336 __private_extern__
struct mbuf
*
3337 m_allocpacket_internal(unsigned int *numlist
, size_t packetlen
,
3338 unsigned int *maxsegments
, int wait
, int wantall
, size_t wantsize
)
3340 struct mbuf
**np
, *top
, *first
= NULL
;
3341 size_t bufsize
, r_bufsize
;
3342 unsigned int num
= 0;
3343 unsigned int nsegs
= 0;
3344 unsigned int needed
, resid
;
3345 int mcflags
= MSLEEPF(wait
);
3346 mcache_obj_t
*mp_list
= NULL
, *rmp_list
= NULL
;
3347 mcache_t
*cp
= NULL
, *rcp
= NULL
;
3355 if (wantsize
== 0) {
3356 if (packetlen
<= MINCLSIZE
) {
3357 bufsize
= packetlen
;
3358 } else if (packetlen
> m_maxsize(MC_CL
)) {
3359 /* Use 4KB if jumbo cluster pool isn't available */
3360 if (packetlen
<= m_maxsize(MC_BIGCL
) || njcl
== 0)
3361 bufsize
= m_maxsize(MC_BIGCL
);
3363 bufsize
= m_maxsize(MC_16KCL
);
3365 bufsize
= m_maxsize(MC_CL
);
3367 } else if (wantsize
== m_maxsize(MC_CL
) ||
3368 wantsize
== m_maxsize(MC_BIGCL
) ||
3369 (wantsize
== m_maxsize(MC_16KCL
) && njcl
> 0)) {
3375 if (bufsize
<= MHLEN
) {
3377 } else if (bufsize
<= MINCLSIZE
) {
3378 if (maxsegments
!= NULL
&& *maxsegments
== 1) {
3379 bufsize
= m_maxsize(MC_CL
);
3384 } else if (bufsize
== m_maxsize(MC_16KCL
)) {
3386 nsegs
= ((packetlen
- 1) >> (PGSHIFT
+ 2)) + 1;
3387 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3388 nsegs
= ((packetlen
- 1) >> PGSHIFT
) + 1;
3390 nsegs
= ((packetlen
- 1) >> MCLSHIFT
) + 1;
3392 if (maxsegments
!= NULL
) {
3393 if (*maxsegments
&& nsegs
> *maxsegments
) {
3394 *maxsegments
= nsegs
;
3397 *maxsegments
= nsegs
;
	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (mcflags & MCR_NOSLEEP))
		mcflags |= MCR_TRYHARD;
	/*
	 * Simple case where all elements in the lists/chains are mbufs.
	 * Unless bufsize is greater than MHLEN, each segment chain is made
	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
	 * of 2 mbufs; the second one is used for the residual data, i.e.
	 * the remaining data that cannot fit into the first mbuf.
	 */
	if (bufsize <= MINCLSIZE) {
3417 /* Allocate the elements in one shot from the mbuf cache */
3418 ASSERT(bufsize
<= MHLEN
|| nsegs
== 2);
3419 cp
= m_cache(MC_MBUF
);
3420 needed
= mcache_alloc_ext(cp
, &mp_list
,
3421 (*numlist
) * nsegs
, mcflags
);
		/*
		 * The number of elements must be even if we are to use an
		 * mbuf (instead of a cluster) to store the residual data.
		 * If we couldn't allocate the requested number of mbufs,
		 * trim the number down (if it's odd) in order to avoid
		 * creating a partial segment chain.
		 */
		if (bufsize > MHLEN && (needed & 0x1))
			needed--;
3433 while (num
< needed
) {
3436 m
= (struct mbuf
*)mp_list
;
3437 mp_list
= mp_list
->obj_next
;
3440 MBUF_INIT(m
, 1, MT_DATA
);
3442 if (mac_init_mbuf(m
, wait
) != 0) {
3446 #endif /* MAC_NET */
3448 if (bufsize
> MHLEN
) {
3449 /* A second mbuf for this segment chain */
3450 m
->m_next
= (struct mbuf
*)mp_list
;
3451 mp_list
= mp_list
->obj_next
;
3452 ASSERT(m
->m_next
!= NULL
);
3454 MBUF_INIT(m
->m_next
, 0, MT_DATA
);
3460 ASSERT(num
!= *numlist
|| mp_list
== NULL
);
3463 mtype_stat_add(MT_DATA
, num
);
3464 mtype_stat_sub(MT_FREE
, num
);
3468 /* We've got them all; return to caller */
3469 if (num
== *numlist
)
	/*
	 * Complex cases where elements are made up of one or more composite
	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
	 * be illustrated as follows:
	 *
	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
	 *
	 * Every composite mbuf + cluster element comes from the intermediate
	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
	 * the last composite element will come from the MC_MBUF_CL cache,
	 * unless the residual data is larger than 2KB where we use the
	 * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
	 * data is defined as extra data beyond the first element that cannot
	 * fit into the previous element, i.e. there is no residual data if
	 * the chain only has 1 segment.
	 */
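	/*
	 * Worked example (numbers are illustrative only): for packetlen =
	 * 5000 and bufsize = 2048 (MC_CL), nsegs = ((5000 - 1) >> MCLSHIFT)
	 * + 1 = 3 and resid = 5000 % 2048 = 904.  Each chain is therefore
	 * two full 2KB composite elements plus one residual element, and
	 * since 904 <= 2048 the residual element also comes from the
	 * MC_MBUF_CL cache.
	 */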
3491 r_bufsize
= bufsize
;
3492 resid
= packetlen
> bufsize
? packetlen
% bufsize
: 0;
3494 /* There is residual data; figure out the cluster size */
3495 if (wantsize
== 0 && packetlen
> MINCLSIZE
) {
3497 * Caller didn't request that all of the segments
3498 * in the chain use the same cluster size; use the
3499 * smaller of the cluster sizes.
3501 if (njcl
> 0 && resid
> m_maxsize(MC_BIGCL
))
3502 r_bufsize
= m_maxsize(MC_16KCL
);
3503 else if (resid
> m_maxsize(MC_CL
))
3504 r_bufsize
= m_maxsize(MC_BIGCL
);
3506 r_bufsize
= m_maxsize(MC_CL
);
3508 /* Use the same cluster size as the other segments */
		/*
		 * Attempt to allocate composite mbuf + cluster elements for
		 * the residual data in each chain; record the number of such
		 * elements that can be allocated so that we know how many
		 * segment chains we can afford to create.
		 */
		if (r_bufsize <= m_maxsize(MC_CL))
			rcp = m_cache(MC_MBUF_CL);
		else if (r_bufsize <= m_maxsize(MC_BIGCL))
			rcp = m_cache(MC_MBUF_BIGCL);
		else
			rcp = m_cache(MC_MBUF_16KCL);
		needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3532 /* This is temporarily reduced for calculation */
	/*
	 * Attempt to allocate the rest of the composite mbuf + cluster
	 * elements for the number of segment chains that we need.
	 */
	if (bufsize <= m_maxsize(MC_CL))
		cp = m_cache(MC_MBUF_CL);
	else if (bufsize <= m_maxsize(MC_BIGCL))
		cp = m_cache(MC_MBUF_BIGCL);
	else
		cp = m_cache(MC_MBUF_16KCL);
	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
	/* Round it down to avoid creating a partial segment chain */
	needed = (needed / nsegs) * nsegs;

	/*
	 * We're about to construct the chain(s); take into account
	 * the number of segments we have created above to hold the
	 * residual data for each chain, as well as restore the
	 * original count of segments per chain.
	 */
	needed += needed / nsegs;
3569 struct ext_ref
*rfa
;
3574 if (nsegs
== 1 || (num
% nsegs
) != 0 || resid
== 0) {
3575 m
= (struct mbuf
*)mp_list
;
3576 mp_list
= mp_list
->obj_next
;
3578 m
= (struct mbuf
*)rmp_list
;
3579 rmp_list
= rmp_list
->obj_next
;
3582 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3583 VERIFY(m
->m_ext
.ext_free
== NULL
||
3584 m
->m_ext
.ext_free
== m_bigfree
||
3585 m
->m_ext
.ext_free
== m_16kfree
);
3587 cl
= m
->m_ext
.ext_buf
;
3590 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3591 VERIFY(MBUF_IS_COMPOSITE(m
));
3593 flag
= MEXT_FLAGS(m
);
3595 pkthdr
= (nsegs
== 1 || (num
% nsegs
) == 1);
3598 MBUF_INIT(m
, pkthdr
, MT_DATA
);
3599 if (m
->m_ext
.ext_free
== m_16kfree
) {
3600 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3601 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3602 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3604 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3607 if (pkthdr
&& mac_init_mbuf(m
, wait
) != 0) {
3612 #endif /* MAC_NET */
3615 if ((num
% nsegs
) == 0)
3616 np
= &first
->m_nextpkt
;
3625 mtype_stat_add(MT_DATA
, num
);
3626 mtype_stat_sub(MT_FREE
, num
);
3631 /* We've got them all; return to caller */
3632 if (num
== *numlist
) {
3633 ASSERT(mp_list
== NULL
&& rmp_list
== NULL
);
3638 /* Free up what's left of the above */
3639 if (mp_list
!= NULL
)
3640 mcache_free_ext(cp
, mp_list
);
3641 if (rmp_list
!= NULL
)
3642 mcache_free_ext(rcp
, rmp_list
);
3643 if (wantall
&& top
!= NULL
) {
/*
 * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on receive ring.
 */
__private_extern__ struct mbuf *
m_getpacket_how(int wait)
{
	unsigned int num_needed = 1;

	return (m_getpackets_internal(&num_needed, 1, wait, 1,
	    m_maxsize(MC_CL)));
}
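
/*
 * Illustrative RX-refill loop in a driver (hypothetical; only
 * m_getpacket_how() is defined here, the ring helpers are placeholders
 * for driver-specific code):
 *
 *	while (ring_has_empty_slots(ring)) {
 *		struct mbuf *m = m_getpacket_how(M_DONTWAIT);
 *		if (m == NULL)
 *			break;		// retry on the next interrupt
 *		ring_attach_buffer(ring, m);
 *	}
 */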
/*
 * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on receive ring.
 */
struct mbuf *
m_getpacket(void)
{
	unsigned int num_needed = 1;

	return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
	    m_maxsize(MC_CL)));
}
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if this can't be met, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
 * are chained on the m_nextpkt field.  Any packets requested beyond this are
 * chained onto the last packet header's m_next field.
 */
struct mbuf *
m_getpackets(int num_needed, int num_with_pkthdrs, int how)
{
	unsigned int n = num_needed;

	return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
	    m_maxsize(MC_CL)));
}
3694 * Return a list of mbuf hdrs set up as packet hdrs chained together
3695 * on the m_nextpkt field
3698 m_getpackethdrs(int num_needed
, int how
)
3701 struct mbuf
**np
, *top
;
3706 while (num_needed
--) {
3707 m
= _M_RETRYHDR(how
, MT_DATA
);
/*
 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
 * of mbuf packets freed.  Used by the drivers.
 */
int
m_freem_list(struct mbuf *m)
{
3725 struct mbuf
*nextpkt
;
3726 mcache_obj_t
*mp_list
= NULL
;
3727 mcache_obj_t
*mcl_list
= NULL
;
3728 mcache_obj_t
*mbc_list
= NULL
;
3729 mcache_obj_t
*m16k_list
= NULL
;
3730 mcache_obj_t
*m_mcl_list
= NULL
;
3731 mcache_obj_t
*m_mbc_list
= NULL
;
3732 mcache_obj_t
*m_m16k_list
= NULL
;
3733 mcache_obj_t
*ref_list
= NULL
;
3735 int mt_free
= 0, mt_data
= 0, mt_header
= 0, mt_soname
= 0, mt_tag
= 0;
3740 nextpkt
= m
->m_nextpkt
;
3741 m
->m_nextpkt
= NULL
;
3744 struct mbuf
*next
= m
->m_next
;
3745 mcache_obj_t
*o
, *rfa
;
3746 u_int32_t refcnt
, flags
;
3748 if (m
->m_type
== MT_FREE
)
3749 panic("m_free: freeing an already freed mbuf");
3751 if (m
->m_type
!= MT_FREE
)
3754 if (m
->m_flags
& M_PKTHDR
) {
3755 m_tag_delete_chain(m
, NULL
);
3758 if (!(m
->m_flags
& M_EXT
))
3761 o
= (mcache_obj_t
*)m
->m_ext
.ext_buf
;
3762 refcnt
= m_decref(m
);
3763 flags
= MEXT_FLAGS(m
);
3764 if (refcnt
== 0 && flags
== 0) {
3765 if (m
->m_ext
.ext_free
== NULL
) {
3766 o
->obj_next
= mcl_list
;
3768 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3769 o
->obj_next
= mbc_list
;
3771 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
3772 o
->obj_next
= m16k_list
;
3775 (*(m
->m_ext
.ext_free
))((caddr_t
)o
,
3779 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
3780 rfa
->obj_next
= ref_list
;
3783 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
3784 VERIFY(m
->m_type
!= MT_FREE
);
3786 * Amortize the costs of atomic operations
3787 * by doing them at the end, if possible.
3789 if (m
->m_type
== MT_DATA
)
3791 else if (m
->m_type
== MT_HEADER
)
3793 else if (m
->m_type
== MT_SONAME
)
3795 else if (m
->m_type
== MT_TAG
)
3798 mtype_stat_dec(m
->m_type
);
3800 m
->m_type
= MT_FREE
;
3803 m
->m_next
= m
->m_nextpkt
= NULL
;
3805 /* "Free" into the intermediate cache */
3806 o
= (mcache_obj_t
*)m
;
3807 if (m
->m_ext
.ext_free
== NULL
) {
3808 o
->obj_next
= m_mcl_list
;
3810 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3811 o
->obj_next
= m_mbc_list
;
3814 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3815 o
->obj_next
= m_m16k_list
;
3823 * Amortize the costs of atomic operations
3824 * by doing them at the end, if possible.
3826 if (m
->m_type
== MT_DATA
)
3828 else if (m
->m_type
== MT_HEADER
)
3830 else if (m
->m_type
== MT_SONAME
)
3832 else if (m
->m_type
== MT_TAG
)
3834 else if (m
->m_type
!= MT_FREE
)
3835 mtype_stat_dec(m
->m_type
);
3837 m
->m_type
= MT_FREE
;
3838 m
->m_flags
= m
->m_len
= 0;
3839 m
->m_next
= m
->m_nextpkt
= NULL
;
3841 ((mcache_obj_t
*)m
)->obj_next
= mp_list
;
3842 mp_list
= (mcache_obj_t
*)m
;
3851 mtype_stat_add(MT_FREE
, mt_free
);
3853 mtype_stat_sub(MT_DATA
, mt_data
);
3855 mtype_stat_sub(MT_HEADER
, mt_header
);
3857 mtype_stat_sub(MT_SONAME
, mt_soname
);
3859 mtype_stat_sub(MT_TAG
, mt_tag
);
	if (mp_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	if (mcl_list != NULL)
		mcache_free_ext(m_cache(MC_CL), mcl_list);
	if (mbc_list != NULL)
		mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
	if (m16k_list != NULL)
		mcache_free_ext(m_cache(MC_16KCL), m16k_list);
	if (m_mcl_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
	if (m_mbc_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
	if (m_m16k_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
	if (ref_list != NULL)
		mcache_free_ext(ref_cache, ref_list);
void
m_freem(struct mbuf *m)
{
	while (m != NULL)
		m = m_free(m);
}
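
/*
 * Illustrative TX-completion path in a driver (hypothetical): packets
 * that were queued via m_nextpkt can be released in one call, which
 * batches the per-class frees done above.
 *
 *	struct mbuf *done_chain = ring_collect_completed(ring);
 *	if (done_chain != NULL)
 *		(void) m_freem_list(done_chain);
 *
 * ring_collect_completed() is a placeholder for driver-specific code.
 */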
/*
 * Mbuffer utility routines.
 */

/*
 * Compute the amount of space available before the current start
 * of data in an mbuf.
 */
int
m_leadingspace(struct mbuf *m)
{
	if (m->m_flags & M_EXT) {
		if (MCLHASREFERENCE(m))
			return (0);
		return (m->m_data - m->m_ext.ext_buf);
	}
	if (m->m_flags & M_PKTHDR)
		return (m->m_data - m->m_pktdat);
	return (m->m_data - m->m_dat);
}
/*
 * Compute the amount of space available after the end of data in an mbuf.
 */
int
m_trailingspace(struct mbuf *m)
{
	if (m->m_flags & M_EXT) {
		if (MCLHASREFERENCE(m))
			return (0);
		return (m->m_ext.ext_buf + m->m_ext.ext_size -
		    (m->m_data + m->m_len));
	}
	return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
}
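
/*
 * Illustrative use (hypothetical caller): before appending 'len' bytes in
 * place, check that the tail of the mbuf can actually hold them.
 *
 *	if (m_trailingspace(m) >= len) {
 *		bcopy(src, MTOD(m, caddr_t) + m->m_len, len);
 *		m->m_len += len;
 *	}
 */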
/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
 * copy junk along.  Does not adjust packet header length.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	_MGET(mn, how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR) {
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
/*
 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
 * chain, copy junk along, and adjust length.
 */
struct mbuf *
m_prepend_2(struct mbuf *m, int len, int how)
{
	if (M_LEADINGSPACE(m) >= len) {
		m->m_data -= len;
		m->m_len += len;
	} else {
		m = m_prepend(m, len, how);
	}
	if ((m) && (m->m_flags & M_PKTHDR))
		m->m_pkthdr.len += len;
	return (m);
}
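
/*
 * Typical protocol-layer pattern this mirrors (illustrative; M_PREPEND is
 * the macro form defined in <sys/mbuf.h>):
 *
 *	M_PREPEND(m, sizeof (struct ip), M_DONTWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 * i.e. the chain pointer must be reloaded and checked after prepending,
 * since the operation can fail and consume the original chain.
 */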
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
 */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, *mhdr = NULL, **np;
	int off = off0;
3983 if (off
< 0 || len
< 0)
3984 panic("m_copym: invalid offset %d or len %d", off
, len
);
3986 if (off
== 0 && (m
->m_flags
& M_PKTHDR
)) {
3991 while (off
>= m
->m_len
) {
3992 if (m
->m_next
== NULL
)
3993 panic("m_copym: invalid mbuf chain");
4002 if (len
!= M_COPYALL
)
4003 panic("m_copym: len != M_COPYALL");
4007 n
= _M_RETRY(wait
, m
->m_type
);
4014 M_COPY_PKTHDR(n
, mhdr
);
4015 if (len
== M_COPYALL
)
4016 n
->m_pkthdr
.len
-= off0
;
4018 n
->m_pkthdr
.len
= len
;
4021 if (len
== M_COPYALL
) {
4022 if (MIN(len
, (m
->m_len
- off
)) == len
) {
4023 printf("m->m_len %d - off %d = %d, %d\n",
4024 m
->m_len
, off
, m
->m_len
- off
,
4025 MIN(len
, (m
->m_len
- off
)));
4028 n
->m_len
= MIN(len
, (m
->m_len
- off
));
4029 if (n
->m_len
== M_COPYALL
) {
4030 printf("n->m_len == M_COPYALL, fixing\n");
4033 if (m
->m_flags
& M_EXT
) {
4034 n
->m_ext
= m
->m_ext
;
4036 n
->m_data
= m
->m_data
+ off
;
4037 n
->m_flags
|= M_EXT
;
4039 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
4040 (unsigned)n
->m_len
);
4042 if (len
!= M_COPYALL
)
/*
 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
 * within this routine.  Also, the last mbuf and offset accessed are passed
 * out and can be passed back in to avoid having to rescan the entire mbuf
 * list (normally hung off of the socket).
 */
4067 m_copym_with_hdrs(struct mbuf
*m
, int off0
, int len0
, int wait
,
4068 struct mbuf
**m_last
, int *m_off
)
4070 struct mbuf
*n
, **np
= NULL
;
4071 int off
= off0
, len
= len0
;
4072 struct mbuf
*top
= NULL
;
4073 int mcflags
= MSLEEPF(wait
);
4076 mcache_obj_t
*list
= NULL
;
4079 if (off
== 0 && (m
->m_flags
& M_PKTHDR
))
4082 if (*m_last
!= NULL
) {
4086 while (off
>= m
->m_len
) {
4096 len
-= MIN(len
, (n
->m_len
- ((needed
== 1) ? off
: 0)));
	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;
4110 if (mcache_alloc_ext(m_cache(MC_MBUF
), &list
, needed
,
4116 n
= (struct mbuf
*)list
;
4117 list
= list
->obj_next
;
4118 ASSERT(n
!= NULL
&& m
!= NULL
);
4120 type
= (top
== NULL
) ? MT_HEADER
: m
->m_type
;
4121 MBUF_INIT(n
, (top
== NULL
), type
);
4123 if (top
== NULL
&& mac_mbuf_label_init(n
, wait
) != 0) {
4124 mtype_stat_inc(MT_HEADER
);
4125 mtype_stat_dec(MT_FREE
);
4129 #endif /* MAC_NET */
4141 M_COPY_PKTHDR(n
, m
);
4142 n
->m_pkthdr
.len
= len
;
4145 n
->m_len
= MIN(len
, (m
->m_len
- off
));
4147 if (m
->m_flags
& M_EXT
) {
4148 n
->m_ext
= m
->m_ext
;
4150 n
->m_data
= m
->m_data
+ off
;
4151 n
->m_flags
|= M_EXT
;
4153 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
4154 (unsigned)n
->m_len
);
4159 if ((off
+ n
->m_len
) == m
->m_len
) {
4160 *m_last
= m
->m_next
;
4164 *m_off
= off
+ n
->m_len
;
4173 mtype_stat_inc(MT_HEADER
);
4174 mtype_stat_add(type
, needed
);
4175 mtype_stat_sub(MT_FREE
, needed
+ 1);
4177 ASSERT(list
== NULL
);
4182 mcache_free_ext(m_cache(MC_MBUF
), list
);
/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(struct mbuf *m, int off, int len, void *vp)
{
	unsigned count;
	char *cp = vp;

	if (off < 0 || len < 0)
		panic("m_copydata: invalid offset %d or len %d", off, len);
4204 panic("m_copydata: invalid mbuf chain");
4212 panic("m_copydata: invalid mbuf chain");
4213 count
= MIN(m
->m_len
- off
, len
);
4214 bcopy(MTOD(m
, caddr_t
) + off
, cp
, count
);
/*
 * Concatenate mbuf chain n to m.  Both chains must be of the same type
 * (e.g. MT_DATA).  Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
4232 if ((m
->m_flags
& M_EXT
) ||
4233 m
->m_data
+ m
->m_len
+ n
->m_len
>= &m
->m_dat
[MLEN
]) {
4234 /* just join the two chains */
4238 /* splat the data from one into the other */
4239 bcopy(MTOD(n
, caddr_t
), MTOD(m
, caddr_t
) + m
->m_len
,
4241 m
->m_len
+= n
->m_len
;
4247 m_adj(struct mbuf
*mp
, int req_len
)
4253 if ((m
= mp
) == NULL
)
4259 while (m
!= NULL
&& len
> 0) {
4260 if (m
->m_len
<= len
) {
4271 if (m
->m_flags
& M_PKTHDR
)
4272 m
->m_pkthdr
.len
-= (req_len
- len
);
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
4285 if (m
->m_next
== (struct mbuf
*)0)
4289 if (m
->m_len
>= len
) {
4292 if (m
->m_flags
& M_PKTHDR
)
4293 m
->m_pkthdr
.len
-= len
;
4300 * Correct length for chain is "count".
4301 * Find the mbuf with last data, adjust its length,
4302 * and toss data from remaining mbufs on chain.
4305 if (m
->m_flags
& M_PKTHDR
)
4306 m
->m_pkthdr
.len
= count
;
4307 for (; m
; m
= m
->m_next
) {
4308 if (m
->m_len
>= count
) {
4314 while ((m
= m
->m_next
))
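/*
 * Usage sketch (illustrative only): strip a link-layer header from the
 * front and a trailer from the end of a packet; a negative length trims
 * from the tail.  The sizes are hypothetical.
 */
#if 0
static void
example_strip(struct mbuf *m, int hdrlen, int trailerlen)
{
	m_adj(m, hdrlen);	/* drop hdrlen bytes from the head */
	m_adj(m, -trailerlen);	/* drop trailerlen bytes from the tail */
}
#endif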
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns null on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
    struct mbuf *m;
    int count;
    int space;

    /*
     * If first mbuf has no cluster, and has room for len bytes
     * without shifting current data, pullup into it,
     * otherwise allocate a new mbuf to prepend to the chain.
     */
    if ((n->m_flags & M_EXT) == 0 &&
        n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
        if (n->m_len >= len)
            return (n);
        m = n;
        n = n->m_next;
        len -= m->m_len;
    } else {
        if (len > MHLEN)
            goto bad;
        _MGET(m, M_DONTWAIT, n->m_type);
        if (m == 0)
            goto bad;
        m->m_len = 0;
        if (n->m_flags & M_PKTHDR) {
            M_COPY_PKTHDR(m, n);
            n->m_flags &= ~M_PKTHDR;
        }
    }
    space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
    do {
        count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
        bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
            (unsigned)count);
        len -= count;
        m->m_len += count;
        n->m_len -= count;
        space -= count;
        if (n->m_len)
            n->m_data += count;
        else
            n = m_free(n);
    } while (len > 0 && n);
    if (len > 0) {
        (void) m_free(m);
        goto bad;
    }
    m->m_next = n;
    return (m);
bad:
    m_freem(n);
    return (0);
}
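/*
 * Usage sketch (illustrative only): the classic pattern in protocol input
 * paths, making sure a fixed-size header is contiguous before casting.
 * "struct example_hdr" is hypothetical.
 */
#if 0
struct example_hdr { u_int32_t word0, word1; };

static struct mbuf *
example_pullup(struct mbuf *m)
{
	if (m->m_len < (int)sizeof (struct example_hdr) &&
	    (m = m_pullup(m, sizeof (struct example_hdr))) == NULL)
		return (NULL);	/* the chain was freed by m_pullup */
	/* mtod(m, struct example_hdr *) is now safe to dereference */
	return (m);
}
#endif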
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
    return (m_split0(m0, len0, wait, 1));
}

static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
    struct mbuf *m, *n;
    unsigned len = len0, remain;

    for (m = m0; m && len > m->m_len; m = m->m_next)
        len -= m->m_len;
    if (m == NULL)
        return (NULL);
    remain = m->m_len - len;
    if (copyhdr && (m0->m_flags & M_PKTHDR)) {
        _MGETHDR(n, wait, m0->m_type);
        if (n == NULL)
            return (NULL);
        n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
        n->m_pkthdr.len = m0->m_pkthdr.len - len0;
        m0->m_pkthdr.len = len0;
        if (m->m_flags & M_EXT)
            goto extpacket;
        if (remain > MHLEN) {
            /* m can't be the lead packet */
            MH_ALIGN(n, 0);
            n->m_next = m_split(m, len, wait);
            if (n->m_next == NULL) {
                (void) m_free(n);
                return (NULL);
            } else
                return (n);
        } else
            MH_ALIGN(n, remain);
    } else if (remain == 0) {
        n = m->m_next;
        m->m_next = NULL;
        return (n);
    } else {
        _MGET(n, wait, m->m_type);
        if (n == NULL)
            return (NULL);
        /* ... */
    }
extpacket:
    if (m->m_flags & M_EXT) {
        n->m_flags |= M_EXT;
        n->m_ext = m->m_ext;
        n->m_data = m->m_data + len;
    } else {
        bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
    }
    n->m_len = remain;
    m->m_len = len;
    n->m_next = m->m_next;
    m->m_next = NULL;
    return (n);
}
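/*
 * Usage sketch (illustrative only): split a packet after "fraglen" bytes;
 * on success the original chain keeps the head and the returned chain
 * holds the tail, each with its m_pkthdr.len adjusted.
 */
#if 0
static struct mbuf *
example_split(struct mbuf *m, int fraglen)
{
	struct mbuf *tail;

	if ((tail = m_split(m, fraglen, M_DONTWAIT)) == NULL)
		return (NULL);	/* "m" is left unchanged on failure */
	return (tail);
}
#endif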
/*
 * Routine to copy from device local memory into mbufs.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
{
    struct mbuf *m;
    struct mbuf *top = NULL, **mp = &top;
    int off = off0, len;
    char *cp;
    char *epkt;

    cp = buf;
    epkt = cp + totlen;
    if (off) {
        /*
         * If 'off' is non-zero, packet is trailer-encapsulated,
         * so we have to skip the type and length fields.
         */
        cp += off + 2 * sizeof (u_int16_t);
        totlen -= 2 * sizeof (u_int16_t);
    }
    _MGETHDR(m, M_DONTWAIT, MT_DATA);
    if (m == NULL)
        return (NULL);
    m->m_pkthdr.rcvif = ifp;
    m->m_pkthdr.len = totlen;
    m->m_len = MHLEN;

    while (totlen > 0) {
        if (top != NULL) {
            _MGET(m, M_DONTWAIT, MT_DATA);
            if (m == NULL) {
                m_freem(top);
                return (NULL);
            }
            m->m_len = MLEN;
        }
        len = MIN(totlen, epkt - cp);
        if (len >= MINCLSIZE) {
            MCLGET(m, M_DONTWAIT);
            if (m->m_flags & M_EXT) {
                m->m_len = len = MIN(len, m_maxsize(MC_CL));
            } else {
                /* give up when it's out of cluster mbufs */
                m_freem(m);
                m_freem(top);
                return (NULL);
            }
        } else {
            /*
             * Place initial small packet/header at end of mbuf.
             */
            if (len < m->m_len) {
                if (top == NULL &&
                    len + max_linkhdr <= m->m_len)
                    m->m_data += max_linkhdr;
                m->m_len = len;
            }
        }
        if (copy)
            copy(cp, MTOD(m, caddr_t), (unsigned)len);
        else
            bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
        cp += len;
        *mp = m;
        mp = &m->m_next;
        totlen -= len;
        if (cp == epkt)
            cp = buf;
    }
    return (top);
}
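/*
 * Usage sketch (illustrative only): a driver receive path handing a
 * contiguous device buffer to the stack.  "rxbuf", "rxlen" and "ifp" are
 * hypothetical; passing a NULL copy routine makes m_devget() fall back
 * to bcopy().
 */
#if 0
static struct mbuf *
example_rx_to_mbuf(char *rxbuf, int rxlen, struct ifnet *ifp)
{
	return (m_devget(rxbuf, rxlen, 0, ifp, NULL));
}
#endif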
void
mbuf_growth_aggressive(void)
{
    lck_mtx_lock(mbuf_mlock);
    /*
     * Don't start to grow the pool until we are at least
     * 1/2 (50%) of current total capacity.
     */
    mbuf_gscale = MB_GROWTH_AGGRESSIVE;
    lck_mtx_unlock(mbuf_mlock);
}

void
mbuf_growth_normal(void)
{
    lck_mtx_lock(mbuf_mlock);
    /*
     * Don't start to grow the pool until we are at least
     * 15/16 (93.75%) of current total capacity.
     */
    mbuf_gscale = MB_GROWTH_NORMAL;
    lck_mtx_unlock(mbuf_mlock);
}
/*
 * Cluster freelist allocation check.
 */
static int
m_howmany(int num, size_t bufsize)
{
    int i = 0, j = 0;
    u_int32_t m_clusters, m_bigclusters, m_16kclusters;
    u_int32_t m_clfree, m_bigclfree, m_16kclfree;
    u_int32_t s = mbuf_gscale;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    m_clusters = m_total(MC_CL);
    m_bigclusters = m_total(MC_BIGCL);
    m_16kclusters = m_total(MC_16KCL);
    m_clfree = m_infree(MC_CL);
    m_bigclfree = m_infree(MC_BIGCL);
    m_16kclfree = m_infree(MC_16KCL);

    /* Bail if we've maxed out the mbuf memory map */
    if ((bufsize != m_maxsize(MC_16KCL) &&
        (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
        (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
        (m_16kclusters << 3) >= njcl)) {
        if (bufsize == MCLBYTES && num > m_clfree) {
            printf("m_howmany - out of small clusters, "
                "%d short\n", num - mbstat.m_clfree);
        }
        return (0);
    }

    if (bufsize == m_maxsize(MC_CL)) {
        /* Under minimum */
        if (m_clusters < MINCL)
            return (MINCL - m_clusters);
        /* Too few (free < threshold) and not over maximum */
        if (m_clusters < m_maxlimit(MC_CL)) {
            if (m_clfree >= MCL_LOWAT)
                return (0);
            if (num >= m_clfree)
                i = num - m_clfree;
            if (((m_clusters + num) >> s) > m_clfree)
                j = ((m_clusters + num) >> s) - m_clfree;
            i = MAX(i, j);
            if (i + m_clusters >= m_maxlimit(MC_CL))
                i = m_maxlimit(MC_CL) - m_clusters;
        }
        VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
    } else if (bufsize == m_maxsize(MC_BIGCL)) {
        /* Under minimum */
        if (m_bigclusters < MINBIGCL)
            return (MINBIGCL - m_bigclusters);
        /* Too few (free < 1/16 total) and not over maximum */
        if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
            if (m_bigclfree >= MBIGCL_LOWAT)
                return (0);
            if (num >= m_bigclfree)
                i = num - m_bigclfree;
            if (((m_bigclusters + num) >> 4) > m_bigclfree)
                j = ((m_bigclusters + num) >> 4) - m_bigclfree;
            i = MAX(i, j);
            if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
                i = m_maxlimit(MC_BIGCL) - m_bigclusters;
        }
        VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
    } else {
        /* 16K clusters: under minimum */
        if (m_16kclusters < MIN16KCL)
            return (MIN16KCL - m_16kclusters);
        /* Too few (free < 1/16 total) and not over maximum */
        if (m_16kclusters < m_maxlimit(MC_16KCL)) {
            if (m_16kclfree >= M16KCL_LOWAT)
                return (0);
            if (num >= m_16kclfree)
                i = num - m_16kclfree;
            if (((m_16kclusters + num) >> 4) > m_16kclfree)
                j = ((m_16kclusters + num) >> 4) - m_16kclfree;
            i = MAX(i, j);
            if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
                i = m_maxlimit(MC_16KCL) - m_16kclusters;
        }
        VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
    }

    return (i);
}
/*
 * Return the number of bytes in the mbuf chain, m.
 */
unsigned int
m_length(struct mbuf *m)
{
    struct mbuf *m0;
    unsigned int pktlen;

    if (m->m_flags & M_PKTHDR)
        return (m->m_pkthdr.len);

    pktlen = 0;
    for (m0 = m; m0 != NULL; m0 = m0->m_next)
        pktlen += m0->m_len;
    return (pktlen);
}
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
{
    int error;
    struct mbuf *origm = m0;

    if (m0 == NULL)
        return;

    error = m_copyback0(&m0, off, len, cp,
        M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);

    if (error != 0 || (m0 != NULL && origm != m0))
        panic("m_copyback");
}
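/*
 * Usage sketch (illustrative only): overwrite a 2-byte field at a given
 * offset in a packet; the chain is extended if it is too short.  The
 * offset is hypothetical.
 */
#if 0
static void
example_poke_field(struct mbuf *m, int off, u_int16_t val)
{
	m_copyback(m, off, sizeof (val), &val);
}
#endif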
struct mbuf *
m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
{
    int error;

    /* don't support chain expansion */
    VERIFY(off + len <= m_length(m0));

    error = m_copyback0(&m0, off, len, cp,
        M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
    if (error) {
        /*
         * no way to recover from partial success.
         * just free the chain.
         */
        m_freem(m0);
        return (NULL);
    }
    return (m0);
}
/*
 * m_makewritable: ensure the specified range writable.
 */
int
m_makewritable(struct mbuf **mp, int off, int len, int how)
{
    int error;
#if DIAGNOSTIC
    struct mbuf *n;
    int origlen, reslen;

    origlen = m_length(*mp);
#endif /* DIAGNOSTIC */

#if 0 /* M_COPYALL is large enough */
    if (len == M_COPYALL)
        len = m_length(*mp) - off; /* XXX */
#endif

    error = m_copyback0(mp, off, len, NULL,
        M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

#if DIAGNOSTIC
    reslen = 0;
    for (n = *mp; n; n = n->m_next)
        reslen += n->m_len;
    if (origlen != reslen)
        panic("m_makewritable: length changed");
    if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
        panic("m_makewritable: inconsist");
#endif /* DIAGNOSTIC */

    return (error);
}
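/*
 * Usage sketch (illustrative only): before modifying a header in place,
 * make sure the bytes are not shared with another chain (copy-on-write).
 * The mbuf pointer may change, which is why a pointer to the pointer is
 * passed in.
 */
#if 0
static int
example_make_hdr_writable(struct mbuf **mp, int hdrlen)
{
	return (m_makewritable(mp, 0, hdrlen, M_DONTWAIT));
}
#endif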
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
    int mlen;
    struct mbuf *m, *n;
    struct mbuf **mp;
    int totlen = 0;
    const char *cp = vp;

    VERIFY(mp0 != NULL);
    VERIFY(*mp0 != NULL);
    VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
    VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

    /*
     * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
     * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
     */
    VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);

    mp = mp0;
    m = *mp;
    while (off > (mlen = m->m_len)) {
        off -= mlen;
        totlen += mlen;
        if (m->m_next == NULL) {
            int tspace;

            if (!(flags & M_COPYBACK0_EXTEND))
                goto out;

            /*
             * try to make some space at the end of "m".
             */
            if (off + len >= MINCLSIZE &&
                !(m->m_flags & M_EXT) && m->m_len == 0) {
                MCLGET(m, how);
            }
            tspace = M_TRAILINGSPACE(m);
            if (tspace > 0) {
                tspace = MIN(tspace, off + len);
                bzero(mtod(m, char *) + m->m_len,
                    MIN(off, tspace));
                /* ... */
                continue;
            }

            /*
             * need to allocate an mbuf.
             */
            if (off + len >= MINCLSIZE) {
                n = m_getcl(how, m->m_type, 0);
            } else {
                n = _M_GET(how, m->m_type);
            }
            if (n == NULL)
                goto out;
            n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
            bzero(mtod(n, char *), MIN(n->m_len, off));
            m->m_next = n;
        }
        mp = &m->m_next;
        m = m->m_next;
    }
    while (len > 0) {
        mlen = m->m_len - off;
        if (mlen != 0 && m_mclhasreference(m)) {
            char *datap;
            int eatlen;

            /*
             * this mbuf is read-only.
             * allocate a new writable mbuf and try again.
             */
#if defined(DIAGNOSTIC)
            if (!(flags & M_COPYBACK0_COW))
                panic("m_copyback0: read-only");
#endif /* defined(DIAGNOSTIC) */

            /*
             * if we're going to write into the middle of
             * a mbuf, split it first.
             */
            if (off > 0 && len < mlen) {
                n = m_split0(m, off, how, 0);
                /* ... */
            }

            /*
             * XXX TODO coalesce into the trailingspace of
             * the previous mbuf when possible.
             */

            /*
             * allocate a new mbuf.  copy packet header if needed.
             */
            n = _M_GET(how, m->m_type);
            /* ... */
            if (off == 0 && (m->m_flags & M_PKTHDR)) {
                M_COPY_PKTHDR(n, m);
                /* ... */
            } else {
                if (len >= MINCLSIZE)
                    MCLGET(n, M_DONTWAIT);
                n->m_len =
                    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
            }

            /*
             * free the region which has been overwritten.
             * copying data from old mbufs if requested.
             */
            if (flags & M_COPYBACK0_PRESERVE)
                datap = mtod(n, char *);
            else
                datap = NULL;
            eatlen = n->m_len;
            VERIFY(off == 0 || eatlen >= mlen);
            if (off > 0) {
                VERIFY(len >= mlen);
                if (datap != NULL) {
                    m_copydata(m, off, mlen, datap);
                    datap += mlen;
                }
                /* ... */
            }
            while (m != NULL && m_mclhasreference(m) &&
                n->m_type == m->m_type && eatlen > 0) {
                mlen = MIN(eatlen, m->m_len);
                if (datap != NULL) {
                    m_copydata(m, 0, mlen, datap);
                    datap += mlen;
                }
                m->m_data += mlen;
                m->m_len -= mlen;
                eatlen -= mlen;
                if (m->m_len == 0)
                    *mp = m = m_free(m);
            }
            /* ... splice "n" into the chain before "m" ... */
            continue;
        }
        mlen = MIN(mlen, len);
        if (flags & M_COPYBACK0_COPYBACK) {
            bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
            cp += mlen;
        }
        len -= mlen;
        mlen += off;
        off = 0;
        totlen += mlen;
        if (len == 0)
            break;
        if (m->m_next == NULL) {
            /* ... extend the chain, as above ... */
        }
        mp = &m->m_next;
        m = m->m_next;
    }
out:
    if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
        VERIFY(flags & M_COPYBACK0_EXTEND);
        m->m_pkthdr.len = totlen;
    }

    return (0);
}
char *
mcl_to_paddr(char *addr)
{
    vm_offset_t base_phys;

    if (!MBUF_IN_MAP(addr))
        return (NULL);
    base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];

    if (base_phys == 0)
        return (NULL);
    return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
}
/*
 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
 * And really copy the thing.  That way, we don't "precompute" checksums
 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
 * don't take up too much room in the sockbuf (cf. sbspace()).
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
    struct mbuf *n, **np;
    struct mbuf *top;
    int copyhdr = 0;

    np = &top;
    top = NULL;
    if (m->m_flags & M_PKTHDR)
        copyhdr = 1;

    /*
     * Quick check: if we have one mbuf and its data fits in an
     * mbuf with packet header, just copy and go.
     */
    if (m->m_next == NULL) {
        /* Then just move the data into an mbuf and be done... */
        if (copyhdr) {
            if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
                if ((n = _M_GETHDR(how, m->m_type)) == NULL)
                    return (NULL);
                n->m_len = m->m_len;
                m_dup_pkthdr(n, m, how);
                bcopy(m->m_data, n->m_data, m->m_len);
                return (n);
            }
        } else if (m->m_len <= MLEN) {
            if ((n = _M_GET(how, m->m_type)) == NULL)
                return (NULL);
            bcopy(m->m_data, n->m_data, m->m_len);
            n->m_len = m->m_len;
            return (n);
        }
    }
    while (m != NULL) {
#if BLUE_DEBUG
        kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
            m->m_data);
#endif
        if (copyhdr)
            n = _M_GETHDR(how, m->m_type);
        else
            n = _M_GET(how, m->m_type);
        if (n == NULL)
            goto nospace;
        if (m->m_flags & M_EXT) {
            if (m->m_len <= m_maxsize(MC_CL))
                n = m_mclget(n, how);
            else if (m->m_len <= m_maxsize(MC_BIGCL))
                n = m_mbigget(n, how);
            else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
                n = m_m16kget(n, how);
            if (!(n->m_flags & M_EXT)) {
                (void) m_free(n);
                goto nospace;
            }
        }
        *np = n;
        if (copyhdr) {
            /* Don't use M_COPY_PKTHDR: preserve m_data */
            m_dup_pkthdr(n, m, how);
            copyhdr = 0;
            if (!(n->m_flags & M_EXT))
                n->m_data = n->m_pktdat;
        }
        n->m_len = m->m_len;
        /*
         * Get the dup on the same boundary as the original.
         * Assume that the two mbufs have the same offset to the
         * data area (up to word boundaries).
         */
        bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
        m = m->m_next;
        np = &n->m_next;
#if BLUE_DEBUG
        kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
            n->m_data);
#endif
    }
    return (top);

nospace:
    m_freem(top);
    return (NULL);
}
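/*
 * Usage sketch (illustrative only): take a deep copy of a packet so the
 * original can be modified or freed independently; unlike m_copym(),
 * m_dup() copies the data rather than sharing clusters.
 */
#if 0
static struct mbuf *
example_deep_copy(struct mbuf *m)
{
	return (m_dup(m, M_DONTWAIT));
}
#endif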
#define MBUF_MULTIPAGES(m)						\
	(((m)->m_flags & M_EXT) &&					\
	((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||	\
	(!IS_P2ALIGNED((m)->m_data, NBPG) &&				\
	P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
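/*
 * Worked example (illustrative only, assuming NBPG == 4096): a cluster
 * mbuf (M_EXT set) whose m_data begins 0x100 bytes into a page and whose
 * m_len is 0x4000 satisfies P2ROUNDUP(m_data, NBPG) < (m_data + m_len),
 * so MBUF_MULTIPAGES() is true and m_expand() below must break the data
 * into per-page segments.
 */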
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
    struct mbuf *top = NULL;
    struct mbuf **nm = &top;
    uintptr_t data0, data;
    unsigned int len0, len;

    VERIFY(MBUF_MULTIPAGES(m));
    VERIFY(m->m_next == NULL);
    data0 = (uintptr_t)m->m_data;
    len0 = m->m_len;
    *last = top;

    for (;;) {
        struct mbuf *n;

        data = data0;
        if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
            len = NBPG;
        else if (!IS_P2ALIGNED(data, NBPG) &&
            P2ROUNDUP(data, NBPG) < (data + len0))
            len = P2ROUNDUP(data, NBPG) - data;
        else
            len = len0;

        VERIFY(m->m_flags & M_EXT);
        m->m_data = (void *)data;
        m->m_len = len;

        *nm = *last = m;
        nm = &m->m_next;
        m->m_next = NULL;

        data0 += len;
        len0 -= len;
        if (len0 == 0)
            break;

        n = _M_RETRY(M_DONTWAIT, MT_DATA);
        if (n == NULL) {
            m_freem(top);
            top = *last = NULL;
            break;
        }
        n->m_ext = m->m_ext;
        /* ... add a reference to the cluster ... */
        n->m_flags |= M_EXT;
        m = n;
    }
    return (top);
}
struct mbuf *
m_normalize(struct mbuf *m)
{
    struct mbuf *top = NULL;
    struct mbuf **nm = &top;
    boolean_t expanded = FALSE;

    while (m != NULL) {
        struct mbuf *n, *last;

        n = m->m_next;
        m->m_next = NULL;

        /* Does the data cross one or more page boundaries? */
        if (MBUF_MULTIPAGES(m)) {
            if ((m = m_expand(m, &last)) == NULL) {
                m_freem(n);
                m_freem(top);
                top = NULL;
                break;
            }
            *nm = m;
            nm = &last->m_next;
            expanded = TRUE;
        } else {
            *nm = m;
            nm = &m->m_next;
        }
        m = n;
    }
    if (expanded)
        atomic_add_32(&mb_normalized, 1);
    return (top);
}
void
m_mchtype(struct mbuf *m, int t)
{
    mtype_stat_inc(t);
    mtype_stat_dec(m->m_type);
    (m)->m_type = t;
}

void *
m_mtod(struct mbuf *m)
{
    return (MTOD(m, void *));
}

struct mbuf *
m_dtom(void *x)
{
    return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
}

void
m_mcheck(struct mbuf *m)
{
    _MCHECK(m);
}
/*
 * Inform the corresponding mcache(s) that there's a waiter below.
 */
static void
mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
{
    mcache_waiter_inc(m_cache(class));
    if (comp) {
        if (class == MC_CL) {
            mcache_waiter_inc(m_cache(MC_MBUF_CL));
        } else if (class == MC_BIGCL) {
            mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
        } else if (class == MC_16KCL) {
            mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
        } else {
            mcache_waiter_inc(m_cache(MC_MBUF_CL));
            mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
        }
    }
}

/*
 * Inform the corresponding mcache(s) that there's no more waiter below.
 */
static void
mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
{
    mcache_waiter_dec(m_cache(class));
    if (comp) {
        if (class == MC_CL) {
            mcache_waiter_dec(m_cache(MC_MBUF_CL));
        } else if (class == MC_BIGCL) {
            mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
        } else if (class == MC_16KCL) {
            mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
        } else {
            mcache_waiter_dec(m_cache(MC_MBUF_CL));
            mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
        }
    }
}
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU caches layer and that allocation should be
 * retried at that level.
 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
    boolean_t mcache_retry = FALSE;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    /* Check if there's anything at the cache layer */
    if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;
        goto done;
    }

    /* Nothing?  Then try hard to get it from somewhere */
    m_reclaim(class, num, (wait & MCR_COMP));

    /* We tried hard and got something? */
    if (m_infree(class) > 0) {
        goto done;
    } else if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;
        goto done;
    } else if (wait & MCR_TRYHARD) {
        mcache_retry = TRUE;
        goto done;
    }

    /*
     * There's really nothing for us right now; inform the
     * cache(s) that there is a waiter below and go to sleep.
     */
    mbuf_waiter_inc(class, (wait & MCR_COMP));

    VERIFY(!(wait & MCR_NOSLEEP));
    (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);

    /* We are now up; stop getting notified until next round */
    mbuf_waiter_dec(class, (wait & MCR_COMP));

    /* We waited and got something */
    if (m_infree(class) > 0) {
        goto done;
    } else if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;
    }
done:
    return (mcache_retry);
}
static void
mbuf_worker_thread(void)
{
    int mbuf_expand;

    while (1) {
        lck_mtx_lock(mbuf_mlock);

        mbuf_expand = 0;
        if (mbuf_expand_mcl) {
            int n;

            /* Adjust to current number of clusters in use */
            n = mbuf_expand_mcl -
                (m_total(MC_CL) - m_infree(MC_CL));
            if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
                n = m_maxlimit(MC_CL) - m_total(MC_CL);
            mbuf_expand_mcl = 0;

            if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
                mbuf_expand++;
        }
        if (mbuf_expand_big) {
            int n;

            /* Adjust to current number of 4 KB clusters in use */
            n = mbuf_expand_big -
                (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
            if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
                n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
            mbuf_expand_big = 0;

            if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
                mbuf_expand++;
        }
        if (mbuf_expand_16k) {
            int n;

            /* Adjust to current number of 16 KB clusters in use */
            n = mbuf_expand_16k -
                (m_total(MC_16KCL) - m_infree(MC_16KCL));
            if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
                n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
            mbuf_expand_16k = 0;

            if (n > 0)
                (void) freelist_populate(MC_16KCL, n, M_WAIT);
        }

        /*
         * Because we can run out of memory before filling the mbuf
         * map, we should not allocate more clusters than there are
         * mbufs -- otherwise we could have a large number of useless
         * clusters allocated.
         */
        if (mbuf_expand) {
            while (m_total(MC_MBUF) <
                (m_total(MC_BIGCL) + m_total(MC_CL))) {
                if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
                    break;
            }
        }

        lck_mtx_unlock(mbuf_mlock);

        assert_wait(&mbuf_worker_run, THREAD_UNINT);
        (void) thread_block((thread_continue_t)mbuf_worker_thread);
    }
}

static void
mbuf_worker_thread_init(void)
{
    mbuf_worker_ready++;
    mbuf_worker_thread();
}
/*
 * Given the address of a buffer, return the slab that covers it, creating
 * the slab group on first use.
 */
static mcl_slab_t *
slab_get(void *buf)
{
    mcl_slabg_t *slg;
    unsigned int ix, k;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    VERIFY(MBUF_IN_MAP(buf));
    ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
    VERIFY(ix < maxslabgrp);

    if ((slg = slabstbl[ix]) == NULL) {
        /*
         * In the current implementation, we never shrink the memory
         * pool (hence the cluster map); if we attempt to reallocate
         * a cluster group when it's already allocated, panic since
         * this is a sign of a memory corruption (slabstbl[ix] got
         * nullified).  This also means that there shouldn't be any
         * hole in the kernel sub-map for the mbuf pool.
         */
        ++slabgrp;
        VERIFY(ix < slabgrp);
        /*
         * Slabs expansion can only be done single threaded; when
         * we get here, it must be as a result of m_clalloc() which
         * is serialized and therefore mb_clalloc_busy must be set.
         */
        VERIFY(mb_clalloc_busy);
        lck_mtx_unlock(mbuf_mlock);

        /* This is a new buffer; create the slabs group for it */
        MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
            M_WAITOK | M_ZERO);
        VERIFY(slg != NULL);

        lck_mtx_lock(mbuf_mlock);
        /*
         * No other thread could have gone into m_clalloc() after
         * we dropped the lock above, so verify that it's true.
         */
        VERIFY(mb_clalloc_busy);

        slabstbl[ix] = slg;

        /* Chain each slab in the group to its forward neighbor */
        for (k = 1; k < NSLABSPMB; k++)
            slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
        VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

        /* And chain the last slab in the previous group to this */
        if (ix > 0) {
            VERIFY(slabstbl[ix - 1]->
                slg_slab[NSLABSPMB - 1].sl_next == NULL);
            slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
                &slg->slg_slab[0];
        }
    }

    ix = MTOCL(buf) % NSLABSPMB;
    VERIFY(ix < NSLABSPMB);

    return (&slg->slg_slab[ix]);
}
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
    sp->sl_class = class;
    sp->sl_flags = flags;
    sp->sl_base = base;
    sp->sl_head = head;
    sp->sl_len = len;
    sp->sl_refcnt = refcnt;
    sp->sl_chunks = chunks;
    slab_detach(sp);
}

static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
    VERIFY(slab_is_detached(sp));
    m_slab_cnt(class)++;
    TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
    sp->sl_flags &= ~SLF_DETACHED;
    if (class == MC_BIGCL) {
        sp = sp->sl_next;
        /* Next slab must already be present */
        VERIFY(sp != NULL);
        VERIFY(slab_is_detached(sp));
        sp->sl_flags &= ~SLF_DETACHED;
    } else if (class == MC_16KCL) {
        int k;
        for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
            sp = sp->sl_next;
            /* Next slab must already be present */
            VERIFY(sp != NULL);
            VERIFY(slab_is_detached(sp));
            sp->sl_flags &= ~SLF_DETACHED;
        }
    }
}

static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
    VERIFY(!slab_is_detached(sp));
    VERIFY(m_slab_cnt(class) > 0);
    m_slab_cnt(class)--;
    TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
    slab_detach(sp);
    if (class == MC_BIGCL) {
        sp = sp->sl_next;
        /* Next slab must already be present */
        VERIFY(sp != NULL);
        VERIFY(!slab_is_detached(sp));
        slab_detach(sp);
    } else if (class == MC_16KCL) {
        int k;
        for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
            sp = sp->sl_next;
            /* Next slab must already be present */
            VERIFY(sp != NULL);
            VERIFY(!slab_is_detached(sp));
            slab_detach(sp);
        }
    }
}

static boolean_t
slab_inrange(mcl_slab_t *sp, void *buf)
{
    return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
        (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
}

static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
    int i;
    unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
    uintptr_t buf = (uintptr_t)sp->sl_base;

    for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
        void *next = ((mcache_obj_t *)buf)->obj_next;

        /* ... */
        if (mclaudit == NULL) {
            if (next != NULL && !MBUF_IN_MAP(next)) {
                mcache_t *cp = m_cache(sp->sl_class);
                panic("%s: %s buffer %p in slab %p modified "
                    "after free at offset 0: %p out of range "
                    "[%p-%p)\n", __func__, cp->mc_name,
                    (void *)buf, sp, next, mbutl, embutl);
                /* NOTREACHED */
            }
        } else {
            mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
                (mcache_obj_t *)buf);
            mcl_audit_verify_nextptr(next, mca);
        }
    }
}

static void
slab_detach(mcl_slab_t *sp)
{
    sp->sl_link.tqe_next = (mcl_slab_t *)-1;
    sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
    sp->sl_flags |= SLF_DETACHED;
}

static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
    return ((intptr_t)sp->sl_link.tqe_next == -1 &&
        (intptr_t)sp->sl_link.tqe_prev == -1 &&
        (sp->sl_flags & SLF_DETACHED));
}
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
    mcache_audit_t *mca, *mca_tail;
    mcache_obj_t *con = NULL;
    boolean_t save_contents = (con_list != NULL);
    unsigned int i, ix;

    ASSERT(num <= NMBPCL);
    ASSERT(con_list == NULL || con_size != 0);

    ix = MTOCL(buf);
    /* Make sure we haven't been here before */
    for (i = 0; i < NMBPCL; i++)
        VERIFY(mclaudit[ix].cl_audit[i] == NULL);

    mca = mca_tail = *mca_list;
    if (save_contents)
        con = *con_list;

    for (i = 0; i < num; i++) {
        mcache_audit_t *next;

        next = mca->mca_next;
        bzero(mca, sizeof (*mca));
        mca->mca_next = next;
        mclaudit[ix].cl_audit[i] = mca;

        /* Attach the contents buffer if requested */
        if (save_contents) {
            VERIFY(con != NULL);
            mca->mca_contents_size = con_size;
            mca->mca_contents = con;
            con = con->obj_next;
            bzero(mca->mca_contents, mca->mca_contents_size);
        }

        mca_tail = mca;
        mca = mca->mca_next;
    }

    if (save_contents)
        *con_list = con;

    *mca_list = mca_tail->mca_next;
    mca_tail->mca_next = NULL;
}

/*
 * Given an address of a buffer (mbuf/cluster/big cluster), return
 * the corresponding audit structure for that buffer.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
{
    mcache_audit_t *mca = NULL;
    int ix = MTOCL(o);

    VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));

    switch (class) {
    case MC_MBUF:
        /*
         * For the mbuf case, find the index of the cluster
         * used by the mbuf and use that index to locate the
         * base address of the cluster.  Then find out the
         * mbuf index relative to the cluster base and use
         * it to locate the audit structure.
         */
        VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
        mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
        break;

    case MC_CL:
    case MC_BIGCL:
    case MC_16KCL:
        /*
         * Same as above, but only return the first element.
         */
        mca = mclaudit[ix].cl_audit[0];
        break;

    default:
        VERIFY(0);
        /* NOTREACHED */
    }

    return (mca);
}
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
    struct mbuf *m = addr;
    mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

    VERIFY(mca->mca_contents != NULL &&
        mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

    if (!alloc) {
        mcl_audit_verify_nextptr(next, mca);
        /* Save constructed mbuf fields */
        mcl_audit_save_mbuf(m, mca);
        mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
        ((mcache_obj_t *)m)->obj_next = next;
        return;
    }

    /* Check if the buffer has been corrupted while in freelist */
    mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));

    /* Restore constructed mbuf fields */
    mcl_audit_restore_mbuf(m, mca, composite);
}

static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
    struct mbuf *ms = (struct mbuf *)mca->mca_contents;

    if (composite) {
        struct mbuf *next = m->m_next;
        VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
            MBUF_IS_COMPOSITE(ms));
        /*
         * We could have hand-picked the mbuf fields and restore
         * them individually, but that will be a maintenance
         * headache.  Instead, restore everything that was saved;
         * the mbuf layer will recheck and reinitialize anyway.
         */
        bcopy(ms, m, mca->mca_contents_size);
        m->m_next = next;
    } else {
        /*
         * For a regular mbuf (no cluster attached) there's nothing
         * to restore other than the type field, which is expected
         * to be MT_FREE.
         */
        m->m_type = ms->m_type;
    }
    _MCHECK(m);
}

static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
    _MCHECK(m);
    bcopy(m, mca->mca_contents, mca->mca_contents_size);
}

static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
    mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

    if (!alloc) {
        mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
        if (save_next) {
            mcl_audit_verify_nextptr(next, mca);
            ((mcache_obj_t *)addr)->obj_next = next;
        }
    } else {
        /* Check if the buffer has been corrupted while in freelist */
        mcl_audit_verify_nextptr(next, mca);
        mcache_audit_free_verify_set(mca, addr, 0, size);
    }
}

static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
    mcache_audit_t *mca;

    mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

    panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
        m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
    /* NOTREACHED */
}

static void
mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
{
    if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
        !MBUF_IN_MAP(next)) {
        panic("mcl_audit: buffer %p modified after free at offset 0: "
            "%p out of range [%p-%p)\n%s\n",
            mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
        /* NOTREACHED */
    }
}
SYSCTL_DECL(_kern_ipc);
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mb_normalized, 0, "");