bsd/kern/uipc_mbuf.c (apple/xnu, xnu-1504.9.17)
1 /*
2 * Copyright (c) 2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <kern/kern_types.h>
83 #include <kern/simple_lock.h>
84 #include <kern/queue.h>
85 #include <kern/sched_prim.h>
86 #include <kern/cpu_number.h>
87
88 #include <libkern/OSAtomic.h>
89 #include <libkern/libkern.h>
90
91 #include <IOKit/IOMapper.h>
92
93 #include <machine/limits.h>
94 #include <machine/machine_routines.h>
95
96 #if CONFIG_MACF_NET
97 #include <security/mac_framework.h>
98 #endif /* CONFIG_MACF_NET */
99
100 #include <sys/mcache.h>
101
102 /*
103 * MBUF IMPLEMENTATION NOTES.
104 *
105 * There are a total of 5 per-CPU caches:
106 *
107 * MC_MBUF:
108 * This is a cache of rudimentary objects of MSIZE in size; each
109 * object represents an mbuf structure. This cache preserves only
110 * the m_type field of the mbuf during its transactions.
111 *
112 * MC_CL:
113 * This is a cache of rudimentary objects of MCLBYTES in size; each
114 * object represents an mcluster structure. This cache does not
115 * preserve the contents of the objects during its transactions.
116 *
117 * MC_BIGCL:
118 * This is a cache of rudimentary objects of NBPG in size; each
119 * object represents an mbigcluster structure. This cache does not
120 * preserve the contents of the objects during its transactions.
121 *
122 * MC_MBUF_CL:
123 * This is a cache of mbufs each having a cluster attached to it.
124 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
125 * fields of the mbuf related to the external cluster are preserved
126 * during transactions.
127 *
128 * MC_MBUF_BIGCL:
129 * This is a cache of mbufs each having a big cluster attached to it.
130 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
131 * fields of the mbuf related to the external cluster are preserved
132 * during transactions.
133 *
134 * OBJECT ALLOCATION:
135 *
136 * Allocation requests are handled first at the per-CPU (mcache) layer
137 * before falling back to the slab layer. Performance is optimal when
138 * the request is satisfied at the CPU layer because global data/lock
139 * never gets accessed. When the slab layer is entered for allocation,
140 * the slab freelist will be checked first for available objects before
141 * the VM backing store is invoked. Slab layer operations are serialized
142 * for all of the caches as the mbuf global lock is held most of the time.
143 * Allocation paths are different depending on the class of objects:
144 *
145 * a. Rudimentary object:
146 *
147 * { m_get_common(), m_clattach(), m_mclget(),
148 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
149 * composite object allocation }
150 * | ^
151 * | |
152 * | +-----------------------+
153 * v |
154 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
155 * | ^
156 * v |
157 * [CPU cache] -------> (found?) -------+
158 * | |
159 * v |
160 * mbuf_slab_alloc() |
161 * | |
162 * v |
163 * +---------> [freelist] -------> (found?) -------+
164 * | |
165 * | v
166 * | m_clalloc()
167 * | |
168 * | v
169 * +---<<---- kmem_mb_alloc()
170 *
171 * b. Composite object:
172 *
173 * { m_getpackets_internal(), m_allocpacket_internal() }
174 * | ^
175 * | |
176 * | +------ (done) ---------+
177 * v |
178 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
179 * | ^
180 * v |
181 * [CPU cache] -------> (found?) -------+
182 * | |
183 * v |
184 * mbuf_cslab_alloc() |
185 * | |
186 * v |
187 * [freelist] -------> (found?) -------+
188 * | |
189 * v |
190 * (rudimentary object) |
191 * mcache_alloc/mcache_alloc_ext() ------>>-----+
192 *
193 * Auditing notes: If auditing is enabled, buffers will be subjected to
194 * integrity checks by the audit routine. This is done by verifying their
195 * contents against DEADBEEF (free) pattern before returning them to caller.
196 * As part of this step, the routine will also record the transaction and
197 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
198 * also restore any constructed data structure fields if necessary.
199 *
200 * OBJECT DEALLOCATION:
201 *
202 * Freeing an object simply involves placing it into the CPU cache; this
203 * pollutes the cache to benefit subsequent allocations. The slab layer
204 * will only be entered if the object is to be purged out of the cache.
205 * During normal operations, this happens only when the CPU layer resizes
206 * its bucket while it's adjusting to the allocation load. Deallocation
207 * paths are different depending on the class of objects:
208 *
209 * a. Rudimentary object:
210 *
211 * { m_free(), m_freem_list(), composite object deallocation }
212 * | ^
213 * | |
214 * | +------ (done) ---------+
215 * v |
216 * mcache_free/mcache_free_ext() |
217 * | |
218 * v |
219 * mbuf_slab_audit() |
220 * | |
221 * v |
222 * [CPU cache] ---> (not purging?) -----+
223 * | |
224 * v |
225 * mbuf_slab_free() |
226 * | |
227 * v |
228 * [freelist] ----------->>------------+
229 * (objects never get purged to VM)
230 *
231 * b. Composite object:
232 *
233 * { m_free(), m_freem_list() }
234 * | ^
235 * | |
236 * | +------ (done) ---------+
237 * v |
238 * mcache_free/mcache_free_ext() |
239 * | |
240 * v |
241 * mbuf_cslab_audit() |
242 * | |
243 * v |
244 * [CPU cache] ---> (not purging?) -----+
245 * | |
246 * v |
247 * mbuf_cslab_free() |
248 * | |
249 * v |
250 * [freelist] ---> (not purging?) -----+
251 * | |
252 * v |
253 * (rudimentary object) |
254 * mcache_free/mcache_free_ext() ------->>------+
255 *
256 * Auditing notes: If auditing is enabled, the audit routine will save
257 * any constructed data structure fields (if necessary) before filling the
258 * contents of the buffers with DEADBEEF (free) pattern and recording the
259 * transaction. Buffers that are freed (whether at CPU or slab layer) are
260 * expected to contain the free pattern.
261 *
262 * DEBUGGING:
263 *
264 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
265 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
266 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
267 * i.e. modify the boot argument to "mbuf_debug=0x13". Note
268 * that debugging consumes more CPU and memory.
269 *
270 * Each object is associated with exactly one mcache_audit_t structure that
271 * contains the information related to its last buffer transaction. Given
272 * an address of an object, the audit structure can be retrieved by finding
273 * the position of the object relative to the base address of the cluster:
274 *
275 * +------------+ +=============+
276 * | mbuf addr | | mclaudit[i] |
277 * +------------+ +=============+
278 * | | cl_audit[0] |
279 * i = MTOCL(addr) +-------------+
280 * | +-----> | cl_audit[1] | -----> mcache_audit_t
281 * b = CLTOM(i) | +-------------+
282 * | | | ... |
283 * x = MCLIDX(b, addr) | +-------------+
284 * | | | cl_audit[7] |
285 * +-----------------+ +-------------+
286 * (e.g. x == 1)
287 *
288 * The mclaudit[] array is allocated at initialization time, but its contents
289 * get populated when the corresponding cluster is created. Because a cluster
290 * can be turned into NMBPCL number of mbufs, we preserve enough space for the
291 * mbufs so that there is a 1-to-1 mapping between them. A cluster that never
292 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
293 * remaining entries unused. For big clusters, only one entry is allocated
294 * and used for the entire cluster pair.
295 */
296
297 /* TODO: should be in header file */
298 /* kernel translator */
299 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
300 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
301 extern vm_map_t mb_map; /* special map */
302
303 /* Global lock */
304 static lck_mtx_t *mbuf_mlock;
305 static lck_attr_t *mbuf_mlock_attr;
306 static lck_grp_t *mbuf_mlock_grp;
307 static lck_grp_attr_t *mbuf_mlock_grp_attr;
308
309 /* Back-end (common) layer */
310 static void *mbuf_worker_run; /* wait channel for worker thread */
311 static int mbuf_worker_ready; /* worker thread is runnable */
312 static int mbuf_expand_mcl; /* number of cluster creation requests */
313 static int mbuf_expand_big; /* number of big cluster creation requests */
314 static int mbuf_expand_16k; /* number of 16K cluster creation requests */
315 static int ncpu; /* number of CPUs */
316 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
317 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
318 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
319 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
320 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
321 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
322 static unsigned int mb_normalized; /* number of packets "normalized" */
323 static unsigned int mbuf_gscale; /* Power-of-two growth scale for m_howmany */
324
325 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
326 #define MB_GROWTH_NORMAL 4 /* Threshold: 15/16 of total */
327
328 typedef enum {
329 MC_MBUF = 0, /* Regular mbuf */
330 MC_CL, /* Cluster */
331 MC_BIGCL, /* Large (4K) cluster */
332 MC_16KCL, /* Jumbo (16K) cluster */
333 MC_MBUF_CL, /* mbuf + cluster */
334 MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */
335 MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */
336 } mbuf_class_t;
337
338 #define MBUF_CLASS_MIN MC_MBUF
339 #define MBUF_CLASS_MAX MC_MBUF_16KCL
340 #define MBUF_CLASS_LAST MC_16KCL
341 #define MBUF_CLASS_VALID(c) \
342 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
343 #define MBUF_CLASS_COMPOSITE(c) \
344 ((int)(c) > MBUF_CLASS_LAST)
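/*
 * For example, the composite predicate above simply separates the
 * mbuf-plus-cluster classes from the rudimentary ones:
 *
 *	MBUF_CLASS_COMPOSITE(MC_CL)      is false
 *	MBUF_CLASS_COMPOSITE(MC_MBUF_CL) is true
 */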
345
346
347 /*
348 * mbuf specific mcache allocation request flags.
349 */
350 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
351
352 /*
353 * Per-cluster slab structure.
354 *
355 * A slab is a cluster control structure that contains one or more object
356 * chunks; the available chunks are chained in the slab's freelist (sl_head).
357 * Each time a chunk is taken out of the slab, the slab's reference count
358 * gets incremented. When all chunks have been taken out, the empty slab
359 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
360 * returned to a slab causes the slab's reference count to be decremented;
361 * it also causes the slab to be reinserted back into the class's slab
362 * list, if it is not already there.
363 *
364 * Compartmentalizing the object chunks into slabs allows us to easily
365 * merge one or more slabs together when the adjacent slabs are idle, as
366 * well as to convert or move a slab from one class to another; e.g. the
367 * mbuf cluster slab can be converted to a regular cluster slab when all
368 * mbufs in the slab have been freed.
369 *
370 * A slab may also span multiple clusters for chunks larger than
371 * a cluster's size. In this case, only the slab of the first cluster is
372 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
373 * that they are part of the larger slab.
374 */
375 typedef struct mcl_slab {
376 struct mcl_slab *sl_next; /* neighboring slab */
377 u_int8_t sl_class; /* controlling mbuf class */
378 int8_t sl_refcnt; /* outstanding allocations */
379 int8_t sl_chunks; /* chunks (bufs) in this slab */
380 u_int16_t sl_flags; /* slab flags (see below) */
381 u_int16_t sl_len; /* slab length */
382 void *sl_base; /* base of allocated memory */
383 void *sl_head; /* first free buffer */
384 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
385 } mcl_slab_t;
386
387 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
388 #define SLF_PARTIAL 0x0002 /* part of another slab */
389 #define SLF_DETACHED 0x0004 /* not in slab freelist */
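/*
 * Illustrative sketch of the per-slab bookkeeping described above, in a
 * deliberately simplified form: the function name is hypothetical and it
 * ignores class-specific details (statistics, auditing, multi-cluster
 * slabs).  The authoritative logic lives in slab_alloc() and slab_free()
 * further below.
 */
static void *
example_slab_take_chunk(mcl_slab_t *sp)
{
	mcache_obj_t *buf = sp->sl_head;

	if (buf == NULL)
		return (NULL);			/* slab fully allocated */

	sp->sl_head = buf->obj_next;		/* unlink from slab freelist */
	buf->obj_next = NULL;
	sp->sl_refcnt++;			/* one more outstanding chunk */

	/*
	 * A now-empty slab would at this point be removed (SLF_DETACHED)
	 * from its class's slab list; see slab_remove() below.
	 */
	return (buf);
}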
390
391 /*
392 * The array of slabs is broken into groups of arrays per 1MB of kernel
393 * memory to reduce the footprint. Each group is allocated on demand
394 * whenever a new piece of memory mapped in from the VM crosses the 1MB
395 * boundary.
396 */
397 #define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */
398
399 typedef struct mcl_slabg {
400 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
401 } mcl_slabg_t;
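/*
 * Illustrative sketch of how a cluster index splits into a slab-group
 * index and a slot within that group, following the per-1MB grouping
 * described above.  The function name and the explicit table parameter
 * are assumptions made for the example; the authoritative lookup is
 * slab_get() further below.
 */
static mcl_slab_t *
example_slabg_lookup(mcl_slabg_t **tbl, unsigned int clidx)
{
	unsigned int grp = clidx / NSLABSPMB;	/* which 1MB group */
	unsigned int slot = clidx % NSLABSPMB;	/* slab within the group */

	if (tbl[grp] == NULL)
		return (NULL);			/* group not allocated yet */
	return (&tbl[grp]->slg_slab[slot]);
}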
402
403 /*
404 * Per-cluster audit structure.
405 */
406 typedef struct {
407 mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */
408 } mcl_audit_t;
409
410 #if CONFIG_MBUF_NOEXPAND
411 static unsigned int maxmbufcl;
412 #endif /* CONFIG_MBUF_NOEXPAND */
413
414 /*
415 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
416 * and m_ext structures. If auditing is enabled, we allocate a shadow
417 * mbuf structure of this size inside each audit structure, and the
418 * contents of the real mbuf get copied into it when the mbuf is freed.
419 * This allows us to pattern-fill the mbuf for integrity check, and to
420 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
421 * Note that we don't save the contents of clusters when they are freed;
422 * we simply pattern-fill them.
423 */
424 #define AUDIT_CONTENTS_SIZE ((MSIZE - MHLEN) + sizeof (_m_ext_t))
425
426 /*
427 * mbuf specific mcache audit flags
428 */
429 #define MB_INUSE 0x01 /* object has not been returned to slab */
430 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
431 #define MB_SCVALID 0x04 /* object has valid saved contents */
432
433 /*
434 * Each of the following two arrays holds up to nmbclusters elements.
435 */
436 static mcl_audit_t *mclaudit; /* array of cluster audit information */
437 static mcl_slabg_t **slabstbl; /* cluster slabs table */
438 static unsigned int maxslabgrp; /* max # of entries in slabs table */
439 static unsigned int slabgrp; /* # of entries in slabs table */
440
441 /* Globals */
442 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
443 int njcl; /* # of clusters for jumbo sizes */
444 int njclbytes; /* size of a jumbo cluster */
445 union mcluster *mbutl; /* first mapped cluster address */
446 union mcluster *embutl; /* ending virtual address of mclusters */
447 int max_linkhdr; /* largest link-level header */
448 int max_protohdr; /* largest protocol header */
449 int max_hdr; /* largest link+protocol header */
450 int max_datalen; /* MHLEN - max_hdr */
451
452 extern u_int32_t high_sb_max;
453
454 /* TODO: should be in header file */
455 int do_reclaim = 0;
456
457 /* The minimum number of objects that are allocated, to start. */
458 #define MINCL 32
459 #define MINBIGCL (MINCL >> 1)
460 #define MIN16KCL (MINCL >> 2)
461
462 /* Low watermarks (only map in pages once free counts go below) */
463 #define MCL_LOWAT MINCL
464 #define MBIGCL_LOWAT MINBIGCL
465 #define M16KCL_LOWAT MIN16KCL
466
467 typedef struct {
468 mbuf_class_t mtbl_class; /* class type */
469 mcache_t *mtbl_cache; /* mcache for this buffer class */
470 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
471 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
472 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
473 u_int32_t mtbl_maxsize; /* maximum buffer size */
474 int mtbl_minlimit; /* minimum allowed */
475 int mtbl_maxlimit; /* maximum allowed */
476 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
477 } mbuf_table_t;
478
479 #define m_class(c) mbuf_table[c].mtbl_class
480 #define m_cache(c) mbuf_table[c].mtbl_cache
481 #define m_slablist(c) mbuf_table[c].mtbl_slablist
482 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
483 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
484 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
485 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
486 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
487 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
488 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
489 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
490 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
491 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
492 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
493 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
494 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
495 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
496 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
497 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
498 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
499
500 static mbuf_table_t mbuf_table[] = {
501 /*
502 * The caches for mbufs, regular clusters and big clusters.
503 */
504 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
505 NULL, NULL, 0, 0, 0, 0 },
506 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
507 NULL, NULL, 0, 0, 0, 0 },
508 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
509 NULL, NULL, 0, 0, 0, 0 },
510 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
511 NULL, NULL, 0, 0, 0, 0 },
512 /*
513 * The following are special caches; they serve as intermediate
514 * caches backed by the above rudimentary caches. Each object
515 * in the cache is an mbuf with a cluster attached to it. Unlike
516 * the above caches, these intermediate caches do not directly
517 * deal with the slab structures; instead, the constructed
518 * cached elements are simply stored in the freelists.
519 */
520 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
521 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
522 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
523 };
524
525 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
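/*
 * Illustrative sketch of walking the class table with the accessor
 * macros defined above; it assumes mbuf_table_init() has already hooked
 * up the per-class statistics, and the function name is hypothetical.
 * mb_stat_sysctl() below performs the real, complete traversal.
 */
static void
example_dump_classes(void)
{
	unsigned int i;

	for (i = 0; i < NELEM(mbuf_table); i++) {
		printf("%-12s size %4u total %u infree %u\n", m_cname(i),
		    (unsigned int)m_size(i), (unsigned int)m_total(i),
		    (unsigned int)m_infree(i));
	}
}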
526
527 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
528 static int mb_waiters; /* number of sleepers */
529
530 /* The following are used to serialize m_clalloc() */
531 static boolean_t mb_clalloc_busy;
532 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
533 static int mb_clalloc_waiters;
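/*
 * Illustrative sketch of the serialization these variables imply: a
 * thread that finds mb_clalloc_busy set records itself as a waiter and
 * sleeps on mb_clalloc_waitchan (with the mbuf lock held) until the
 * current owner clears the flag and issues a wakeup.  The helper names
 * are hypothetical; the real handshake is carried out inside
 * m_clalloc() itself.
 */
static void
example_clalloc_enter(void)
{
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	while (mb_clalloc_busy) {
		mb_clalloc_waiters++;
		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
		    (PZERO - 1), "m_clalloc", NULL);
		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}
	mb_clalloc_busy = TRUE;			/* we are now the owner */
}

static void
example_clalloc_exit(void)
{
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	mb_clalloc_busy = FALSE;		/* done expanding the map */
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}
}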
534
535 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
536 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
537 static void mbuf_table_init(void);
538 static inline void m_incref(struct mbuf *);
539 static inline u_int32_t m_decref(struct mbuf *);
540 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
541 static void mbuf_worker_thread_init(void);
542 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
543 static void slab_free(mbuf_class_t, mcache_obj_t *);
544 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
545 unsigned int, int);
546 static void mbuf_slab_free(void *, mcache_obj_t *, int);
547 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
548 static void mbuf_slab_notify(void *, u_int32_t);
549 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
550 unsigned int);
551 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
552 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
553 unsigned int, int);
554 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
555 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
556 static int freelist_populate(mbuf_class_t, unsigned int, int);
557 static boolean_t mbuf_cached_above(mbuf_class_t, int);
558 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
559 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
560 static int m_howmany(int, size_t);
561 static void mbuf_worker_thread(void);
562 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
563
564 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
565 size_t, unsigned int);
566 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
567 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
568 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
569 boolean_t);
570 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
571 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
572 static void mcl_audit_mcheck_panic(struct mbuf *);
573 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
574
575 static mcl_slab_t *slab_get(void *);
576 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
577 void *, void *, unsigned int, int, int);
578 static void slab_insert(mcl_slab_t *, mbuf_class_t);
579 static void slab_remove(mcl_slab_t *, mbuf_class_t);
580 static boolean_t slab_inrange(mcl_slab_t *, void *);
581 static void slab_nextptr_panic(mcl_slab_t *, void *);
582 static void slab_detach(mcl_slab_t *);
583 static boolean_t slab_is_detached(mcl_slab_t *);
584
585 static unsigned int m_length(struct mbuf *);
586 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
587 static struct mbuf *m_split0(struct mbuf *, int, int, int);
588
589 /* flags for m_copyback0 */
590 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
591 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
592 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
593 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
594
595 /*
596 * This flag is set for all mbufs that come out of and into the composite
597 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
598 * are marked with such a flag have clusters attached to them, and will be
599 * treated differently when they are freed; instead of being placed back
600 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
601 * are placed back into the appropriate composite cache's freelist, and the
602 * actual freeing is deferred until the composite objects are purged. At
603 * such a time, this flag will be cleared from the mbufs and the objects
604 * will be freed into their own separate freelists.
605 */
606 #define EXTF_COMPOSITE 0x1
607
608 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
609 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
610 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
611 #define MBUF_IS_COMPOSITE(m) \
612 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
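/*
 * Illustrative sketch of the decision described above: when an mbuf
 * carrying a cluster is freed and its external reference count has
 * dropped to zero, a composite object goes back to its composite cache
 * intact instead of being torn into separate mbuf and cluster frees.
 * The function name is hypothetical; the real decision is made in
 * m_free() and m_freem_list().
 */
static boolean_t
example_free_goes_to_composite_cache(struct mbuf *m)
{
	if (!(m->m_flags & M_EXT))
		return (FALSE);			/* no cluster attached */

	/* refcnt 0 plus EXTF_COMPOSITE set means composite cache */
	return (MBUF_IS_COMPOSITE(m) ? TRUE : FALSE);
}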
613
614 /*
615 * Macros used to verify the integrity of the mbuf.
616 */
617 #define _MCHECK(m) { \
618 if ((m)->m_type != MT_FREE) { \
619 if (mclaudit == NULL) \
620 panic("MCHECK: m_type=%d m=%p", \
621 (u_int16_t)(m)->m_type, m); \
622 else \
623 mcl_audit_mcheck_panic(m); \
624 } \
625 }
626
627 #define MBUF_IN_MAP(addr) \
628 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
629
630 #define MRANGE(addr) { \
631 if (!MBUF_IN_MAP(addr)) \
632 panic("MRANGE: address out of range 0x%p", addr); \
633 }
634
635 /*
636 * Macro version of mtod.
637 */
638 #define MTOD(m, t) ((t)((m)->m_data))
639
640 /*
641 * Macros to obtain cluster index and base cluster address.
642 */
643 #define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT)
644 #define CLTOM(x) ((union mcluster *)(mbutl + (x)))
645
646 /*
647 * Macro to find the mbuf index relative to the cluster base.
648 */
649 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8)
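/*
 * Illustrative sketch that ties the three macros above to the audit
 * lookup pictured in the MBUF IMPLEMENTATION NOTES at the top of this
 * file.  The function name is hypothetical; the authoritative version
 * is mcl_audit_buf2mca() further below.
 */
static mcache_audit_t *
example_addr_to_audit(void *addr)
{
	int i = MTOCL(addr);		/* index of the enclosing cluster */
	union mcluster *b = CLTOM(i);	/* base address of that cluster */
	int x = MCLIDX(b, addr);	/* mbuf slot within the cluster */

	return (mclaudit[i].cl_audit[x]);
}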
650
651 /*
652 * Macros used during mbuf and cluster initialization.
653 */
654 #define MBUF_INIT(m, pkthdr, type) { \
655 _MCHECK(m); \
656 (m)->m_next = (m)->m_nextpkt = NULL; \
657 (m)->m_len = 0; \
658 (m)->m_type = type; \
659 if ((pkthdr) == 0) { \
660 (m)->m_data = (m)->m_dat; \
661 (m)->m_flags = 0; \
662 } else { \
663 (m)->m_data = (m)->m_pktdat; \
664 (m)->m_flags = M_PKTHDR; \
665 (m)->m_pkthdr.rcvif = NULL; \
666 (m)->m_pkthdr.len = 0; \
667 (m)->m_pkthdr.header = NULL; \
668 (m)->m_pkthdr.csum_flags = 0; \
669 (m)->m_pkthdr.csum_data = 0; \
670 (m)->m_pkthdr.tso_segsz = 0; \
671 (m)->m_pkthdr.vlan_tag = 0; \
672 (m)->m_pkthdr.socket_id = 0; \
673 m_tag_init(m); \
674 m_prio_init(m); \
675 } \
676 }
677
678 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
679 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
680 (m)->m_flags |= M_EXT; \
681 (m)->m_ext.ext_size = (size); \
682 (m)->m_ext.ext_free = (free); \
683 (m)->m_ext.ext_arg = (arg); \
684 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
685 &(m)->m_ext.ext_refs; \
686 MEXT_RFA(m) = (rfa); \
687 MEXT_REF(m) = (ref); \
688 MEXT_FLAGS(m) = (flag); \
689 }
690
691 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
692 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
693
694 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
695 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
696
697 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
698 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
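/*
 * Illustrative sketch of how the initialization macros above combine
 * when attaching a 2K cluster to an mbuf.  The mbuf is assumed to come
 * straight from the mbuf cache (its type still MT_FREE), and the
 * cluster and ext_ref structure are assumed to have been allocated
 * already; the function name is hypothetical.  The real users of these
 * macros are the composite-cache constructors and routines such as
 * m_clattach() and m_mclget().
 */
static struct mbuf *
example_attach_cl(struct mbuf *m, void *cl, struct ext_ref *rfa)
{
	/* basic header setup, marked as a packet header mbuf */
	MBUF_INIT(m, 1, MT_DATA);
	/* attach the 2K cluster: sets M_EXT, size, free routine, refs */
	MBUF_CL_INIT(m, cl, rfa, 1, 0);
	return (m);
}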
699
700 /*
701 * Macro to convert BSD malloc sleep flag to mcache's
702 */
703 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
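/*
 * For example, a caller that must not block ends up issuing a
 * non-sleeping cache request, roughly:
 *
 *	buf = mcache_alloc(m_cache(MC_MBUF), MSLEEPF(M_DONTWAIT));
 *
 * whereas M_WAIT translates to MCR_SLEEP.  (Illustrative only; see
 * m_get_common() and friends further below for the real call sites.)
 */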
704
705 /*
706 * The structure that holds all mbuf class statistics exportable via sysctl.
707 * Similar to mbstat structure, the mb_stat structure is protected by the
708 * global mbuf lock. It contains additional information about the classes
709 * that allows for a more accurate view of the state of the allocator.
710 */
711 struct mb_stat *mb_stat;
712 struct omb_stat *omb_stat; /* For backwards compatibility */
713
714 #define MB_STAT_SIZE(n) \
715 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
716 #define OMB_STAT_SIZE(n) \
717 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
718
719 /*
720 * The legacy structure holding all of the mbuf allocation statistics.
721 * The actual statistics used by the kernel are stored in the mbuf_table
722 * instead, and are updated atomically while the global mbuf lock is held.
723 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
724 * Unlike before, the kernel no longer relies on the contents of mbstat for
725 * its operations (e.g. cluster expansion) because the structure is exposed
726 * to the outside and could possibly be modified, therefore making it unsafe.
727 * With the exception of the mbstat.m_mtypes array (see below), all of the
728 * statistics are updated as they change.
729 */
730 struct mbstat mbstat;
731
732 #define MBSTAT_MTYPES_MAX \
733 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
734
735 /*
736 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
737 * atomically and stored in a per-CPU structure which is lock-free; this is
738 * done in order to avoid writing to the global mbstat data structure which
739 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
740 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
741 * array and returned to the application. Any updates for types greater than
742 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
743 * performance but is okay since the kernel uses only up to MT_MAX-1 while
744 * anything beyond that (up to type 255) is considered a corner case.
745 */
746 typedef struct {
747 unsigned int cpu_mtypes[MT_MAX];
748 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
749
750 typedef struct {
751 mtypes_cpu_t mbs_cpu[1];
752 } mbuf_mtypes_t;
753
754 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
755
756 #define MBUF_MTYPES_SIZE(n) \
757 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
758
759 #define MTYPES_CPU(p) \
760 ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
761
762 /* This should be in a header file */
763 #define atomic_add_16(a, n) ((void) OSAddAtomic16(n, a))
764 #define atomic_add_32(a, n) ((void) OSAddAtomic(n, a))
765
766 #define mtype_stat_add(type, n) { \
767 if ((unsigned)(type) < MT_MAX) { \
768 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
769 atomic_add_32(&mbs->cpu_mtypes[type], n); \
770 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
771 atomic_add_16((int16_t*)&mbstat.m_mtypes[type], n); \
772 } \
773 }
774
775 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
776 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
777 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
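/*
 * Illustrative example of how the counters above are meant to be used
 * when an mbuf changes type, e.g. when a free mbuf is handed out as
 * MT_DATA; the allocation and free paths below contain the real call
 * sites.
 */
static void
example_account_mtype_change(void)
{
	mtype_stat_dec(MT_FREE);		/* one fewer free mbuf */
	mtype_stat_inc(MT_DATA);		/* one more data mbuf */
}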
778
779 static int
780 mbstat_sysctl SYSCTL_HANDLER_ARGS
781 {
782 #pragma unused(oidp, arg1, arg2)
783 int m, n;
784 mtypes_cpu_t mtc;
785
786 bzero(&mtc, sizeof (mtc));
787 for (m = 0; m < ncpu; m++) {
788 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
789 mtypes_cpu_t temp;
790
791 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
792 sizeof (temp.cpu_mtypes));
793
794 for (n = 0; n < MT_MAX; n++)
795 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
796 }
797 lck_mtx_lock(mbuf_mlock);
798 for (n = 0; n < MT_MAX; n++)
799 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
800 lck_mtx_unlock(mbuf_mlock);
801
802 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
803 }
804
805 static int
806 mb_stat_sysctl SYSCTL_HANDLER_ARGS
807 {
808 #pragma unused(oidp, arg1, arg2)
809 mcache_t *cp;
810 mcache_cpu_t *ccp;
811 mb_class_stat_t *sp;
812 void *statp;
813 int k, m, bktsize, statsz, proc64 = proc_is64bit(req->p);
814
815 lck_mtx_lock(mbuf_mlock);
816 for (k = 0; k < NELEM(mbuf_table); k++) {
817 cp = m_cache(k);
818 ccp = &cp->mc_cpu[0];
819 bktsize = ccp->cc_bktsize;
820 sp = mbuf_table[k].mtbl_stats;
821
822 if (cp->mc_flags & MCF_NOCPUCACHE)
823 sp->mbcl_mc_state = MCS_DISABLED;
824 else if (cp->mc_purge_cnt > 0)
825 sp->mbcl_mc_state = MCS_PURGING;
826 else if (bktsize == 0)
827 sp->mbcl_mc_state = MCS_OFFLINE;
828 else
829 sp->mbcl_mc_state = MCS_ONLINE;
830
831 sp->mbcl_mc_cached = 0;
832 for (m = 0; m < ncpu; m++) {
833 ccp = &cp->mc_cpu[m];
834 if (ccp->cc_objs > 0)
835 sp->mbcl_mc_cached += ccp->cc_objs;
836 if (ccp->cc_pobjs > 0)
837 sp->mbcl_mc_cached += ccp->cc_pobjs;
838 }
839 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
840 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
841 sp->mbcl_infree;
842
843 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
844 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
845 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
846
847 /* Calculate total count specific to each class */
848 sp->mbcl_ctotal = sp->mbcl_total;
849 switch (m_class(k)) {
850 case MC_MBUF:
851 /* Deduct mbufs used in composite caches */
852 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
853 m_total(MC_MBUF_BIGCL));
854 break;
855
856 case MC_CL:
857 /* Deduct clusters used in composite cache and mbufs */
858 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
859 (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
860 break;
861
862 case MC_BIGCL:
863 /* Deduct clusters used in composite cache */
864 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
865 break;
866
867 case MC_16KCL:
868 /* Deduct clusters used in composite cache */
869 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
870 break;
871
872 default:
873 break;
874 }
875 }
876
877 if (!proc64) {
878 struct omb_class_stat *oc;
879 struct mb_class_stat *c;
880
881 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
882 oc = &omb_stat->mbs_class[0];
883 c = &mb_stat->mbs_class[0];
884 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
885 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
886 "%s", c->mbcl_cname);
887 oc->mbcl_size = c->mbcl_size;
888 oc->mbcl_total = c->mbcl_total;
889 oc->mbcl_active = c->mbcl_active;
890 oc->mbcl_infree = c->mbcl_infree;
891 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
892 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
893 oc->mbcl_free_cnt = c->mbcl_free_cnt;
894 oc->mbcl_notified = c->mbcl_notified;
895 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
896 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
897 oc->mbcl_ctotal = c->mbcl_ctotal;
898 oc->mbcl_mc_state = c->mbcl_mc_state;
899 oc->mbcl_mc_cached = c->mbcl_mc_cached;
900 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
901 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
902 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
903 }
904 statp = omb_stat;
905 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
906 } else {
907 statp = mb_stat;
908 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
909 }
910
911 lck_mtx_unlock(mbuf_mlock);
912
913 return (SYSCTL_OUT(req, statp, statsz));
914 }
915
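/*
 * m_incref() and m_decref() below adjust the external reference count
 * of an mbuf, MEXT_REF(m), with a lock-free compare-and-swap loop;
 * m_decref() returns the new count so callers can tell when the last
 * reference has gone away.
 */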
916 static inline void
917 m_incref(struct mbuf *m)
918 {
919 UInt32 old, new;
920 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
921
922 do {
923 old = *addr;
924 new = old + 1;
925 ASSERT(new != 0);
926 } while (!OSCompareAndSwap(old, new, addr));
927 }
928
929 static inline u_int32_t
930 m_decref(struct mbuf *m)
931 {
932 UInt32 old, new;
933 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
934
935 do {
936 old = *addr;
937 new = old - 1;
938 ASSERT(old != 0);
939 } while (!OSCompareAndSwap(old, new, addr));
940
941 return (new);
942 }
943
944 static void
945 mbuf_table_init(void)
946 {
947 int m;
948
949 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
950 M_TEMP, M_WAITOK | M_ZERO);
951 VERIFY(omb_stat != NULL);
952
953 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
954 M_TEMP, M_WAITOK | M_ZERO);
955 VERIFY(mb_stat != NULL);
956
957 mb_stat->mbs_cnt = NELEM(mbuf_table);
958 for (m = 0; m < NELEM(mbuf_table); m++)
959 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
960
961 #if CONFIG_MBUF_JUMBO
962 /*
963 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
964 * this only on platforms where jumbo cluster pool is enabled.
965 */
966 njcl = nmbclusters / 3;
967 njclbytes = M16KCLBYTES;
968 #endif /* CONFIG_MBUF_JUMBO */
969
970 /*
971 * nclusters is going to be split in 2 to hold both the 2K
972 * and the 4K pools, so make sure each half is even.
973 */
974 nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);
975 if (njcl > 0) {
976 /*
977 * Each jumbo cluster takes 8 2K clusters, so make
978 * sure that the pool size is evenly divisible by 8.
979 */
980 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
981 }
982
983 #if CONFIG_MBUF_NOEXPAND
984 /* Only use 4k clusters if we're setting aside more than 256k */
985 if (nmbclusters <= 128) {
986 maxmbufcl = nmbclusters / 4;
987 } else {
988 /* Half to big clusters, half to small */
989 maxmbufcl = (nmbclusters / 4) * 3;
990 }
991 #endif /* CONFIG_MBUF_NOEXPAND */
992
993 /*
994 * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th
995 * of the total number of 2K clusters allocated is reserved and cannot
996 * be turned into mbufs. It can only be used for pure cluster objects.
997 */
998 m_minlimit(MC_CL) = (nclusters >> 5);
999 m_maxlimit(MC_CL) = (nclusters >> 1);
1000 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1001 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1002
1003 /*
1004 * The remaining (15/16th) can be turned into mbufs.
1005 */
1006 m_minlimit(MC_MBUF) = 0;
1007 m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
1008 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1009 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1010
1011 /*
1012 * The other 1/2 of the map is reserved for 4K clusters.
1013 */
1014 m_minlimit(MC_BIGCL) = 0;
1015 m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
1016 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
1017 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1018
1019 /*
1020 * Set limits for the composite classes.
1021 */
1022 m_minlimit(MC_MBUF_CL) = 0;
1023 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
1024 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1025 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1026 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1027
1028 m_minlimit(MC_MBUF_BIGCL) = 0;
1029 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1030 m_maxsize(MC_MBUF_BIGCL) = NBPG;
1031 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1032 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1033
1034 /*
1035 * And for jumbo classes.
1036 */
1037 m_minlimit(MC_16KCL) = 0;
1038 m_maxlimit(MC_16KCL) = (njcl >> 3);
1039 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1040 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1041
1042 m_minlimit(MC_MBUF_16KCL) = 0;
1043 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1044 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1045 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1046 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1047
1048 /*
1049 * Initialize the legacy mbstat structure.
1050 */
1051 bzero(&mbstat, sizeof (mbstat));
1052 mbstat.m_msize = m_maxsize(MC_MBUF);
1053 mbstat.m_mclbytes = m_maxsize(MC_CL);
1054 mbstat.m_minclsize = MINCLSIZE;
1055 mbstat.m_mlen = MLEN;
1056 mbstat.m_mhlen = MHLEN;
1057 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1058 }
1059
1060 #if defined(__LP64__)
1061 typedef struct ncl_tbl {
1062 uint64_t nt_maxmem; /* memory (sane) size */
1063 uint32_t nt_mbpool; /* mbuf pool size */
1064 } ncl_tbl_t;
1065
1066 /* Non-server */
1067 static ncl_tbl_t ncl_table[] = {
1068 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1069 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1070 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1071 { 0, 0 }
1072 };
1073
1074 /* Server */
1075 static ncl_tbl_t ncl_table_srv[] = {
1076 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1077 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1078 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1079 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1080 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1081 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1082 { 0, 0 }
1083 };
1084 #endif /* __LP64__ */
1085
1086 __private_extern__ unsigned int
1087 mbuf_default_ncl(int srv, uint64_t mem)
1088 {
1089 #if !defined(__LP64__)
1090 #pragma unused(srv)
1091 unsigned int n;
1092 /*
1093 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1094 */
1095 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1096 n = 32768;
1097 #else
1098 unsigned int n, i;
1099 ncl_tbl_t *tbl = (srv ? ncl_table_srv : ncl_table);
1100 /*
1101 * 64-bit kernel (mbuf pool size based on table).
1102 */
1103 n = tbl[0].nt_mbpool;
1104 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1105 if (mem < tbl[i].nt_maxmem)
1106 break;
1107 n = tbl[i].nt_mbpool;
1108 }
1109 n >>= MCLSHIFT;
1110 #endif /* !__LP64__ */
1111 return (n);
1112 }
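/*
 * Worked example, assuming the table values above: a 64-bit non-server
 * machine with 4 GB of memory stops the loop at the 8 GB row, leaving n
 * at the 1 GB row's 64 MB pool size; shifting by MCLSHIFT (2 KB
 * clusters) gives 64 MB / 2 KB = 32768 clusters.  A 16 GB server
 * configuration ends up with 192 MB, i.e. 98304 clusters.
 */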
1113
1114 __private_extern__ void
1115 mbinit(void)
1116 {
1117 unsigned int m;
1118 int initmcl = MINCL;
1119 void *buf;
1120 thread_t thread = THREAD_NULL;
1121
1122 if (nmbclusters == 0)
1123 nmbclusters = NMBCLUSTERS;
1124
1125 /* Setup the mbuf table */
1126 mbuf_table_init();
1127
1128 /* Global lock for common layer */
1129 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1130 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1131 mbuf_mlock_attr = lck_attr_alloc_init();
1132 mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);
1133
1134 /* Allocate cluster slabs table */
1135 maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
1136 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1137 M_TEMP, M_WAITOK | M_ZERO);
1138 VERIFY(slabstbl != NULL);
1139
1140 /* Allocate audit structures if needed */
1141 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1142 mbuf_debug |= mcache_getflags();
1143 if (mbuf_debug & MCF_AUDIT) {
1144 MALLOC(mclaudit, mcl_audit_t *,
1145 nmbclusters * sizeof (*mclaudit), M_TEMP,
1146 M_WAITOK | M_ZERO);
1147 VERIFY(mclaudit != NULL);
1148
1149 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1150 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1151 VERIFY(mcl_audit_con_cache != NULL);
1152 }
1153
1154 /* Calculate the number of pages assigned to the cluster pool */
1155 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1156 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1157 M_TEMP, M_WAITOK);
1158 VERIFY(mcl_paddr != NULL);
1159
1160 /* Register with the I/O Bus mapper */
1161 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1162 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1163
1164 embutl = (union mcluster *)
1165 ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));
1166
1167 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1168
1169 lck_mtx_lock(mbuf_mlock);
1170
1171 if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
1172 panic("mbinit: m_clalloc failed\n");
1173
1174 lck_mtx_unlock(mbuf_mlock);
1175
1176 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init, NULL, &thread);
1177 thread_deallocate(thread);
1178
1179 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1180 0, 0, MCR_SLEEP);
1181
1182 /* Create the cache for each class */
1183 for (m = 0; m < NELEM(mbuf_table); m++) {
1184 void *allocfunc, *freefunc, *auditfunc;
1185 u_int32_t flags;
1186
1187 flags = mbuf_debug;
1188 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1189 m_class(m) == MC_MBUF_16KCL) {
1190 allocfunc = mbuf_cslab_alloc;
1191 freefunc = mbuf_cslab_free;
1192 auditfunc = mbuf_cslab_audit;
1193 } else {
1194 allocfunc = mbuf_slab_alloc;
1195 freefunc = mbuf_slab_free;
1196 auditfunc = mbuf_slab_audit;
1197 }
1198
1199 /*
1200 * Disable per-CPU caches for jumbo classes if there
1201 * is no jumbo cluster pool available in the system.
1202 * The cache itself is still created (but will never
1203 * be populated) since it simplifies the code.
1204 */
1205 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1206 njcl == 0)
1207 flags |= MCF_NOCPUCACHE;
1208
1209 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1210 allocfunc, freefunc, auditfunc, mbuf_slab_notify,
1211 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1212 }
1213
1214 /*
1215 * Allocate structure for per-CPU statistics that's aligned
1216 * on the CPU cache boundary; this code assumes that we never
1217 * uninitialize this framework, since the original address
1218 * before alignment is not saved.
1219 */
1220 ncpu = ml_get_max_cpus();
1221 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1222 M_TEMP, M_WAITOK);
1223 VERIFY(buf != NULL);
1224
1225 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1226 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1227
1228 mbuf_gscale = MB_GROWTH_NORMAL;
1229
1230 /*
1231 * Set the max limit on sb_max to be 1/16th of the size of
1232 * memory allocated for mbuf clusters.
1233 */
1234 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1235 if (high_sb_max < sb_max) {
1236 /* sb_max is too large for this configuration, scale it down */
1237 if (high_sb_max > (1 << MBSHIFT)) {
1238 /* We have at least 16 MB of mbuf pool */
1239 sb_max = high_sb_max;
1240 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1241 /* If we have more than 1 MB of mbuf pool, cap the size of
1242 * the max sock buf at 1 MB
1243 */
1244 sb_max = high_sb_max = (1 << MBSHIFT);
1245 } else {
1246 sb_max = high_sb_max;
1247 }
1248 }
1249
1250 printf("mbinit: done (%d MB memory set for mbuf pool)\n",
1251 (nmbclusters << MCLSHIFT) >> MBSHIFT);
1252 }
1253
1254 /*
1255 * Obtain a slab of object(s) from the class's freelist.
1256 */
1257 static mcache_obj_t *
1258 slab_alloc(mbuf_class_t class, int wait)
1259 {
1260 mcl_slab_t *sp;
1261 mcache_obj_t *buf;
1262
1263 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1264
1265 VERIFY(class != MC_16KCL || njcl > 0);
1266
1267 /* This should always be NULL for us */
1268 VERIFY(m_cobjlist(class) == NULL);
1269
1270 /*
1271 * Treat composite objects as having a longer lifespan by using
1272 * a slab from the reverse direction, in the hope that this could
1273 * reduce the probability of fragmentation for slabs that hold
1274 * more than one buffer chunk (e.g. mbuf slabs). For other
1275 * slabs, this probably doesn't make much of a difference.
1276 */
1277 if (class == MC_MBUF && (wait & MCR_COMP))
1278 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1279 else
1280 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1281
1282 if (sp == NULL) {
1283 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1284 /* The slab list for this class is empty */
1285 return (NULL);
1286 }
1287
1288 VERIFY(m_infree(class) > 0);
1289 VERIFY(!slab_is_detached(sp));
1290 VERIFY(sp->sl_class == class &&
1291 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1292 buf = sp->sl_head;
1293 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1294
1295 if (class == MC_MBUF) {
1296 sp->sl_head = buf->obj_next;
1297 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
1298 } else {
1299 sp->sl_head = NULL;
1300 }
1301 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1302 slab_nextptr_panic(sp, sp->sl_head);
1303 /* In case sl_head is in the map but not in the slab */
1304 VERIFY(slab_inrange(sp, sp->sl_head));
1305 /* NOTREACHED */
1306 }
1307
1308 /* Increment slab reference */
1309 sp->sl_refcnt++;
1310
1311 if (mclaudit != NULL) {
1312 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1313 mca->mca_uflags = 0;
1314 /* Save contents on mbuf objects only */
1315 if (class == MC_MBUF)
1316 mca->mca_uflags |= MB_SCVALID;
1317 }
1318
1319 if (class == MC_CL) {
1320 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1321 /*
1322 * A 2K cluster slab can have at most 1 reference.
1323 */
1324 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1325 sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
1326 } else if (class == MC_BIGCL) {
1327 mcl_slab_t *nsp = sp->sl_next;
1328 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1329 m_infree(MC_MBUF_BIGCL);
1330 /*
1331 * Increment 2nd slab. A 4K big cluster takes
1332 * 2 slabs, each having at most 1 reference.
1333 */
1334 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1335 sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
1336 /* Next slab must already be present */
1337 VERIFY(nsp != NULL);
1338 nsp->sl_refcnt++;
1339 VERIFY(!slab_is_detached(nsp));
1340 VERIFY(nsp->sl_class == MC_BIGCL &&
1341 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1342 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1343 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1344 nsp->sl_head == NULL);
1345 } else if (class == MC_16KCL) {
1346 mcl_slab_t *nsp;
1347 int k;
1348
1349 --m_infree(MC_16KCL);
1350 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1351 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1352 /*
1353 * Increment 2nd-8th slab. A 16K big cluster takes
1354 * 8 cluster slabs, each having at most 1 reference.
1355 */
1356 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1357 nsp = nsp->sl_next;
1358 /* Next slab must already be present */
1359 VERIFY(nsp != NULL);
1360 nsp->sl_refcnt++;
1361 VERIFY(!slab_is_detached(nsp));
1362 VERIFY(nsp->sl_class == MC_16KCL &&
1363 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1364 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1365 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1366 nsp->sl_head == NULL);
1367 }
1368 } else {
1369 ASSERT(class == MC_MBUF);
1370 --m_infree(MC_MBUF);
1371 /*
1372 * If auditing is turned on, this check is
1373 * deferred until later in mbuf_slab_audit().
1374 */
1375 if (mclaudit == NULL)
1376 _MCHECK((struct mbuf *)buf);
1377 /*
1378 * Since we have incremented the reference count above,
1379 * an mbuf slab (formerly a 2K cluster slab that was cut
1380 * up into mbufs) must have a reference count between 1
1381 * and NMBPCL at this point.
1382 */
1383 VERIFY(sp->sl_refcnt >= 1 &&
1384 (unsigned short)sp->sl_refcnt <= NMBPCL &&
1385 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1386 VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
1387 sp->sl_head == NULL);
1388 }
1389
1390 /* If empty, remove this slab from the class's freelist */
1391 if (sp->sl_head == NULL) {
1392 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
1393 slab_remove(sp, class);
1394 }
1395
1396 return (buf);
1397 }
1398
1399 /*
1400 * Place a slab of object(s) back into a class's slab list.
1401 */
1402 static void
1403 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1404 {
1405 mcl_slab_t *sp;
1406
1407 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1408
1409 VERIFY(class != MC_16KCL || njcl > 0);
1410 VERIFY(buf->obj_next == NULL);
1411 sp = slab_get(buf);
1412 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1413 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1414
1415 /* Decrement slab reference */
1416 sp->sl_refcnt--;
1417
1418 if (class == MC_CL || class == MC_BIGCL) {
1419 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1420 /*
1421 * A 2K cluster slab can have at most 1 reference
1422 * which must be 0 at this point.
1423 */
1424 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1425 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1426 VERIFY(slab_is_detached(sp));
1427 if (class == MC_BIGCL) {
1428 mcl_slab_t *nsp = sp->sl_next;
1429 VERIFY(IS_P2ALIGNED(buf, NBPG));
1430 /* Next slab must already be present */
1431 VERIFY(nsp != NULL);
1432 /* Decrement 2nd slab reference */
1433 nsp->sl_refcnt--;
1434 /*
1435 * A 4K big cluster takes 2 slabs, both
1436 * must now have 0 reference.
1437 */
1438 VERIFY(slab_is_detached(nsp));
1439 VERIFY(nsp->sl_class == MC_BIGCL &&
1440 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1441 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1442 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1443 nsp->sl_head == NULL);
1444 }
1445 } else if (class == MC_16KCL) {
1446 mcl_slab_t *nsp;
1447 int k;
1448 /*
1449 * A 16K cluster takes 8 cluster slabs, all must
1450 * now have 0 reference.
1451 */
1452 VERIFY(IS_P2ALIGNED(buf, NBPG));
1453 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1454 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1455 VERIFY(slab_is_detached(sp));
1456 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1457 nsp = nsp->sl_next;
1458 /* Next slab must already be present */
1459 VERIFY(nsp != NULL);
1460 nsp->sl_refcnt--;
1461 VERIFY(slab_is_detached(nsp));
1462 VERIFY(nsp->sl_class == MC_16KCL &&
1463 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1464 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1465 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1466 nsp->sl_head == NULL);
1467 }
1468 } else {
1469 /*
1470 * An mbuf slab has a total of NMBPCL reference counts.
1471 * Since we have decremented the reference above, it
1472 * must now be between 0 and NMBPCL-1.
1473 */
1474 VERIFY(sp->sl_refcnt >= 0 &&
1475 (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
1476 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1477 VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
1478 (slab_is_detached(sp) && sp->sl_head == NULL));
1479 }
1480
1481 /*
1482 * When auditing is enabled, ensure that the buffer still
1483 * contains the free pattern. Otherwise it got corrupted
1484 * while at the CPU cache layer.
1485 */
1486 if (mclaudit != NULL) {
1487 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1488 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1489 mca->mca_uflags &= ~MB_SCVALID;
1490 }
1491
1492 if (class == MC_CL) {
1493 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1494 } else if (class == MC_BIGCL) {
1495 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1496 m_infree(MC_MBUF_BIGCL);
1497 } else if (class == MC_16KCL) {
1498 ++m_infree(MC_16KCL);
1499 } else {
1500 ++m_infree(MC_MBUF);
1501 buf->obj_next = sp->sl_head;
1502 }
1503 sp->sl_head = buf;
1504
1505 /* All mbufs are freed; return the cluster that we stole earlier */
1506 if (sp->sl_refcnt == 0 && class == MC_MBUF) {
1507 int i = NMBPCL;
1508
1509 m_total(MC_MBUF) -= NMBPCL;
1510 mbstat.m_mbufs = m_total(MC_MBUF);
1511 m_infree(MC_MBUF) -= NMBPCL;
1512 mtype_stat_add(MT_FREE, -((unsigned)NMBPCL));
1513
1514 while (i--) {
1515 struct mbuf *m = sp->sl_head;
1516 VERIFY(m != NULL);
1517 sp->sl_head = m->m_next;
1518 m->m_next = NULL;
1519 }
1520 VERIFY(sp->sl_head == NULL);
1521
1522 /* Remove the slab from the mbuf class's slab list */
1523 slab_remove(sp, class);
1524
1525 /* Reinitialize it as a 2K cluster slab */
1526 slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
1527 sp->sl_len, 0, 1);
1528
1529 if (mclaudit != NULL)
1530 mcache_set_pattern(MCACHE_FREE_PATTERN,
1531 (caddr_t)sp->sl_head, m_maxsize(MC_CL));
1532
1533 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1534
1535 VERIFY(slab_is_detached(sp));
1536 /* And finally switch class */
1537 class = MC_CL;
1538 }
1539
1540 /* Reinsert the slab to the class's slab list */
1541 if (slab_is_detached(sp))
1542 slab_insert(sp, class);
1543 }
1544
1545 /*
1546 * Common allocator for rudimentary objects called by the CPU cache layer
1547 * during an allocation request whenever there is no available element in the
1548 * bucket layer. It returns one or more elements from the appropriate global
1549 * freelist. If the freelist is empty, it will attempt to populate it and
1550 * retry the allocation.
1551 */
1552 static unsigned int
1553 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1554 {
1555 mbuf_class_t class = (mbuf_class_t)arg;
1556 unsigned int need = num;
1557 mcache_obj_t **list = *plist;
1558
1559 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1560 ASSERT(need > 0);
1561
1562 lck_mtx_lock(mbuf_mlock);
1563
1564 for (;;) {
1565 if ((*list = slab_alloc(class, wait)) != NULL) {
1566 (*list)->obj_next = NULL;
1567 list = *plist = &(*list)->obj_next;
1568
1569 if (--need == 0) {
1570 /*
1571 * If the number of elements in freelist has
1572 * dropped below low watermark, asynchronously
1573 * populate the freelist now rather than doing
1574 * it later when we run out of elements.
1575 */
1576 if (!mbuf_cached_above(class, wait) &&
1577 m_infree(class) < m_total(class) >> 5) {
1578 (void) freelist_populate(class, 1,
1579 M_DONTWAIT);
1580 }
1581 break;
1582 }
1583 } else {
1584 VERIFY(m_infree(class) == 0 || class == MC_CL);
1585
1586 (void) freelist_populate(class, 1,
1587 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1588
1589 if (m_infree(class) > 0)
1590 continue;
1591
1592 /* Check if there's anything at the cache layer */
1593 if (mbuf_cached_above(class, wait))
1594 break;
1595
1596 /* We have nothing and cannot block; give up */
1597 if (wait & MCR_NOSLEEP) {
1598 if (!(wait & MCR_TRYHARD)) {
1599 m_fail_cnt(class)++;
1600 mbstat.m_drops++;
1601 break;
1602 }
1603 }
1604
1605 /*
1606 * If the freelist is still empty and the caller is
1607 * willing to be blocked, sleep on the wait channel
1608 * until an element is available. Otherwise, if
1609 * MCR_TRYHARD is set, do our best to satisfy the
1610 * request without having to go to sleep.
1611 */
1612 if (mbuf_worker_ready &&
1613 mbuf_sleep(class, need, wait))
1614 break;
1615
1616 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1617 }
1618 }
1619
1620 m_alloc_cnt(class) += num - need;
1621 lck_mtx_unlock(mbuf_mlock);
1622
1623 return (num - need);
1624 }
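
/*
 * Illustrative sketch of the caller-side contract (normally this routine
 * is only invoked by the mcache CPU cache layer; a direct call as shown
 * below is hypothetical):
 *
 *	mcache_obj_t *top = NULL;
 *	mcache_obj_t **list = &top;
 *	unsigned int got;
 *
 *	got = mbuf_slab_alloc((void *)MC_MBUF, &list, 16, MCR_NOSLEEP);
 *
 * On return, "top" heads a NULL-terminated obj_next chain of "got" raw
 * mbufs, and "list" points to the obj_next field of the last element.
 */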
1625
1626 /*
1627 * Common de-allocator for rudimentary objects called by the CPU cache
1628 * layer when one or more elements need to be returned to the appropriate
1629 * global freelist.
1630 */
1631 static void
1632 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1633 {
1634 mbuf_class_t class = (mbuf_class_t)arg;
1635 mcache_obj_t *nlist;
1636 unsigned int num = 0;
1637 int w;
1638
1639 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1640
1641 lck_mtx_lock(mbuf_mlock);
1642
1643 for (;;) {
1644 nlist = list->obj_next;
1645 list->obj_next = NULL;
1646 slab_free(class, list);
1647 ++num;
1648 if ((list = nlist) == NULL)
1649 break;
1650 }
1651 m_free_cnt(class) += num;
1652
1653 if ((w = mb_waiters) > 0)
1654 mb_waiters = 0;
1655
1656 lck_mtx_unlock(mbuf_mlock);
1657
1658 if (w != 0)
1659 wakeup(mb_waitchan);
1660 }
1661
1662 /*
1663 * Common auditor for rudimentary objects called by the CPU cache layer
1664 * during an allocation or free request. For the former, this is called
1665 * after the objects are obtained from either the bucket or slab layer
1666 * and before they are returned to the caller. For the latter, this is
1667 * called immediately during free and before placing the objects into
1668 * the bucket or slab layer.
1669 */
1670 static void
1671 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1672 {
1673 mbuf_class_t class = (mbuf_class_t)arg;
1674 mcache_audit_t *mca;
1675
1676 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1677
1678 while (list != NULL) {
1679 lck_mtx_lock(mbuf_mlock);
1680 mca = mcl_audit_buf2mca(class, list);
1681
1682 /* Do the sanity checks */
1683 if (class == MC_MBUF) {
1684 mcl_audit_mbuf(mca, list, FALSE, alloc);
1685 ASSERT(mca->mca_uflags & MB_SCVALID);
1686 } else {
1687 mcl_audit_cluster(mca, list, m_maxsize(class),
1688 alloc, TRUE);
1689 ASSERT(!(mca->mca_uflags & MB_SCVALID));
1690 }
1691 /* Record this transaction */
1692 mcache_buffer_log(mca, list, m_cache(class));
1693 if (alloc)
1694 mca->mca_uflags |= MB_INUSE;
1695 else
1696 mca->mca_uflags &= ~MB_INUSE;
1697 /* Unpair the object (unconditionally) */
1698 mca->mca_uptr = NULL;
1699 lck_mtx_unlock(mbuf_mlock);
1700
1701 list = list->obj_next;
1702 }
1703 }
1704
1705 /*
1706 * Common notify routine for all caches. It is called by mcache when
1707 * one or more objects get freed. We use this indication to trigger
1708 * the wakeup of any sleeping threads so that they can retry their
1709 * allocation requests.
1710 */
1711 static void
1712 mbuf_slab_notify(void *arg, u_int32_t reason)
1713 {
1714 mbuf_class_t class = (mbuf_class_t)arg;
1715 int w;
1716
1717 ASSERT(MBUF_CLASS_VALID(class));
1718
1719 if (reason != MCN_RETRYALLOC)
1720 return;
1721
1722 lck_mtx_lock(mbuf_mlock);
1723 if ((w = mb_waiters) > 0) {
1724 m_notified(class)++;
1725 mb_waiters = 0;
1726 }
1727 lck_mtx_unlock(mbuf_mlock);
1728
1729 if (w != 0)
1730 wakeup(mb_waitchan);
1731 }
1732
1733 /*
1734 * Obtain object(s) from the composite class's freelist.
1735 */
1736 static unsigned int
1737 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
1738 {
1739 unsigned int need = num;
1740 mcl_slab_t *sp, *clsp, *nsp;
1741 struct mbuf *m;
1742 mcache_obj_t **list = *plist;
1743 void *cl;
1744
1745 VERIFY(need > 0);
1746 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1747 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1748
1749 /* Get what we can from the freelist */
1750 while ((*list = m_cobjlist(class)) != NULL) {
1751 MRANGE(*list);
1752
1753 m = (struct mbuf *)*list;
1754 sp = slab_get(m);
1755 cl = m->m_ext.ext_buf;
1756 clsp = slab_get(cl);
1757 VERIFY(m->m_flags == M_EXT && cl != NULL);
1758 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
1759 VERIFY(clsp->sl_refcnt == 1);
1760 if (class == MC_MBUF_BIGCL) {
1761 nsp = clsp->sl_next;
1762 /* Next slab must already be present */
1763 VERIFY(nsp != NULL);
1764 VERIFY(nsp->sl_refcnt == 1);
1765 } else if (class == MC_MBUF_16KCL) {
1766 int k;
1767 for (nsp = clsp, k = 1;
1768 k < (M16KCLBYTES / MCLBYTES); k++) {
1769 nsp = nsp->sl_next;
1770 /* Next slab must already be present */
1771 VERIFY(nsp != NULL);
1772 VERIFY(nsp->sl_refcnt == 1);
1773 }
1774 }
1775
1776 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
1777 !MBUF_IN_MAP(m_cobjlist(class))) {
1778 slab_nextptr_panic(sp, m_cobjlist(class));
1779 /* NOTREACHED */
1780 }
1781 (*list)->obj_next = NULL;
1782 list = *plist = &(*list)->obj_next;
1783
1784 if (--need == 0)
1785 break;
1786 }
1787 m_infree(class) -= (num - need);
1788
1789 return (num - need);
1790 }
1791
1792 /*
1793 * Place object(s) back into a composite class's freelist.
1794 */
1795 static unsigned int
1796 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
1797 {
1798 mcache_obj_t *o, *tail;
1799 unsigned int num = 0;
1800 struct mbuf *m, *ms;
1801 mcache_audit_t *mca = NULL;
1802 mcache_obj_t *ref_list = NULL;
1803 mcl_slab_t *clsp, *nsp;
1804 void *cl;
1805
1806 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1807 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1808 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1809
1810 o = tail = list;
1811
1812 while ((m = ms = (struct mbuf *)o) != NULL) {
1813 mcache_obj_t *rfa, *nexto = o->obj_next;
1814
1815 /* Do the mbuf sanity checks */
1816 if (mclaudit != NULL) {
1817 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1818 mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF));
1819 ms = (struct mbuf *)mca->mca_contents;
1820 }
1821
1822 /* Do the cluster sanity checks */
1823 cl = ms->m_ext.ext_buf;
1824 clsp = slab_get(cl);
1825 if (mclaudit != NULL) {
1826 size_t size;
1827 if (class == MC_MBUF_CL)
1828 size = m_maxsize(MC_CL);
1829 else if (class == MC_MBUF_BIGCL)
1830 size = m_maxsize(MC_BIGCL);
1831 else
1832 size = m_maxsize(MC_16KCL);
1833 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL,
1834 (mcache_obj_t *)cl), cl, 0, size);
1835 }
1836 VERIFY(ms->m_type == MT_FREE);
1837 VERIFY(ms->m_flags == M_EXT);
1838 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1839 VERIFY(clsp->sl_refcnt == 1);
1840 if (class == MC_MBUF_BIGCL) {
1841 nsp = clsp->sl_next;
1842 /* Next slab must already be present */
1843 VERIFY(nsp != NULL);
1844 VERIFY(nsp->sl_refcnt == 1);
1845 } else if (class == MC_MBUF_16KCL) {
1846 int k;
1847 for (nsp = clsp, k = 1;
1848 k < (M16KCLBYTES / MCLBYTES); k++) {
1849 nsp = nsp->sl_next;
1850 /* Next slab must already be present */
1851 VERIFY(nsp != NULL);
1852 VERIFY(nsp->sl_refcnt == 1);
1853 }
1854 }
1855
1856 /*
1857 * If we're asked to purge, restore the actual mbuf using
1858 * contents of the shadow structure (if auditing is enabled)
1859 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
1860 * about to free it and the attached cluster into their caches.
1861 */
1862 if (purged) {
1863 /* Restore constructed mbuf fields */
1864 if (mclaudit != NULL)
1865 mcl_audit_restore_mbuf(m, mca, TRUE);
1866
1867 MEXT_REF(m) = 0;
1868 MEXT_FLAGS(m) = 0;
1869
1870 rfa = (mcache_obj_t *)MEXT_RFA(m);
1871 rfa->obj_next = ref_list;
1872 ref_list = rfa;
1873 MEXT_RFA(m) = NULL;
1874
1875 m->m_type = MT_FREE;
1876 m->m_flags = m->m_len = 0;
1877 m->m_next = m->m_nextpkt = NULL;
1878
1879 /* Save mbuf fields and make auditing happy */
1880 if (mclaudit != NULL)
1881 mcl_audit_mbuf(mca, o, FALSE, FALSE);
1882
1883 VERIFY(m_total(class) > 0);
1884 m_total(class)--;
1885
1886 /* Free the mbuf */
1887 o->obj_next = NULL;
1888 slab_free(MC_MBUF, o);
1889
1890 /* And free the cluster */
1891 ((mcache_obj_t *)cl)->obj_next = NULL;
1892 if (class == MC_MBUF_CL)
1893 slab_free(MC_CL, cl);
1894 else if (class == MC_MBUF_BIGCL)
1895 slab_free(MC_BIGCL, cl);
1896 else
1897 slab_free(MC_16KCL, cl);
1898 }
1899
1900 ++num;
1901 tail = o;
1902 o = nexto;
1903 }
1904
1905 if (!purged) {
1906 tail->obj_next = m_cobjlist(class);
1907 m_cobjlist(class) = list;
1908 m_infree(class) += num;
1909 } else if (ref_list != NULL) {
1910 mcache_free_ext(ref_cache, ref_list);
1911 }
1912
1913 return (num);
1914 }
1915
1916 /*
1917 * Common allocator for composite objects called by the CPU cache layer
1918 * during an allocation request whenever there is no available element in
1919 * the bucket layer. It returns one or more composite elements from the
1920 * appropriate global freelist. If the freelist is empty, it will attempt
1921 * to obtain the rudimentary objects from their caches and construct them
1922 * into composite mbuf + cluster objects.
1923 */
1924 static unsigned int
1925 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
1926 int wait)
1927 {
1928 mbuf_class_t class = (mbuf_class_t)arg;
1929 mcache_t *cp = NULL;
1930 unsigned int num = 0, cnum = 0, want = needed;
1931 mcache_obj_t *ref_list = NULL;
1932 mcache_obj_t *mp_list = NULL;
1933 mcache_obj_t *clp_list = NULL;
1934 mcache_obj_t **list;
1935 struct ext_ref *rfa;
1936 struct mbuf *m;
1937 void *cl;
1938
1939 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1940 ASSERT(needed > 0);
1941
1942 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1943
1944 /* There should not be any slab for this class */
1945 VERIFY(m_slab_cnt(class) == 0 &&
1946 m_slablist(class).tqh_first == NULL &&
1947 m_slablist(class).tqh_last == NULL);
1948
1949 lck_mtx_lock(mbuf_mlock);
1950
1951 /* Try using the freelist first */
1952 num = cslab_alloc(class, plist, needed);
1953 list = *plist;
1954 if (num == needed) {
1955 m_alloc_cnt(class) += num;
1956 lck_mtx_unlock(mbuf_mlock);
1957 return (needed);
1958 }
1959
1960 lck_mtx_unlock(mbuf_mlock);
1961
1962 /*
1963 * We could not satisfy the request using the freelist alone;
1964 * allocate from the appropriate rudimentary caches and use
1965 * whatever we can get to construct the composite objects.
1966 */
1967 needed -= num;
1968
1969 /*
1970 * Mark these allocation requests as coming from a composite cache.
1971 * Also, if the caller is willing to be blocked, mark the request
1972 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
1973 * slab layer waiting for the individual object when one or more
1974 * of the already-constructed composite objects are available.
1975 */
1976 wait |= MCR_COMP;
1977 if (!(wait & MCR_NOSLEEP))
1978 wait |= MCR_FAILOK;
1979
1980 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
1981 if (needed == 0) {
1982 ASSERT(mp_list == NULL);
1983 goto fail;
1984 }
1985 if (class == MC_MBUF_CL)
1986 cp = m_cache(MC_CL);
1987 else if (class == MC_MBUF_BIGCL)
1988 cp = m_cache(MC_BIGCL);
1989 else
1990 cp = m_cache(MC_16KCL);
1991 needed = mcache_alloc_ext(cp, &clp_list, needed, wait);
1992 if (needed == 0) {
1993 ASSERT(clp_list == NULL);
1994 goto fail;
1995 }
1996 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
1997 if (needed == 0) {
1998 ASSERT(ref_list == NULL);
1999 goto fail;
2000 }
2001
2002 /*
2003 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2004 * leftovers will get freed accordingly before we return to the caller.
2005 */
2006 for (cnum = 0; cnum < needed; cnum++) {
2007 struct mbuf *ms;
2008
2009 m = ms = (struct mbuf *)mp_list;
2010 mp_list = mp_list->obj_next;
2011
2012 cl = clp_list;
2013 clp_list = clp_list->obj_next;
2014 ((mcache_obj_t *)cl)->obj_next = NULL;
2015
2016 rfa = (struct ext_ref *)ref_list;
2017 ref_list = ref_list->obj_next;
2018 ((mcache_obj_t *)rfa)->obj_next = NULL;
2019
2020 /*
2021 * If auditing is enabled, construct the shadow mbuf
2022 * in the audit structure instead of in the actual one.
2023 * mbuf_cslab_audit() will take care of restoring the
2024 * contents after the integrity check.
2025 */
2026 if (mclaudit != NULL) {
2027 mcache_audit_t *mca, *cl_mca;
2028 size_t size;
2029
2030 lck_mtx_lock(mbuf_mlock);
2031 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2032 ms = ((struct mbuf *)mca->mca_contents);
2033 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2034
2035 /*
2036 * Pair them up. Note that this is done at the time
2037 * the mbuf+cluster objects are constructed. This
2038 * information should be treated as a "best effort"
2039 * debugging hint, since more than one mbuf can refer
2040 * to a cluster. In that case, the cluster might not
2041 * be freed along with the mbuf it was paired with.
2042 */
2043 mca->mca_uptr = cl_mca;
2044 cl_mca->mca_uptr = mca;
2045
2046 ASSERT(mca->mca_uflags & MB_SCVALID);
2047 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2048 lck_mtx_unlock(mbuf_mlock);
2049
2050 /* Technically, they are in the freelist */
2051 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2052 m_maxsize(MC_MBUF));
2053 if (class == MC_MBUF_CL)
2054 size = m_maxsize(MC_CL);
2055 else if (class == MC_MBUF_BIGCL)
2056 size = m_maxsize(MC_BIGCL);
2057 else
2058 size = m_maxsize(MC_16KCL);
2059 mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size);
2060 }
2061
2062 MBUF_INIT(ms, 0, MT_FREE);
2063 if (class == MC_MBUF_16KCL) {
2064 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2065 } else if (class == MC_MBUF_BIGCL) {
2066 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2067 } else {
2068 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2069 }
2070 VERIFY(ms->m_flags == M_EXT);
2071 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2072
2073 *list = (mcache_obj_t *)m;
2074 (*list)->obj_next = NULL;
2075 list = *plist = &(*list)->obj_next;
2076 }
2077
2078 fail:
2079 /*
2080 * Free up what's left of the above.
2081 */
2082 if (mp_list != NULL)
2083 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2084 if (clp_list != NULL)
2085 mcache_free_ext(cp, clp_list);
2086 if (ref_list != NULL)
2087 mcache_free_ext(ref_cache, ref_list);
2088
2089 lck_mtx_lock(mbuf_mlock);
2090 if (num > 0 || cnum > 0) {
2091 m_total(class) += cnum;
2092 VERIFY(m_total(class) <= m_maxlimit(class));
2093 m_alloc_cnt(class) += num + cnum;
2094 }
2095 if ((num + cnum) < want)
2096 m_fail_cnt(class) += (want - (num + cnum));
2097 lck_mtx_unlock(mbuf_mlock);
2098
2099 return (num + cnum);
2100 }
2101
2102 /*
2103 * Common de-allocator for composite objects called by the CPU cache
2104 * layer when one or more elements need to be returned to the appropriate
2105 * global freelist.
2106 */
2107 static void
2108 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2109 {
2110 mbuf_class_t class = (mbuf_class_t)arg;
2111 unsigned int num;
2112 int w;
2113
2114 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2115
2116 lck_mtx_lock(mbuf_mlock);
2117
2118 num = cslab_free(class, list, purged);
2119 m_free_cnt(class) += num;
2120
2121 if ((w = mb_waiters) > 0)
2122 mb_waiters = 0;
2123
2124 lck_mtx_unlock(mbuf_mlock);
2125
2126 if (w != 0)
2127 wakeup(mb_waitchan);
2128 }
2129
2130 /*
2131 * Common auditor for composite objects called by the CPU cache layer
2132 * during an allocation or free request. For the former, this is called
2133 * after the objects are obtained from either the bucket or slab layer
2134 * and before they are returned to the caller. For the latter, this is
2135 * called immediately during free and before placing the objects into
2136 * the bucket or slab layer.
2137 */
2138 static void
2139 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2140 {
2141 mbuf_class_t class = (mbuf_class_t)arg;
2142 mcache_audit_t *mca;
2143 struct mbuf *m, *ms;
2144 mcl_slab_t *clsp, *nsp;
2145 size_t size;
2146 void *cl;
2147
2148 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2149
2150 while ((m = ms = (struct mbuf *)list) != NULL) {
2151 lck_mtx_lock(mbuf_mlock);
2152 /* Do the mbuf sanity checks and record its transaction */
2153 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2154 mcl_audit_mbuf(mca, m, TRUE, alloc);
2155 mcache_buffer_log(mca, m, m_cache(class));
2156 if (alloc)
2157 mca->mca_uflags |= MB_COMP_INUSE;
2158 else
2159 mca->mca_uflags &= ~MB_COMP_INUSE;
2160
2161 /*
2162 * Use the shadow mbuf in the audit structure if we are
2163 * freeing, since the contents of the actual mbuf have been
2164 * pattern-filled by the above call to mcl_audit_mbuf().
2165 */
2166 if (!alloc)
2167 ms = (struct mbuf *)mca->mca_contents;
2168
2169 /* Do the cluster sanity checks and record its transaction */
2170 cl = ms->m_ext.ext_buf;
2171 clsp = slab_get(cl);
2172 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2173 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2174 VERIFY(clsp->sl_refcnt == 1);
2175 if (class == MC_MBUF_BIGCL) {
2176 nsp = clsp->sl_next;
2177 /* Next slab must already be present */
2178 VERIFY(nsp != NULL);
2179 VERIFY(nsp->sl_refcnt == 1);
2180 } else if (class == MC_MBUF_16KCL) {
2181 int k;
2182 for (nsp = clsp, k = 1;
2183 k < (M16KCLBYTES / MCLBYTES); k++) {
2184 nsp = nsp->sl_next;
2185 /* Next slab must already be present */
2186 VERIFY(nsp != NULL);
2187 VERIFY(nsp->sl_refcnt == 1);
2188 }
2189 }
2190
2191 mca = mcl_audit_buf2mca(MC_CL, cl);
2192 if (class == MC_MBUF_CL)
2193 size = m_maxsize(MC_CL);
2194 else if (class == MC_MBUF_BIGCL)
2195 size = m_maxsize(MC_BIGCL);
2196 else
2197 size = m_maxsize(MC_16KCL);
2198 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2199 mcache_buffer_log(mca, cl, m_cache(class));
2200 if (alloc)
2201 mca->mca_uflags |= MB_COMP_INUSE;
2202 else
2203 mca->mca_uflags &= ~MB_COMP_INUSE;
2204 lck_mtx_unlock(mbuf_mlock);
2205
2206 list = list->obj_next;
2207 }
2208 }
2209
2210 /*
2211 * Allocate some number of mbuf clusters and place on cluster freelist.
2212 */
2213 static int
2214 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2215 {
2216 int i;
2217 vm_size_t size = 0;
2218 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2219 vm_offset_t page = 0;
2220 mcache_audit_t *mca_list = NULL;
2221 mcache_obj_t *con_list = NULL;
2222 mcl_slab_t *sp;
2223
2224 VERIFY(bufsize == m_maxsize(MC_CL) ||
2225 bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));
2226
2227 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2228
2229 /*
2230 * Multiple threads may attempt to populate the cluster map one
2231 * after another. Since we drop the lock below prior to acquiring
2232 * the physical page(s), our view of the cluster map may no longer
2233 * be accurate, and we could end up over-committing the pages beyond
2234 * the maximum allowed for each class. To prevent it, this entire
2235 * operation (including the page mapping) is serialized.
2236 */
2237 while (mb_clalloc_busy) {
2238 mb_clalloc_waiters++;
2239 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2240 (PZERO-1), "m_clalloc", NULL);
2241 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2242 }
2243
2244 /* We are busy now; tell everyone else to go away */
2245 mb_clalloc_busy = TRUE;
2246
2247 /*
2248 * Honor the caller's wish to block or not block. We have a way
2249 * to grow the pool asynchronously using the mbuf worker thread.
2250 */
2251 i = m_howmany(num, bufsize);
2252 if (i == 0 || (wait & M_DONTWAIT))
2253 goto out;
2254
2255 lck_mtx_unlock(mbuf_mlock);
2256
2257 size = round_page(i * bufsize);
2258 page = kmem_mb_alloc(mb_map, size, large_buffer);
2259
2260 /*
2261 * If we asked for "n" 16KB physically contiguous chunks
2262 * and didn't get them, try again without that
2263 * restriction.
2264 */
2265 if (large_buffer && page == 0)
2266 page = kmem_mb_alloc(mb_map, size, 0);
2267
2268 if (page == 0) {
2269 if (bufsize <= m_maxsize(MC_BIGCL)) {
2270 /* If that failed, try for one page, but only for 2KB/4KB requests */
2271 size = NBPG;
2272 page = kmem_mb_alloc(mb_map, size, 0);
2273 }
2274
2275 if (page == 0) {
2276 lck_mtx_lock(mbuf_mlock);
2277 goto out;
2278 }
2279 }
2280
2281 VERIFY(IS_P2ALIGNED(page, NBPG));
2282 numpages = size / NBPG;
2283
2284 /* If auditing is enabled, allocate the audit structures now */
2285 if (mclaudit != NULL) {
2286 int needed;
2287
2288 /*
2289 * Yes, I realize this is a waste of memory for clusters
2290 * that never get transformed into mbufs, as we may end
2291 * up with NMBPCL-1 unused audit structures per cluster.
2292 * But doing so tremendously simplifies the allocation
2293 * strategy, since at this point we are not holding the
2294 * mbuf lock and the caller is okay to be blocked. For
2295 * the case of big clusters, we allocate one structure
2296 * for each as we never turn them into mbufs.
2297 */
2298 if (bufsize == m_maxsize(MC_CL)) {
2299 needed = numpages * 2 * NMBPCL;
2300
2301 i = mcache_alloc_ext(mcl_audit_con_cache,
2302 &con_list, needed, MCR_SLEEP);
2303
2304 VERIFY(con_list != NULL && i == needed);
2305 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2306 needed = numpages;
2307 } else {
2308 needed = numpages / (M16KCLBYTES / NBPG);
2309 }
2310
2311 i = mcache_alloc_ext(mcache_audit_cache,
2312 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2313
2314 VERIFY(mca_list != NULL && i == needed);
2315 }
2316
2317 lck_mtx_lock(mbuf_mlock);
2318
2319 for (i = 0; i < numpages; i++, page += NBPG) {
2320 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2321 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2322 (vm_offset_t)page);
2323
2324 /*
2325 * If no mapper is available, the following call is a no-op
2326 * and returns the input page; if there is a mapper, the
2327 * appropriate I/O page is returned.
2328 */
2329 VERIFY(offset < mcl_pages);
2330 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2331 mcl_paddr[offset] = new_page << PGSHIFT;
2332
2333 /* Pattern-fill this fresh page */
2334 if (mclaudit != NULL)
2335 mcache_set_pattern(MCACHE_FREE_PATTERN,
2336 (caddr_t)page, NBPG);
2337
2338 if (bufsize == m_maxsize(MC_CL)) {
2339 union mcluster *mcl = (union mcluster *)page;
2340
2341 /* 1st cluster in the page */
2342 sp = slab_get(mcl);
2343 if (mclaudit != NULL)
2344 mcl_audit_init(mcl, &mca_list, &con_list,
2345 AUDIT_CONTENTS_SIZE, NMBPCL);
2346
2347 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2348 slab_init(sp, MC_CL, SLF_MAPPED,
2349 mcl, mcl, bufsize, 0, 1);
2350
2351 /* Insert this slab */
2352 slab_insert(sp, MC_CL);
2353
2354 /* Update stats now since slab_get() drops the lock */
2355 mbstat.m_clfree = ++m_infree(MC_CL) +
2356 m_infree(MC_MBUF_CL);
2357 mbstat.m_clusters = ++m_total(MC_CL);
2358 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2359
2360 /* 2nd cluster in the page */
2361 sp = slab_get(++mcl);
2362 if (mclaudit != NULL)
2363 mcl_audit_init(mcl, &mca_list, &con_list,
2364 AUDIT_CONTENTS_SIZE, NMBPCL);
2365
2366 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2367 slab_init(sp, MC_CL, SLF_MAPPED,
2368 mcl, mcl, bufsize, 0, 1);
2369
2370 /* Insert this slab */
2371 slab_insert(sp, MC_CL);
2372
2373 /* Update stats now since slab_get() drops the lock */
2374 mbstat.m_clfree = ++m_infree(MC_CL) +
2375 m_infree(MC_MBUF_CL);
2376 mbstat.m_clusters = ++m_total(MC_CL);
2377 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2378 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2379 union mbigcluster *mbc = (union mbigcluster *)page;
2380 mcl_slab_t *nsp;
2381
2382 /* One for the entire page */
2383 sp = slab_get(mbc);
2384 if (mclaudit != NULL)
2385 mcl_audit_init(mbc, &mca_list, NULL, 0, 1);
2386
2387 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2388 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2389 mbc, mbc, bufsize, 0, 1);
2390
2391 /* 2nd cluster's slab is part of the previous one */
2392 nsp = slab_get(((union mcluster *)page) + 1);
2393 slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL,
2394 mbc, NULL, 0, 0, 0);
2395
2396 /* Insert this slab */
2397 slab_insert(sp, MC_BIGCL);
2398
2399 /* Update stats now since slab_get() drops the lock */
2400 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2401 m_infree(MC_MBUF_BIGCL);
2402 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2403 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2404 } else if ((i % (M16KCLBYTES / NBPG)) == 0) {
2405 union m16kcluster *m16kcl = (union m16kcluster *)page;
2406 mcl_slab_t *nsp;
2407 int k;
2408
2409 VERIFY(njcl > 0);
2410 /* One for the entire 16KB */
2411 sp = slab_get(m16kcl);
2412 if (mclaudit != NULL)
2413 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2414
2415 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2416 slab_init(sp, MC_16KCL, SLF_MAPPED,
2417 m16kcl, m16kcl, bufsize, 0, 1);
2418
2419 /* 2nd-8th cluster's slab is part of the first one */
2420 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
2421 nsp = slab_get(((union mcluster *)page) + k);
2422 VERIFY(nsp->sl_refcnt == 0 &&
2423 nsp->sl_flags == 0);
2424 slab_init(nsp, MC_16KCL,
2425 SLF_MAPPED | SLF_PARTIAL,
2426 m16kcl, NULL, 0, 0, 0);
2427 }
2428
2429 /* Insert this slab */
2430 slab_insert(sp, MC_16KCL);
2431
2432 /* Update stats now since slab_get() drops the lock */
2433 m_infree(MC_16KCL)++;
2434 m_total(MC_16KCL)++;
2435 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2436 }
2437 }
2438 VERIFY(mca_list == NULL && con_list == NULL);
2439
2440 /* We're done; let others enter */
2441 mb_clalloc_busy = FALSE;
2442 if (mb_clalloc_waiters > 0) {
2443 mb_clalloc_waiters = 0;
2444 wakeup(mb_clalloc_waitchan);
2445 }
2446
2447 if (bufsize == m_maxsize(MC_CL))
2448 return (numpages << 1);
2449 else if (bufsize == m_maxsize(MC_BIGCL))
2450 return (numpages);
2451
2452 VERIFY(bufsize == m_maxsize(MC_16KCL));
2453 return (numpages / (M16KCLBYTES / NBPG));
2454
2455 out:
2456 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2457
2458 /* We're done; let others enter */
2459 mb_clalloc_busy = FALSE;
2460 if (mb_clalloc_waiters > 0) {
2461 mb_clalloc_waiters = 0;
2462 wakeup(mb_clalloc_waitchan);
2463 }
2464
2465 /*
2466 * In the non-blocking case, kick the worker thread if we have to grow
2467 * the pool or if the number of free clusters is less than requested.
2468 */
2469 if (bufsize == m_maxsize(MC_CL)) {
2470 if (i > 0) {
2471 /*
2472 * Remember total number of clusters needed
2473 * at this time.
2474 */
2475 i += m_total(MC_CL);
2476 if (i > mbuf_expand_mcl) {
2477 mbuf_expand_mcl = i;
2478 if (mbuf_worker_ready)
2479 wakeup((caddr_t)&mbuf_worker_run);
2480 }
2481 }
2482
2483 if (m_infree(MC_CL) >= num)
2484 return (1);
2485 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2486 if (i > 0) {
2487 /*
2488 * Remember total number of 4KB clusters needed
2489 * at this time.
2490 */
2491 i += m_total(MC_BIGCL);
2492 if (i > mbuf_expand_big) {
2493 mbuf_expand_big = i;
2494 if (mbuf_worker_ready)
2495 wakeup((caddr_t)&mbuf_worker_run);
2496 }
2497 }
2498
2499 if (m_infree(MC_BIGCL) >= num)
2500 return (1);
2501 } else {
2502 if (i > 0) {
2503 /*
2504 * Remember total number of 16KB clusters needed
2505 * at this time.
2506 */
2507 i += m_total(MC_16KCL);
2508 if (i > mbuf_expand_16k) {
2509 mbuf_expand_16k = i;
2510 if (mbuf_worker_ready)
2511 wakeup((caddr_t)&mbuf_worker_run);
2512 }
2513 }
2514
2515 if (m_infree(MC_16KCL) >= num)
2516 return (1);
2517 }
2518 return (0);
2519 }
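
/*
 * Sizing note (assuming a 4KB page size, i.e. NBPG == 4096): each mapped
 * page yields two 2KB clusters or one 4KB cluster, while a 16KB cluster
 * spans M16KCLBYTES / NBPG == 4 pages.  Hence the return values above:
 * (numpages << 1) for MC_CL, numpages for MC_BIGCL, and
 * numpages / (M16KCLBYTES / NBPG) for MC_16KCL.
 */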
2520
2521 /*
2522 * Populate the global freelist of the corresponding buffer class.
2523 */
2524 static int
2525 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2526 {
2527 mcache_obj_t *o = NULL;
2528 int i;
2529
2530 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2531 class == MC_16KCL);
2532
2533 #if CONFIG_MBUF_NOEXPAND
2534 if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) {
2535 #if DEBUG
2536 static int printonce = 1;
2537 if (printonce == 1) {
2538 printonce = 0;
2539 printf("m_expand failed, allocated %ld out of %d "
2540 "clusters\n", mbstat.m_mbufs / NMBPCL,
2541 nmbclusters);
2542 }
2543 #endif /* DEBUG */
2544 return (0);
2545 }
2546 #endif /* CONFIG_MBUF_NOEXPAND */
2547
2548 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2549
2550 switch (class) {
2551 case MC_MBUF:
2552 case MC_CL:
2553 i = m_clalloc(num, wait, m_maxsize(MC_CL));
2554
2555 /* Respect the 2K clusters minimum limit */
2556 if (m_total(MC_CL) == m_maxlimit(MC_CL) &&
2557 m_infree(MC_CL) <= m_minlimit(MC_CL)) {
2558 if (class != MC_CL || (wait & MCR_COMP))
2559 return (0);
2560 }
2561 if (class == MC_CL)
2562 return (i != 0);
2563 break;
2564
2565 case MC_BIGCL:
2566 case MC_16KCL:
2567 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2568 /* NOTREACHED */
2569
2570 default:
2571 VERIFY(0);
2572 /* NOTREACHED */
2573 }
2574
2575 /* Steal a cluster and cut it up to create NMBPCL mbufs */
2576 if ((o = slab_alloc(MC_CL, wait)) != NULL) {
2577 struct mbuf *m = (struct mbuf *)o;
2578 mcache_audit_t *mca = NULL;
2579 mcl_slab_t *sp = slab_get(o);
2580
2581 VERIFY(slab_is_detached(sp) &&
2582 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2583
2584 /* Make sure that the cluster is unmolested while in freelist */
2585 if (mclaudit != NULL) {
2586 mca = mcl_audit_buf2mca(MC_CL, o);
2587 mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL));
2588 }
2589
2590 /* Reinitialize it as an mbuf slab */
2591 slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL,
2592 sp->sl_len, 0, NMBPCL);
2593
2594 VERIFY(m == (struct mbuf *)sp->sl_base);
2595 VERIFY(sp->sl_head == NULL);
2596
2597 m_total(MC_MBUF) += NMBPCL;
2598 mbstat.m_mbufs = m_total(MC_MBUF);
2599 m_infree(MC_MBUF) += NMBPCL;
2600 mtype_stat_add(MT_FREE, NMBPCL);
2601
2602 i = NMBPCL;
2603 while (i--) {
2604 /*
2605 * If auditing is enabled, construct the shadow mbuf
2606 * in the audit structure instead of the actual one.
2607 * mbuf_slab_audit() will take care of restoring the
2608 * contents after the integrity check.
2609 */
2610 if (mclaudit != NULL) {
2611 struct mbuf *ms;
2612 mca = mcl_audit_buf2mca(MC_MBUF,
2613 (mcache_obj_t *)m);
2614 ms = ((struct mbuf *)mca->mca_contents);
2615 ms->m_type = MT_FREE;
2616 } else {
2617 m->m_type = MT_FREE;
2618 }
2619 m->m_next = sp->sl_head;
2620 sp->sl_head = (void *)m++;
2621 }
2622
2623 /* Insert it into the mbuf class's slab list */
2624 slab_insert(sp, MC_MBUF);
2625
2626 if ((i = mb_waiters) > 0)
2627 mb_waiters = 0;
2628 if (i != 0)
2629 wakeup(mb_waitchan);
2630
2631 return (1);
2632 }
2633
2634 return (0);
2635 }
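
/*
 * Sizing note (assuming MSIZE == 256 and 2KB clusters): NMBPCL, the number
 * of mbufs per cluster, works out to MCLBYTES / MSIZE == 8, so each 2KB
 * cluster stolen above yields 8 free mbufs.
 */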
2636
2637 /*
2638 * (Inaccurately) check if it might be worth a trip back to the
2639 * mcache layer due to the availability of objects there. We'll
2640 * end up back here if there's nothing up there.
2641 */
2642 static boolean_t
2643 mbuf_cached_above(mbuf_class_t class, int wait)
2644 {
2645 switch (class) {
2646 case MC_MBUF:
2647 if (wait & MCR_COMP)
2648 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2649 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2650 break;
2651
2652 case MC_CL:
2653 if (wait & MCR_COMP)
2654 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
2655 break;
2656
2657 case MC_BIGCL:
2658 if (wait & MCR_COMP)
2659 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2660 break;
2661
2662 case MC_16KCL:
2663 if (wait & MCR_COMP)
2664 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
2665 break;
2666
2667 case MC_MBUF_CL:
2668 case MC_MBUF_BIGCL:
2669 case MC_MBUF_16KCL:
2670 break;
2671
2672 default:
2673 VERIFY(0);
2674 /* NOTREACHED */
2675 }
2676
2677 return (!mcache_bkt_isempty(m_cache(class)));
2678 }
2679
2680 /*
2681 * If possible, convert constructed objects to raw ones.
2682 */
2683 static boolean_t
2684 mbuf_steal(mbuf_class_t class, unsigned int num)
2685 {
2686 mcache_obj_t *top = NULL;
2687 mcache_obj_t **list = &top;
2688 unsigned int tot = 0;
2689
2690 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2691
2692 switch (class) {
2693 case MC_MBUF:
2694 case MC_CL:
2695 case MC_BIGCL:
2696 case MC_16KCL:
2697 return (FALSE);
2698
2699 case MC_MBUF_CL:
2700 case MC_MBUF_BIGCL:
2701 case MC_MBUF_16KCL:
2702 /* Get the required number of constructed objects if possible */
2703 if (m_infree(class) > m_minlimit(class)) {
2704 tot = cslab_alloc(class, &list,
2705 MIN(num, m_infree(class)));
2706 }
2707
2708 /* And destroy them to get back the raw objects */
2709 if (top != NULL)
2710 (void) cslab_free(class, top, 1);
2711 break;
2712
2713 default:
2714 VERIFY(0);
2715 /* NOTREACHED */
2716 }
2717
2718 return (tot == num);
2719 }
2720
2721 static void
2722 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
2723 {
2724 int m, bmap = 0;
2725
2726 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2727
2728 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2729 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2730 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2731
2732 /*
2733 * This logic can be made smarter; for now, simply mark
2734 * all other related classes as potential victims.
2735 */
2736 switch (class) {
2737 case MC_MBUF:
2738 m_wantpurge(MC_CL)++;
2739 m_wantpurge(MC_MBUF_CL)++;
2740 m_wantpurge(MC_MBUF_BIGCL)++;
2741 break;
2742
2743 case MC_CL:
2744 m_wantpurge(MC_MBUF)++;
2745 if (!comp)
2746 m_wantpurge(MC_MBUF_CL)++;
2747 break;
2748
2749 case MC_BIGCL:
2750 if (!comp)
2751 m_wantpurge(MC_MBUF_BIGCL)++;
2752 break;
2753
2754 case MC_16KCL:
2755 if (!comp)
2756 m_wantpurge(MC_MBUF_16KCL)++;
2757 break;
2758
2759 default:
2760 VERIFY(0);
2761 /* NOTREACHED */
2762 }
2763
2764 /*
2765 * Run through each marked class and check if we really need to
2766 * purge (and therefore temporarily disable) the per-CPU caches
2767 * layer used by the class. If so, remember the classes since
2768 * we are going to drop the lock below prior to purging.
2769 */
2770 for (m = 0; m < NELEM(mbuf_table); m++) {
2771 if (m_wantpurge(m) > 0) {
2772 m_wantpurge(m) = 0;
2773 /*
2774 * Try hard to steal the required number of objects
2775 * from the freelist of other mbuf classes. Only
2776 * purge and disable the per-CPU caches layer when
2777 * we don't have enough; it's the last resort.
2778 */
2779 if (!mbuf_steal(m, num))
2780 bmap |= (1 << m);
2781 }
2782 }
2783
2784 lck_mtx_unlock(mbuf_mlock);
2785
2786 if (bmap != 0) {
2787 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2788 do_reclaim = 1;
2789
2790 /* Sigh; we have no other choices but to ask mcache to purge */
2791 for (m = 0; m < NELEM(mbuf_table); m++) {
2792 if ((bmap & (1 << m)) &&
2793 mcache_purge_cache(m_cache(m))) {
2794 lck_mtx_lock(mbuf_mlock);
2795 m_purge_cnt(m)++;
2796 mbstat.m_drain++;
2797 lck_mtx_unlock(mbuf_mlock);
2798 }
2799 }
2800 } else {
2801 /*
2802 * Request mcache to reap extra elements from all of its caches;
2803 * note that all reaps are serialized and happen only at a fixed
2804 * interval.
2805 */
2806 mcache_reap();
2807 }
2808 lck_mtx_lock(mbuf_mlock);
2809 }
2810
2811 static inline struct mbuf *
2812 m_get_common(int wait, short type, int hdr)
2813 {
2814 struct mbuf *m;
2815 int mcflags = MSLEEPF(wait);
2816
2817 /* Is this due to a non-blocking retry? If so, then try harder */
2818 if (mcflags & MCR_NOSLEEP)
2819 mcflags |= MCR_TRYHARD;
2820
2821 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
2822 if (m != NULL) {
2823 MBUF_INIT(m, hdr, type);
2824 mtype_stat_inc(type);
2825 mtype_stat_dec(MT_FREE);
2826 #if CONFIG_MACF_NET
2827 if (hdr && mac_init_mbuf(m, wait) != 0) {
2828 m_free(m);
2829 return (NULL);
2830 }
2831 #endif /* CONFIG_MACF_NET */
2832 }
2833 return (m);
2834 }
2835
2836 /*
2837 * Space allocation routines; these are also available as macros
2838 * for critical paths.
2839 */
2840 #define _M_GET(wait, type) m_get_common(wait, type, 0)
2841 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
2842 #define _M_RETRY(wait, type) _M_GET(wait, type)
2843 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
2844 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
2845 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
2846
2847 struct mbuf *
2848 m_get(int wait, int type)
2849 {
2850 return (_M_GET(wait, type));
2851 }
2852
2853 struct mbuf *
2854 m_gethdr(int wait, int type)
2855 {
2856 return (_M_GETHDR(wait, type));
2857 }
2858
2859 struct mbuf *
2860 m_retry(int wait, int type)
2861 {
2862 return (_M_RETRY(wait, type));
2863 }
2864
2865 struct mbuf *
2866 m_retryhdr(int wait, int type)
2867 {
2868 return (_M_RETRYHDR(wait, type));
2869 }
2870
2871 struct mbuf *
2872 m_getclr(int wait, int type)
2873 {
2874 struct mbuf *m;
2875
2876 _MGET(m, wait, type);
2877 if (m != NULL)
2878 bzero(MTOD(m, caddr_t), MLEN);
2879 return (m);
2880 }
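
/*
 * Example usage of the basic allocators (illustrative sketch only; "src"
 * and "len" are hypothetical):
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		bcopy(src, MTOD(m, caddr_t), len);	// len <= MHLEN
 *		m->m_len = m->m_pkthdr.len = len;
 *		...
 *		m_free(m);		// or hand the mbuf off instead
 *	}
 */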
2881
2882 struct mbuf *
2883 m_free(struct mbuf *m)
2884 {
2885 struct mbuf *n = m->m_next;
2886
2887 if (m->m_type == MT_FREE)
2888 panic("m_free: freeing an already freed mbuf");
2889
2890 /* Free the aux data and tags if there are any */
2891 if (m->m_flags & M_PKTHDR) {
2892 m_tag_delete_chain(m, NULL);
2893 }
2894
2895 if (m->m_flags & M_EXT) {
2896 u_int32_t refcnt;
2897 u_int32_t flags;
2898
2899 refcnt = m_decref(m);
2900 flags = MEXT_FLAGS(m);
2901 if (refcnt == 0 && flags == 0) {
2902 if (m->m_ext.ext_free == NULL) {
2903 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2904 } else if (m->m_ext.ext_free == m_bigfree) {
2905 mcache_free(m_cache(MC_BIGCL),
2906 m->m_ext.ext_buf);
2907 } else if (m->m_ext.ext_free == m_16kfree) {
2908 mcache_free(m_cache(MC_16KCL),
2909 m->m_ext.ext_buf);
2910 } else {
2911 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2912 m->m_ext.ext_size, m->m_ext.ext_arg);
2913 }
2914 mcache_free(ref_cache, MEXT_RFA(m));
2915 MEXT_RFA(m) = NULL;
2916 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2917 VERIFY(m->m_type != MT_FREE);
2918
2919 mtype_stat_dec(m->m_type);
2920 mtype_stat_inc(MT_FREE);
2921
2922 m->m_type = MT_FREE;
2923 m->m_flags = M_EXT;
2924 m->m_len = 0;
2925 m->m_next = m->m_nextpkt = NULL;
2926
2927 /* "Free" into the intermediate cache */
2928 if (m->m_ext.ext_free == NULL) {
2929 mcache_free(m_cache(MC_MBUF_CL), m);
2930 } else if (m->m_ext.ext_free == m_bigfree) {
2931 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2932 } else {
2933 VERIFY(m->m_ext.ext_free == m_16kfree);
2934 mcache_free(m_cache(MC_MBUF_16KCL), m);
2935 }
2936 return (n);
2937 }
2938 }
2939
2940 if (m->m_type != MT_FREE) {
2941 mtype_stat_dec(m->m_type);
2942 mtype_stat_inc(MT_FREE);
2943 }
2944
2945 m->m_type = MT_FREE;
2946 m->m_flags = m->m_len = 0;
2947 m->m_next = m->m_nextpkt = NULL;
2948
2949 mcache_free(m_cache(MC_MBUF), m);
2950
2951 return (n);
2952 }
2953
2954 __private_extern__ struct mbuf *
2955 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
2956 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
2957 int wait)
2958 {
2959 struct ext_ref *rfa = NULL;
2960
2961 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
2962 return (NULL);
2963
2964 if (m->m_flags & M_EXT) {
2965 u_int32_t refcnt;
2966 u_int32_t flags;
2967
2968 refcnt = m_decref(m);
2969 flags = MEXT_FLAGS(m);
2970 if (refcnt == 0 && flags == 0) {
2971 if (m->m_ext.ext_free == NULL) {
2972 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2973 } else if (m->m_ext.ext_free == m_bigfree) {
2974 mcache_free(m_cache(MC_BIGCL),
2975 m->m_ext.ext_buf);
2976 } else if (m->m_ext.ext_free == m_16kfree) {
2977 mcache_free(m_cache(MC_16KCL),
2978 m->m_ext.ext_buf);
2979 } else {
2980 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2981 m->m_ext.ext_size, m->m_ext.ext_arg);
2982 }
2983 /* Re-use the reference structure */
2984 rfa = MEXT_RFA(m);
2985 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2986 VERIFY(m->m_type != MT_FREE);
2987
2988 mtype_stat_dec(m->m_type);
2989 mtype_stat_inc(MT_FREE);
2990
2991 m->m_type = MT_FREE;
2992 m->m_flags = M_EXT;
2993 m->m_len = 0;
2994 m->m_next = m->m_nextpkt = NULL;
2995 /* "Free" into the intermediate cache */
2996 if (m->m_ext.ext_free == NULL) {
2997 mcache_free(m_cache(MC_MBUF_CL), m);
2998 } else if (m->m_ext.ext_free == m_bigfree) {
2999 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3000 } else {
3001 VERIFY(m->m_ext.ext_free == m_16kfree);
3002 mcache_free(m_cache(MC_MBUF_16KCL), m);
3003 }
3004 /*
3005 * Allocate a new mbuf, since we didn't divorce
3006 * the composite mbuf + cluster pair above.
3007 */
3008 if ((m = _M_GETHDR(wait, type)) == NULL)
3009 return (NULL);
3010 }
3011 }
3012
3013 if (rfa == NULL &&
3014 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3015 m_free(m);
3016 return (NULL);
3017 }
3018
3019 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3020
3021 return (m);
3022 }
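
/*
 * Example usage (illustrative sketch; "my_buf", "my_size" and "my_extfree"
 * are hypothetical caller-supplied names):
 *
 *	static void my_extfree(caddr_t buf, u_int size, caddr_t arg);
 *
 *	struct mbuf *m = m_clattach(NULL, MT_DATA, my_buf, my_extfree,
 *	    my_size, NULL, M_WAIT);
 *
 * On success the external buffer is attached with a reference count of 1,
 * and my_extfree() is invoked once the last reference is released.
 */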
3023
3024 /*
3025 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3026 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3027 */
3028 struct mbuf *
3029 m_getcl(int wait, int type, int flags)
3030 {
3031 struct mbuf *m;
3032 int mcflags = MSLEEPF(wait);
3033 int hdr = (flags & M_PKTHDR);
3034
3035 /* Is this due to a non-blocking retry? If so, then try harder */
3036 if (mcflags & MCR_NOSLEEP)
3037 mcflags |= MCR_TRYHARD;
3038
3039 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3040 if (m != NULL) {
3041 MBUF_INIT(m, hdr, type);
3042 mtype_stat_inc(type);
3043 mtype_stat_dec(MT_FREE);
3044 #if CONFIG_MACF_NET
3045 if (hdr && mac_init_mbuf(m, wait) != 0) {
3046 m_free(m);
3047 return (NULL);
3048 }
3049 #endif /* CONFIG_MACF_NET */
3050 }
3051 return (m);
3052 }
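
/*
 * Example usage (illustrative sketch):
 *
 *	struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m != NULL) {
 *		// m is a packet header mbuf with a 2KB cluster attached;
 *		// up to MCLBYTES of data fit at MTOD(m, caddr_t).
 *	}
 */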
3053
3054 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3055 struct mbuf *
3056 m_mclget(struct mbuf *m, int wait)
3057 {
3058 struct ext_ref *rfa;
3059
3060 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3061 return (m);
3062
3063 m->m_ext.ext_buf = m_mclalloc(wait);
3064 if (m->m_ext.ext_buf != NULL) {
3065 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3066 } else {
3067 mcache_free(ref_cache, rfa);
3068 }
3069 return (m);
3070 }
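
/*
 * Example usage (illustrative sketch): m_mclget() returns the mbuf whether
 * or not the cluster attach succeeded, so callers check the M_EXT flag:
 *
 *	struct mbuf *m = m_gethdr(M_WAIT, MT_DATA);
 *	if (m != NULL) {
 *		m = m_mclget(m, M_WAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);	// cluster allocation failed
 *			m = NULL;
 *		}
 *	}
 */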
3071
3072 /* Allocate an mbuf cluster */
3073 caddr_t
3074 m_mclalloc(int wait)
3075 {
3076 int mcflags = MSLEEPF(wait);
3077
3078 /* Is this due to a non-blocking retry? If so, then try harder */
3079 if (mcflags & MCR_NOSLEEP)
3080 mcflags |= MCR_TRYHARD;
3081
3082 return (mcache_alloc(m_cache(MC_CL), mcflags));
3083 }
3084
3085 /* Free an mbuf cluster */
3086 void
3087 m_mclfree(caddr_t p)
3088 {
3089 mcache_free(m_cache(MC_CL), p);
3090 }
3091
3092 /*
3093 * m_mclhasreference() checks if the cluster of an mbuf is referenced by
3094 * another mbuf.
3095 */
3096 int
3097 m_mclhasreference(struct mbuf *m)
3098 {
3099 if (!(m->m_flags & M_EXT))
3100 return (0);
3101
3102 ASSERT(MEXT_RFA(m) != NULL);
3103
3104 return (MEXT_REF(m) > 1);
3105 }
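
/*
 * Example usage (illustrative sketch): a caller that wants to modify
 * cluster data in place first checks whether the cluster is shared:
 *
 *	if (m_mclhasreference(m)) {
 *		// the cluster is referenced elsewhere; copy before writing
 *	}
 */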
3106
3107 __private_extern__ caddr_t
3108 m_bigalloc(int wait)
3109 {
3110 int mcflags = MSLEEPF(wait);
3111
3112 /* Is this due to a non-blocking retry? If so, then try harder */
3113 if (mcflags & MCR_NOSLEEP)
3114 mcflags |= MCR_TRYHARD;
3115
3116 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3117 }
3118
3119 __private_extern__ void
3120 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3121 {
3122 mcache_free(m_cache(MC_BIGCL), p);
3123 }
3124
3125 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3126 __private_extern__ struct mbuf *
3127 m_mbigget(struct mbuf *m, int wait)
3128 {
3129 struct ext_ref *rfa;
3130
3131 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3132 return (m);
3133
3134 m->m_ext.ext_buf = m_bigalloc(wait);
3135 if (m->m_ext.ext_buf != NULL) {
3136 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3137 } else {
3138 mcache_free(ref_cache, rfa);
3139 }
3140 return (m);
3141 }
3142
3143 __private_extern__ caddr_t
3144 m_16kalloc(int wait)
3145 {
3146 int mcflags = MSLEEPF(wait);
3147
3148 /* Is this due to a non-blocking retry? If so, then try harder */
3149 if (mcflags & MCR_NOSLEEP)
3150 mcflags |= MCR_TRYHARD;
3151
3152 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3153 }
3154
3155 __private_extern__ void
3156 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3157 {
3158 mcache_free(m_cache(MC_16KCL), p);
3159 }
3160
3161 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3162 __private_extern__ struct mbuf *
3163 m_m16kget(struct mbuf *m, int wait)
3164 {
3165 struct ext_ref *rfa;
3166
3167 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3168 return (m);
3169
3170 m->m_ext.ext_buf = m_16kalloc(wait);
3171 if (m->m_ext.ext_buf != NULL) {
3172 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3173 } else {
3174 mcache_free(ref_cache, rfa);
3175 }
3176 return (m);
3177 }
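
/*
 * Example usage (illustrative sketch; the jumbo cluster pool exists only
 * when njcl > 0, so the attach may fail on some configurations):
 *
 *	struct mbuf *m = m_gethdr(M_WAIT, MT_DATA);
 *	if (m != NULL) {
 *		m = m_m16kget(m, M_WAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);	// no 16KB cluster available
 *			m = NULL;
 *		}
 *	}
 */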
3178
3179 /*
3180 * "Move" mbuf pkthdr from "from" to "to".
3181 * "from" must have M_PKTHDR set, and "to" must be empty.
3182 */
3183 void
3184 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3185 {
3186 /* We will be taking over the tags of 'to' */
3187 if (to->m_flags & M_PKTHDR)
3188 m_tag_delete_chain(to, NULL);
3189 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3190 m_tag_init(from); /* purge tags from src */
3191 m_prio_init(from); /* reset priority from src */
3192 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3193 if ((to->m_flags & M_EXT) == 0)
3194 to->m_data = to->m_pktdat;
3195 }
3196
3197 /*
3198 * Duplicate "from"'s mbuf pkthdr in "to".
3199 * "from" must have M_PKTHDR set, and "to" must be empty.
3200 * In particular, this does a deep copy of the packet tags.
3201 */
3202 static int
3203 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3204 {
3205 if (to->m_flags & M_PKTHDR)
3206 m_tag_delete_chain(to, NULL);
3207 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3208 if ((to->m_flags & M_EXT) == 0)
3209 to->m_data = to->m_pktdat;
3210 to->m_pkthdr = from->m_pkthdr;
3211 m_tag_init(to);
3212 return (m_tag_copy_chain(to, from, how));
3213 }
3214
3215 /*
3216 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3217 * if wantall is not set, return however many were available. Set up the
3218 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3219 * are chained on the m_nextpkt field. Any packets requested beyond this
3220 * are chained onto the last packet header's m_next field. The size of
3221 * the cluster is controlled by the parameter bufsize.
3222 */
3223 __private_extern__ struct mbuf *
3224 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3225 int wait, int wantall, size_t bufsize)
3226 {
3227 struct mbuf *m;
3228 struct mbuf **np, *top;
3229 unsigned int pnum, needed = *num_needed;
3230 mcache_obj_t *mp_list = NULL;
3231 int mcflags = MSLEEPF(wait);
3232 u_int32_t flag;
3233 struct ext_ref *rfa;
3234 mcache_t *cp;
3235 void *cl;
3236
3237 ASSERT(bufsize == m_maxsize(MC_CL) ||
3238 bufsize == m_maxsize(MC_BIGCL) ||
3239 bufsize == m_maxsize(MC_16KCL));
3240
3241 /*
3242 * Caller must first check for njcl because this
3243 * routine is internal and not exposed/used via KPI.
3244 */
3245 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3246
3247 top = NULL;
3248 np = &top;
3249 pnum = 0;
3250
3251 /*
3252 * If the caller doesn't want all the requested buffers, or cannot
3253 * block, try hard to get what we can without blocking. This
3254 * effectively overrides MCR_SLEEP, since this thread will not go to
3255 * sleep if we can't get all the buffers.
3256 */
3257 if (!wantall || (mcflags & MCR_NOSLEEP))
3258 mcflags |= MCR_TRYHARD;
3259
3260 /* Allocate the composite mbuf + cluster elements from the cache */
3261 if (bufsize == m_maxsize(MC_CL))
3262 cp = m_cache(MC_MBUF_CL);
3263 else if (bufsize == m_maxsize(MC_BIGCL))
3264 cp = m_cache(MC_MBUF_BIGCL);
3265 else
3266 cp = m_cache(MC_MBUF_16KCL);
3267 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3268
3269 for (pnum = 0; pnum < needed; pnum++) {
3270 m = (struct mbuf *)mp_list;
3271 mp_list = mp_list->obj_next;
3272
3273 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3274 cl = m->m_ext.ext_buf;
3275 rfa = MEXT_RFA(m);
3276
3277 ASSERT(cl != NULL && rfa != NULL);
3278 VERIFY(MBUF_IS_COMPOSITE(m));
3279
3280 flag = MEXT_FLAGS(m);
3281
3282 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3283 if (bufsize == m_maxsize(MC_16KCL)) {
3284 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3285 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3286 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3287 } else {
3288 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3289 }
3290
3291 if (num_with_pkthdrs > 0) {
3292 --num_with_pkthdrs;
3293 #if CONFIG_MACF_NET
3294 if (mac_mbuf_label_init(m, wait) != 0) {
3295 m_free(m);
3296 break;
3297 }
3298 #endif /* CONFIG_MACF_NET */
3299 }
3300
3301 *np = m;
3302 if (num_with_pkthdrs > 0)
3303 np = &m->m_nextpkt;
3304 else
3305 np = &m->m_next;
3306 }
3307 ASSERT(pnum != *num_needed || mp_list == NULL);
3308 if (mp_list != NULL)
3309 mcache_free_ext(cp, mp_list);
3310
3311 if (pnum > 0) {
3312 mtype_stat_add(MT_DATA, pnum);
3313 mtype_stat_sub(MT_FREE, pnum);
3314 }
3315
3316 if (wantall && (pnum != *num_needed)) {
3317 if (top != NULL)
3318 m_freem_list(top);
3319 return (NULL);
3320 }
3321
3322 *num_needed = pnum;
3323 return (top);
3324 }
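
/*
 * Example usage (illustrative sketch of a batch request; the counts are
 * arbitrary):
 *
 *	unsigned int n = 32;
 *	struct mbuf *list;
 *
 *	list = m_getpackets_internal(&n, 32, M_DONTWAIT, 0, m_maxsize(MC_CL));
 *
 * With wantall == 0, "list" chains up to 32 packet header mbufs (each with
 * a 2KB cluster) via m_nextpkt, and "n" is updated to the number actually
 * obtained.
 */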
3325
3326 /*
3327 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3328 * wantall is not set, return however many were available. The size of
3329 * each mbuf in the list is controlled by the parameter packetlen. Each
3330 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3331 * in the chain is called a segment. If maxsegments is not null and the
3332 * value pointed to is not null, it specifies the maximum number of segments
3333 * for a chain of mbufs. If maxsegments is null or the value pointed to
3334 * is zero, the caller does not have any restriction on the number of segments.
3335 * The actual number of segments of an mbuf chain is returned in the value
3336 * pointed to by maxsegments.
3337 */
3338 __private_extern__ struct mbuf *
3339 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3340 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3341 {
3342 struct mbuf **np, *top, *first = NULL;
3343 size_t bufsize, r_bufsize;
3344 unsigned int num = 0;
3345 unsigned int nsegs = 0;
3346 unsigned int needed, resid;
3347 int mcflags = MSLEEPF(wait);
3348 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3349 mcache_t *cp = NULL, *rcp = NULL;
3350
3351 if (*numlist == 0)
3352 return (NULL);
3353
3354 top = NULL;
3355 np = &top;
3356
3357 if (wantsize == 0) {
3358 if (packetlen <= MINCLSIZE) {
3359 bufsize = packetlen;
3360 } else if (packetlen > m_maxsize(MC_CL)) {
3361 /* Use 4KB if jumbo cluster pool isn't available */
3362 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3363 bufsize = m_maxsize(MC_BIGCL);
3364 else
3365 bufsize = m_maxsize(MC_16KCL);
3366 } else {
3367 bufsize = m_maxsize(MC_CL);
3368 }
3369 } else if (wantsize == m_maxsize(MC_CL) ||
3370 wantsize == m_maxsize(MC_BIGCL) ||
3371 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3372 bufsize = wantsize;
3373 } else {
3374 return (NULL);
3375 }
3376
3377 if (bufsize <= MHLEN) {
3378 nsegs = 1;
3379 } else if (bufsize <= MINCLSIZE) {
3380 if (maxsegments != NULL && *maxsegments == 1) {
3381 bufsize = m_maxsize(MC_CL);
3382 nsegs = 1;
3383 } else {
3384 nsegs = 2;
3385 }
3386 } else if (bufsize == m_maxsize(MC_16KCL)) {
3387 VERIFY(njcl > 0);
3388 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3389 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3390 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3391 } else {
3392 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3393 }
3394 if (maxsegments != NULL) {
3395 if (*maxsegments && nsegs > *maxsegments) {
3396 *maxsegments = nsegs;
3397 return (NULL);
3398 }
3399 *maxsegments = nsegs;
3400 }
3401
3402 /*
3403 * If the caller doesn't want all the requested buffers, or cannot
3404 * block, try hard to get what we can without blocking. This
3405 * effectively overrides MCR_SLEEP, since this thread will not go to
3406 * sleep if we can't get all the buffers.
3407 */
3408 if (!wantall || (mcflags & MCR_NOSLEEP))
3409 mcflags |= MCR_TRYHARD;
3410
3411 /*
3412 * Simple case where all elements in the lists/chains are mbufs.
3413 * Unless bufsize is greater than MHLEN, each segment chain is made
3414 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3415 * of 2 mbufs; the second one is used for the residual data, i.e.
3416 * the remaining data that cannot fit into the first mbuf.
3417 */
3418 if (bufsize <= MINCLSIZE) {
3419 /* Allocate the elements in one shot from the mbuf cache */
3420 ASSERT(bufsize <= MHLEN || nsegs == 2);
3421 cp = m_cache(MC_MBUF);
3422 needed = mcache_alloc_ext(cp, &mp_list,
3423 (*numlist) * nsegs, mcflags);
3424
3425 /*
3426 * The number of elements must be even if we are to use an
3427 * mbuf (instead of a cluster) to store the residual data.
3428 * If we couldn't allocate the requested number of mbufs,
3429 * trim the number down (if it's odd) in order to avoid
3430 * creating a partial segment chain.
3431 */
3432 if (bufsize > MHLEN && (needed & 0x1))
3433 needed--;
3434
3435 while (num < needed) {
3436 struct mbuf *m;
3437
3438 m = (struct mbuf *)mp_list;
3439 mp_list = mp_list->obj_next;
3440 ASSERT(m != NULL);
3441
3442 MBUF_INIT(m, 1, MT_DATA);
3443 #if CONFIG_MACF_NET
3444 if (mac_init_mbuf(m, wait) != 0) {
3445 m_free(m);
3446 break;
3447 }
3448 #endif /* CONFIG_MACF_NET */
3449 num++;
3450 if (bufsize > MHLEN) {
3451 /* A second mbuf for this segment chain */
3452 m->m_next = (struct mbuf *)mp_list;
3453 mp_list = mp_list->obj_next;
3454 ASSERT(m->m_next != NULL);
3455
3456 MBUF_INIT(m->m_next, 0, MT_DATA);
3457 num++;
3458 }
3459 *np = m;
3460 np = &m->m_nextpkt;
3461 }
3462 ASSERT(num != *numlist || mp_list == NULL);
3463
3464 if (num > 0) {
3465 mtype_stat_add(MT_DATA, num);
3466 mtype_stat_sub(MT_FREE, num);
3467 }
3468 num /= nsegs;
3469
3470 /* We've got them all; return to caller */
3471 if (num == *numlist)
3472 return (top);
3473
3474 goto fail;
3475 }
3476
3477 /*
3478 * Complex cases where elements are made up of one or more composite
3479 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3480 * be illustrated as follows:
3481 *
3482 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3483 *
3484 * Every composite mbuf + cluster element comes from the intermediate
3485 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3486 * the last composite element will come from the MC_MBUF_CL cache,
3487 * unless the residual data is larger than 2KB where we use the
3488 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3489 * data is defined as extra data beyond the first element that cannot
3490 * fit into the previous element, i.e. there is no residual data if
3491 * the chain only has 1 segment.
3492 */
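/*
 * Worked example (assuming a 4KB page size and njcl == 0): for
 * packetlen == 6KB with wantsize == 0, bufsize becomes 4KB and nsegs == 2;
 * the residual is 6KB % 4KB == 2KB, which fits a 2KB cluster, so each
 * chain is [mbuf + 4KB cluster] -> [mbuf + 2KB cluster].
 */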
3493 r_bufsize = bufsize;
3494 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3495 if (resid > 0) {
3496 /* There is residual data; figure out the cluster size */
3497 if (wantsize == 0 && packetlen > MINCLSIZE) {
3498 /*
3499 * Caller didn't request that all of the segments
3500 * in the chain use the same cluster size; use the
3501 * smaller of the cluster sizes.
3502 */
3503 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3504 r_bufsize = m_maxsize(MC_16KCL);
3505 else if (resid > m_maxsize(MC_CL))
3506 r_bufsize = m_maxsize(MC_BIGCL);
3507 else
3508 r_bufsize = m_maxsize(MC_CL);
3509 } else {
3510 /* Use the same cluster size as the other segments */
3511 resid = 0;
3512 }
3513 }
3514
3515 needed = *numlist;
3516 if (resid > 0) {
3517 /*
3518 * Attempt to allocate composite mbuf + cluster elements for
3519 * the residual data in each chain; record the number of such
3520 * elements that can be allocated so that we know how many
3521 * segment chains we can afford to create.
3522 */
3523 if (r_bufsize <= m_maxsize(MC_CL))
3524 rcp = m_cache(MC_MBUF_CL);
3525 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3526 rcp = m_cache(MC_MBUF_BIGCL);
3527 else
3528 rcp = m_cache(MC_MBUF_16KCL);
3529 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3530
3531 if (needed == 0)
3532 goto fail;
3533
3534 /* This is temporarily reduced for calculation */
3535 ASSERT(nsegs > 1);
3536 nsegs--;
3537 }
3538
3539 /*
3540 * Attempt to allocate the rest of the composite mbuf + cluster
3541 * elements for the number of segment chains that we need.
3542 */
3543 if (bufsize <= m_maxsize(MC_CL))
3544 cp = m_cache(MC_MBUF_CL);
3545 else if (bufsize <= m_maxsize(MC_BIGCL))
3546 cp = m_cache(MC_MBUF_BIGCL);
3547 else
3548 cp = m_cache(MC_MBUF_16KCL);
3549 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3550
3551 /* Round it down to avoid creating a partial segment chain */
3552 needed = (needed / nsegs) * nsegs;
3553 if (needed == 0)
3554 goto fail;
3555
3556 if (resid > 0) {
3557 /*
3558 * We're about to construct the chain(s); take into account
3559 * the number of segments we have created above to hold the
3560 * residual data for each chain, as well as restore the
3561 * original count of segments per chain.
3562 */
3563 ASSERT(nsegs > 0);
3564 needed += needed / nsegs;
3565 nsegs++;
3566 }
3567
3568 for (;;) {
3569 struct mbuf *m;
3570 u_int32_t flag;
3571 struct ext_ref *rfa;
3572 void *cl;
3573 int pkthdr;
3574
3575 ++num;
3576 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3577 m = (struct mbuf *)mp_list;
3578 mp_list = mp_list->obj_next;
3579 } else {
3580 m = (struct mbuf *)rmp_list;
3581 rmp_list = rmp_list->obj_next;
3582 }
3583 ASSERT(m != NULL);
3584 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3585 VERIFY(m->m_ext.ext_free == NULL ||
3586 m->m_ext.ext_free == m_bigfree ||
3587 m->m_ext.ext_free == m_16kfree);
3588
3589 cl = m->m_ext.ext_buf;
3590 rfa = MEXT_RFA(m);
3591
3592 ASSERT(cl != NULL && rfa != NULL);
3593 VERIFY(MBUF_IS_COMPOSITE(m));
3594
3595 flag = MEXT_FLAGS(m);
3596
3597 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3598 if (pkthdr)
3599 first = m;
3600 MBUF_INIT(m, pkthdr, MT_DATA);
3601 if (m->m_ext.ext_free == m_16kfree) {
3602 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3603 } else if (m->m_ext.ext_free == m_bigfree) {
3604 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3605 } else {
3606 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3607 }
3608 #if CONFIG_MACF_NET
3609 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
3610 --num;
3611 m_free(m);
3612 break;
3613 }
3614 #endif /* CONFIG_MACF_NET */
3615
3616 *np = m;
3617 if ((num % nsegs) == 0)
3618 np = &first->m_nextpkt;
3619 else
3620 np = &m->m_next;
3621
3622 if (num == needed)
3623 break;
3624 }
3625
3626 if (num > 0) {
3627 mtype_stat_add(MT_DATA, num);
3628 mtype_stat_sub(MT_FREE, num);
3629 }
3630
3631 num /= nsegs;
3632
3633 /* We've got them all; return to caller */
3634 if (num == *numlist) {
3635 ASSERT(mp_list == NULL && rmp_list == NULL);
3636 return (top);
3637 }
3638
3639 fail:
3640 /* Free up what's left of the above */
3641 if (mp_list != NULL)
3642 mcache_free_ext(cp, mp_list);
3643 if (rmp_list != NULL)
3644 mcache_free_ext(rcp, rmp_list);
3645 if (wantall && top != NULL) {
3646 m_freem(top);
3647 return (NULL);
3648 }
3649 *numlist = num;
3650 return (top);
3651 }
3652
3653 /*
3654 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3655 * packets on the receive ring.
3656 */
3657 __private_extern__ struct mbuf *
3658 m_getpacket_how(int wait)
3659 {
3660 unsigned int num_needed = 1;
3661
3662 return (m_getpackets_internal(&num_needed, 1, wait, 1,
3663 m_maxsize(MC_CL)));
3664 }
3665
3666 /*
3667 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3668 * packets on the receive ring.
3669 */
3670 struct mbuf *
3671 m_getpacket(void)
3672 {
3673 unsigned int num_needed = 1;
3674
3675 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
3676 m_maxsize(MC_CL)));
3677 }
3678
3679 /*
3680 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3681 * if this can't be met, return whatever number was available. Set up the
3682 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
3683 * are chained on the m_nextpkt field. Any packets requested beyond this are
3684 * chained onto the last packet header's m_next field.
3685 */
3686 struct mbuf *
3687 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
3688 {
3689 unsigned int n = num_needed;
3690
3691 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
3692 m_maxsize(MC_CL)));
3693 }
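/*
 * Usage sketch (a minimal, illustrative example only): refilling a
 * receive ring with cluster-backed packets via m_getpackets().  The
 * ring representation ("rx_ring", "ring_size") is hypothetical; only
 * the m_getpackets()/m_nextpkt handling reflects the routines above.
 */
#if 0
static int
example_rx_refill(struct mbuf **rx_ring, int ring_size)
{
        struct mbuf *list, *m;
        int i = 0;

        /* Ask for ring_size packets, each set up with a packet header */
        list = m_getpackets(ring_size, ring_size, M_DONTWAIT);

        /* Packets are chained on m_nextpkt; detach them one by one */
        while (list != NULL && i < ring_size) {
                m = list;
                list = list->m_nextpkt;
                m->m_nextpkt = NULL;
                rx_ring[i++] = m;
        }
        /* Fewer than requested may have been returned */
        return (i);
}
#endif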
3694
3695 /*
3696 * Return a list of mbuf hdrs set up as packet hdrs chained together
3697 * on the m_nextpkt field
3698 */
3699 struct mbuf *
3700 m_getpackethdrs(int num_needed, int how)
3701 {
3702 struct mbuf *m;
3703 struct mbuf **np, *top;
3704
3705 top = NULL;
3706 np = &top;
3707
3708 while (num_needed--) {
3709 m = _M_RETRYHDR(how, MT_DATA);
3710 if (m == NULL)
3711 break;
3712
3713 *np = m;
3714 np = &m->m_nextpkt;
3715 }
3716
3717 return (top);
3718 }
3719
3720 /*
3721 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
3722 * of packets freed. Used by drivers.
3723 */
3724 int
3725 m_freem_list(struct mbuf *m)
3726 {
3727 struct mbuf *nextpkt;
3728 mcache_obj_t *mp_list = NULL;
3729 mcache_obj_t *mcl_list = NULL;
3730 mcache_obj_t *mbc_list = NULL;
3731 mcache_obj_t *m16k_list = NULL;
3732 mcache_obj_t *m_mcl_list = NULL;
3733 mcache_obj_t *m_mbc_list = NULL;
3734 mcache_obj_t *m_m16k_list = NULL;
3735 mcache_obj_t *ref_list = NULL;
3736 int pktcount = 0;
3737 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
3738
3739 while (m != NULL) {
3740 pktcount++;
3741
3742 nextpkt = m->m_nextpkt;
3743 m->m_nextpkt = NULL;
3744
3745 while (m != NULL) {
3746 struct mbuf *next = m->m_next;
3747 mcache_obj_t *o, *rfa;
3748 u_int32_t refcnt, flags;
3749
3750 if (m->m_type == MT_FREE)
3751 panic("m_free: freeing an already freed mbuf");
3752
3753 if (m->m_type != MT_FREE)
3754 mt_free++;
3755
3756 if (m->m_flags & M_PKTHDR) {
3757 m_tag_delete_chain(m, NULL);
3758 }
3759
3760 if (!(m->m_flags & M_EXT))
3761 goto simple_free;
3762
3763 o = (mcache_obj_t *)m->m_ext.ext_buf;
3764 refcnt = m_decref(m);
3765 flags = MEXT_FLAGS(m);
3766 if (refcnt == 0 && flags == 0) {
3767 if (m->m_ext.ext_free == NULL) {
3768 o->obj_next = mcl_list;
3769 mcl_list = o;
3770 } else if (m->m_ext.ext_free == m_bigfree) {
3771 o->obj_next = mbc_list;
3772 mbc_list = o;
3773 } else if (m->m_ext.ext_free == m_16kfree) {
3774 o->obj_next = m16k_list;
3775 m16k_list = o;
3776 } else {
3777 (*(m->m_ext.ext_free))((caddr_t)o,
3778 m->m_ext.ext_size,
3779 m->m_ext.ext_arg);
3780 }
3781 rfa = (mcache_obj_t *)MEXT_RFA(m);
3782 rfa->obj_next = ref_list;
3783 ref_list = rfa;
3784 MEXT_RFA(m) = NULL;
3785 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
3786 VERIFY(m->m_type != MT_FREE);
3787 /*
3788 * Amortize the costs of atomic operations
3789 * by doing them at the end, if possible.
3790 */
3791 if (m->m_type == MT_DATA)
3792 mt_data++;
3793 else if (m->m_type == MT_HEADER)
3794 mt_header++;
3795 else if (m->m_type == MT_SONAME)
3796 mt_soname++;
3797 else if (m->m_type == MT_TAG)
3798 mt_tag++;
3799 else
3800 mtype_stat_dec(m->m_type);
3801
3802 m->m_type = MT_FREE;
3803 m->m_flags = M_EXT;
3804 m->m_len = 0;
3805 m->m_next = m->m_nextpkt = NULL;
3806
3807 /* "Free" into the intermediate cache */
3808 o = (mcache_obj_t *)m;
3809 if (m->m_ext.ext_free == NULL) {
3810 o->obj_next = m_mcl_list;
3811 m_mcl_list = o;
3812 } else if (m->m_ext.ext_free == m_bigfree) {
3813 o->obj_next = m_mbc_list;
3814 m_mbc_list = o;
3815 } else {
3816 VERIFY(m->m_ext.ext_free == m_16kfree);
3817 o->obj_next = m_m16k_list;
3818 m_m16k_list = o;
3819 }
3820 m = next;
3821 continue;
3822 }
3823 simple_free:
3824 /*
3825 * Amortize the costs of atomic operations
3826 * by doing them at the end, if possible.
3827 */
3828 if (m->m_type == MT_DATA)
3829 mt_data++;
3830 else if (m->m_type == MT_HEADER)
3831 mt_header++;
3832 else if (m->m_type == MT_SONAME)
3833 mt_soname++;
3834 else if (m->m_type == MT_TAG)
3835 mt_tag++;
3836 else if (m->m_type != MT_FREE)
3837 mtype_stat_dec(m->m_type);
3838
3839 m->m_type = MT_FREE;
3840 m->m_flags = m->m_len = 0;
3841 m->m_next = m->m_nextpkt = NULL;
3842
3843 ((mcache_obj_t *)m)->obj_next = mp_list;
3844 mp_list = (mcache_obj_t *)m;
3845
3846 m = next;
3847 }
3848
3849 m = nextpkt;
3850 }
3851
3852 if (mt_free > 0)
3853 mtype_stat_add(MT_FREE, mt_free);
3854 if (mt_data > 0)
3855 mtype_stat_sub(MT_DATA, mt_data);
3856 if (mt_header > 0)
3857 mtype_stat_sub(MT_HEADER, mt_header);
3858 if (mt_soname > 0)
3859 mtype_stat_sub(MT_SONAME, mt_soname);
3860 if (mt_tag > 0)
3861 mtype_stat_sub(MT_TAG, mt_tag);
3862
3863 if (mp_list != NULL)
3864 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3865 if (mcl_list != NULL)
3866 mcache_free_ext(m_cache(MC_CL), mcl_list);
3867 if (mbc_list != NULL)
3868 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
3869 if (m16k_list != NULL)
3870 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
3871 if (m_mcl_list != NULL)
3872 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
3873 if (m_mbc_list != NULL)
3874 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
3875 if (m_m16k_list != NULL)
3876 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
3877 if (ref_list != NULL)
3878 mcache_free_ext(ref_cache, ref_list);
3879
3880 return (pktcount);
3881 }
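/*
 * Usage sketch (a minimal, illustrative example only): returning a
 * batch of completed packets to the pool in a single call.  The list
 * head ("txq_head") and the statistics counter are hypothetical.
 */
#if 0
static void
example_tx_complete(struct mbuf *txq_head, u_int64_t *freed_total)
{
        int freed;

        /* One call frees every packet (m_nextpkt) and every mbuf in each */
        freed = m_freem_list(txq_head);
        *freed_total += (u_int64_t)freed;
}
#endif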
3882
3883 void
3884 m_freem(struct mbuf *m)
3885 {
3886 while (m != NULL)
3887 m = m_free(m);
3888 }
3889
3890 /*
3891 * Mbuffer utility routines.
3892 */
3893
3894 /*
3895 * Compute the amount of space available before the current start
3896 * of data in an mbuf.
3897 */
3898 int
3899 m_leadingspace(struct mbuf *m)
3900 {
3901 if (m->m_flags & M_EXT) {
3902 if (MCLHASREFERENCE(m))
3903 return (0);
3904 return (m->m_data - m->m_ext.ext_buf);
3905 }
3906 if (m->m_flags & M_PKTHDR)
3907 return (m->m_data - m->m_pktdat);
3908 return (m->m_data - m->m_dat);
3909 }
3910
3911 /*
3912 * Compute the amount of space available after the end of data in an mbuf.
3913 */
3914 int
3915 m_trailingspace(struct mbuf *m)
3916 {
3917 if (m->m_flags & M_EXT) {
3918 if (MCLHASREFERENCE(m))
3919 return (0);
3920 return (m->m_ext.ext_buf + m->m_ext.ext_size -
3921 (m->m_data + m->m_len));
3922 }
3923 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
3924 }
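/*
 * Usage sketch (a minimal, illustrative example only): appending a
 * small trailer in place when the last mbuf has room according to
 * m_trailingspace() (which also reports 0 for shared clusters).  The
 * trailer itself is hypothetical.
 */
#if 0
static int
example_append_trailer(struct mbuf *m, const void *trailer, int tlen)
{
        struct mbuf *last;

        for (last = m; last->m_next != NULL; last = last->m_next)
                ;
        if (m_trailingspace(last) < tlen)
                return (0);             /* caller must grow the chain */

        bcopy(trailer, mtod(last, caddr_t) + last->m_len, tlen);
        last->m_len += tlen;
        if (m->m_flags & M_PKTHDR)
                m->m_pkthdr.len += tlen;
        return (1);
}
#endif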
3925
3926 /*
3927 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
3928 * copy junk along. Does not adjust packet header length.
3929 */
3930 struct mbuf *
3931 m_prepend(struct mbuf *m, int len, int how)
3932 {
3933 struct mbuf *mn;
3934
3935 _MGET(mn, how, m->m_type);
3936 if (mn == NULL) {
3937 m_freem(m);
3938 return (NULL);
3939 }
3940 if (m->m_flags & M_PKTHDR) {
3941 M_COPY_PKTHDR(mn, m);
3942 m->m_flags &= ~M_PKTHDR;
3943 }
3944 mn->m_next = m;
3945 m = mn;
3946 if (len < MHLEN)
3947 MH_ALIGN(m, len);
3948 m->m_len = len;
3949 return (m);
3950 }
3951
3952 /*
3953 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3954 * chain, copy junk along, and adjust length.
3955 */
3956 struct mbuf *
3957 m_prepend_2(struct mbuf *m, int len, int how)
3958 {
3959 if (M_LEADINGSPACE(m) >= len) {
3960 m->m_data -= len;
3961 m->m_len += len;
3962 } else {
3963 m = m_prepend(m, len, how);
3964 }
3965 if ((m) && (m->m_flags & M_PKTHDR))
3966 m->m_pkthdr.len += len;
3967 return (m);
3968 }
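/*
 * Usage sketch (a minimal, illustrative example only): prepending an
 * 8-byte encapsulation header with m_prepend_2(), which reuses any
 * leading space and otherwise falls back to m_prepend().  The header
 * layout ("struct example_encap") is hypothetical.
 */
#if 0
struct example_encap {
        u_int32_t ee_spi;
        u_int32_t ee_seq;
};

static struct mbuf *
example_encapsulate(struct mbuf *m, u_int32_t spi, u_int32_t seq)
{
        struct example_encap *eh;

        m = m_prepend_2(m, sizeof (*eh), M_DONTWAIT);
        if (m == NULL)
                return (NULL);  /* m_prepend() frees the chain on failure */

        eh = mtod(m, struct example_encap *);
        eh->ee_spi = spi;
        eh->ee_seq = seq;
        return (m);
}
#endif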
3969
3970 /*
3971 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
3972 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
3973 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
3974 */
3975 int MCFail;
3976
3977 struct mbuf *
3978 m_copym(struct mbuf *m, int off0, int len, int wait)
3979 {
3980 struct mbuf *n, *mhdr = NULL, **np;
3981 int off = off0;
3982 struct mbuf *top;
3983 int copyhdr = 0;
3984
3985 if (off < 0 || len < 0)
3986 panic("m_copym: invalid offset %d or len %d", off, len);
3987
3988 if (off == 0 && (m->m_flags & M_PKTHDR)) {
3989 mhdr = m;
3990 copyhdr = 1;
3991 }
3992
3993 while (off >= m->m_len) {
3994 if (m->m_next == NULL)
3995 panic("m_copym: invalid mbuf chain");
3996 off -= m->m_len;
3997 m = m->m_next;
3998 }
3999 np = &top;
4000 top = NULL;
4001
4002 while (len > 0) {
4003 if (m == NULL) {
4004 if (len != M_COPYALL)
4005 panic("m_copym: len != M_COPYALL");
4006 break;
4007 }
4008
4009 n = _M_RETRY(wait, m->m_type);
4010 *np = n;
4011
4012 if (n == NULL)
4013 goto nospace;
4014
4015 if (copyhdr != 0) {
4016 M_COPY_PKTHDR(n, mhdr);
4017 if (len == M_COPYALL)
4018 n->m_pkthdr.len -= off0;
4019 else
4020 n->m_pkthdr.len = len;
4021 copyhdr = 0;
4022 }
4023 if (len == M_COPYALL) {
4024 if (MIN(len, (m->m_len - off)) == len) {
4025 printf("m->m_len %d - off %d = %d, %d\n",
4026 m->m_len, off, m->m_len - off,
4027 MIN(len, (m->m_len - off)));
4028 }
4029 }
4030 n->m_len = MIN(len, (m->m_len - off));
4031 if (n->m_len == M_COPYALL) {
4032 printf("n->m_len == M_COPYALL, fixing\n");
4033 n->m_len = MHLEN;
4034 }
4035 if (m->m_flags & M_EXT) {
4036 n->m_ext = m->m_ext;
4037 m_incref(m);
4038 n->m_data = m->m_data + off;
4039 n->m_flags |= M_EXT;
4040 } else {
4041 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4042 (unsigned)n->m_len);
4043 }
4044 if (len != M_COPYALL)
4045 len -= n->m_len;
4046 off = 0;
4047 m = m->m_next;
4048 np = &n->m_next;
4049 }
4050
4051 if (top == NULL)
4052 MCFail++;
4053
4054 return (top);
4055 nospace:
4056
4057 m_freem(top);
4058 MCFail++;
4059 return (NULL);
4060 }
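/*
 * Usage sketch (a minimal, illustrative example only): taking a cheap
 * snapshot of the leading bytes of a packet with m_copym().  Cluster
 * data is shared by reference (m_incref) rather than copied, so the
 * snapshot must be treated as read-only.
 */
#if 0
static struct mbuf *
example_snapshot(struct mbuf *m, int snaplen)
{
        if ((m->m_flags & M_PKTHDR) && snaplen > m->m_pkthdr.len)
                snaplen = m->m_pkthdr.len;

        /* M_DONTWAIT: fail rather than sleep */
        return (m_copym(m, 0, snaplen, M_DONTWAIT));
}
#endif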
4061
4062 /*
4063 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4064 * within this routine. Also, the last mbuf and offset accessed are passed
4065 * out and can be passed back in to avoid having to rescan the entire mbuf
4066 * list (normally hung off of the socket).
4067 */
4068 struct mbuf *
4069 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4070 struct mbuf **m_last, int *m_off)
4071 {
4072 struct mbuf *n, **np = NULL;
4073 int off = off0, len = len0;
4074 struct mbuf *top = NULL;
4075 int mcflags = MSLEEPF(wait);
4076 int copyhdr = 0;
4077 int type = 0;
4078 mcache_obj_t *list = NULL;
4079 int needed = 0;
4080
4081 if (off == 0 && (m->m_flags & M_PKTHDR))
4082 copyhdr = 1;
4083
4084 if (*m_last != NULL) {
4085 m = *m_last;
4086 off = *m_off;
4087 } else {
4088 while (off >= m->m_len) {
4089 off -= m->m_len;
4090 m = m->m_next;
4091 }
4092 }
4093
4094 n = m;
4095 while (len > 0) {
4096 needed++;
4097 ASSERT(n != NULL);
4098 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4099 n = n->m_next;
4100 }
4101 needed++;
4102 len = len0;
4103
4104 /*
4105 * If the caller doesn't want to be put to sleep, mark it with
4106 * MCR_TRYHARD so that we may reclaim buffers from other places
4107 * before giving up.
4108 */
4109 if (mcflags & MCR_NOSLEEP)
4110 mcflags |= MCR_TRYHARD;
4111
4112 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4113 mcflags) != needed)
4114 goto nospace;
4115
4116 needed = 0;
4117 while (len > 0) {
4118 n = (struct mbuf *)list;
4119 list = list->obj_next;
4120 ASSERT(n != NULL && m != NULL);
4121
4122 type = (top == NULL) ? MT_HEADER : m->m_type;
4123 MBUF_INIT(n, (top == NULL), type);
4124 #if CONFIG_MACF_NET
4125 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4126 mtype_stat_inc(MT_HEADER);
4127 mtype_stat_dec(MT_FREE);
4128 m_free(n);
4129 goto nospace;
4130 }
4131 #endif /* CONFIG_MACF_NET */
4132
4133 if (top == NULL) {
4134 top = n;
4135 np = &top->m_next;
4136 continue;
4137 } else {
4138 needed++;
4139 *np = n;
4140 }
4141
4142 if (copyhdr) {
4143 M_COPY_PKTHDR(n, m);
4144 n->m_pkthdr.len = len;
4145 copyhdr = 0;
4146 }
4147 n->m_len = MIN(len, (m->m_len - off));
4148
4149 if (m->m_flags & M_EXT) {
4150 n->m_ext = m->m_ext;
4151 m_incref(m);
4152 n->m_data = m->m_data + off;
4153 n->m_flags |= M_EXT;
4154 } else {
4155 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4156 (unsigned)n->m_len);
4157 }
4158 len -= n->m_len;
4159
4160 if (len == 0) {
4161 if ((off + n->m_len) == m->m_len) {
4162 *m_last = m->m_next;
4163 *m_off = 0;
4164 } else {
4165 *m_last = m;
4166 *m_off = off + n->m_len;
4167 }
4168 break;
4169 }
4170 off = 0;
4171 m = m->m_next;
4172 np = &n->m_next;
4173 }
4174
4175 mtype_stat_inc(MT_HEADER);
4176 mtype_stat_add(type, needed);
4177 mtype_stat_sub(MT_FREE, needed + 1);
4178
4179 ASSERT(list == NULL);
4180 return (top);
4181
4182 nospace:
4183 if (list != NULL)
4184 mcache_free_ext(m_cache(MC_MBUF), list);
4185 if (top != NULL)
4186 m_freem(top);
4187 MCFail++;
4188 return (NULL);
4189 }
4190
4191 /*
4192 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4193 * continuing for "len" bytes, into the indicated buffer.
4194 */
4195 void
4196 m_copydata(struct mbuf *m, int off, int len, void *vp)
4197 {
4198 unsigned count;
4199 char *cp = vp;
4200
4201 if (off < 0 || len < 0)
4202 panic("m_copydata: invalid offset %d or len %d", off, len);
4203
4204 while (off > 0) {
4205 if (m == NULL)
4206 panic("m_copydata: invalid mbuf chain");
4207 if (off < m->m_len)
4208 break;
4209 off -= m->m_len;
4210 m = m->m_next;
4211 }
4212 while (len > 0) {
4213 if (m == NULL)
4214 panic("m_copydata: invalid mbuf chain");
4215 count = MIN(m->m_len - off, len);
4216 bcopy(MTOD(m, caddr_t) + off, cp, count);
4217 len -= count;
4218 cp += count;
4219 off = 0;
4220 m = m->m_next;
4221 }
4222 }
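/*
 * Usage sketch (a minimal, illustrative example only): pulling a
 * fixed-size header out of a chain into caller-provided storage with
 * m_copydata(), regardless of how the bytes are split across mbufs.
 */
#if 0
static int
example_peek(struct mbuf *m, int off, void *buf, int len)
{
        if (!(m->m_flags & M_PKTHDR) || m->m_pkthdr.len < off + len)
                return (0);             /* not enough data in the packet */

        m_copydata(m, off, len, buf);
        return (1);
}
#endif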
4223
4224 /*
4225 * Concatenate mbuf chain n to m. Both chains must be of the same type
4226 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4227 */
4228 void
4229 m_cat(struct mbuf *m, struct mbuf *n)
4230 {
4231 while (m->m_next)
4232 m = m->m_next;
4233 while (n) {
4234 if ((m->m_flags & M_EXT) ||
4235 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4236 /* just join the two chains */
4237 m->m_next = n;
4238 return;
4239 }
4240 /* splat the data from one into the other */
4241 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4242 (u_int)n->m_len);
4243 m->m_len += n->m_len;
4244 n = m_free(n);
4245 }
4246 }
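/*
 * Usage sketch (a minimal, illustrative example only): joining two
 * chains with m_cat().  Because m_cat() never touches m_pkthdr, the
 * caller fixes up the packet header length itself; "tail_len" is the
 * number of data bytes in the appended chain.
 */
#if 0
static void
example_join(struct mbuf *head, struct mbuf *tail, int tail_len)
{
        m_cat(head, tail);
        if (head->m_flags & M_PKTHDR)
                head->m_pkthdr.len += tail_len;
}
#endif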
4247
4248 void
4249 m_adj(struct mbuf *mp, int req_len)
4250 {
4251 int len = req_len;
4252 struct mbuf *m;
4253 int count;
4254
4255 if ((m = mp) == NULL)
4256 return;
4257 if (len >= 0) {
4258 /*
4259 * Trim from head.
4260 */
4261 while (m != NULL && len > 0) {
4262 if (m->m_len <= len) {
4263 len -= m->m_len;
4264 m->m_len = 0;
4265 m = m->m_next;
4266 } else {
4267 m->m_len -= len;
4268 m->m_data += len;
4269 len = 0;
4270 }
4271 }
4272 m = mp;
4273 if (m->m_flags & M_PKTHDR)
4274 m->m_pkthdr.len -= (req_len - len);
4275 } else {
4276 /*
4277 * Trim from tail. Scan the mbuf chain,
4278 * calculating its length and finding the last mbuf.
4279 * If the adjustment only affects this mbuf, then just
4280 * adjust and return. Otherwise, rescan and truncate
4281 * after the remaining size.
4282 */
4283 len = -len;
4284 count = 0;
4285 for (;;) {
4286 count += m->m_len;
4287 if (m->m_next == (struct mbuf *)0)
4288 break;
4289 m = m->m_next;
4290 }
4291 if (m->m_len >= len) {
4292 m->m_len -= len;
4293 m = mp;
4294 if (m->m_flags & M_PKTHDR)
4295 m->m_pkthdr.len -= len;
4296 return;
4297 }
4298 count -= len;
4299 if (count < 0)
4300 count = 0;
4301 /*
4302 * Correct length for chain is "count".
4303 * Find the mbuf with last data, adjust its length,
4304 * and toss data from remaining mbufs on chain.
4305 */
4306 m = mp;
4307 if (m->m_flags & M_PKTHDR)
4308 m->m_pkthdr.len = count;
4309 for (; m; m = m->m_next) {
4310 if (m->m_len >= count) {
4311 m->m_len = count;
4312 break;
4313 }
4314 count -= m->m_len;
4315 }
4316 while ((m = m->m_next))
4317 m->m_len = 0;
4318 }
4319 }
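/*
 * Usage sketch (a minimal, illustrative example only): stripping a
 * leading header and a trailing checksum with m_adj().  A positive
 * length trims from the head, a negative one from the tail; both sizes
 * here are hypothetical.
 */
#if 0
static void
example_strip(struct mbuf *m, int hdrlen, int crclen)
{
        m_adj(m, hdrlen);       /* drop hdrlen bytes from the front */
        m_adj(m, -crclen);      /* drop crclen bytes from the end */
}
#endif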
4320
4321 /*
4322 * Rearrange an mbuf chain so that len bytes are contiguous
4323 * and in the data area of an mbuf (so that mtod and dtom
4324 * will work for a structure of size len). Returns the resulting
4325 * mbuf chain on success, frees it and returns null on failure.
4326 * If there is room, it will add up to max_protohdr-len extra bytes to the
4327 * contiguous region in an attempt to avoid being called next time.
4328 */
4329 int MPFail;
4330
4331 struct mbuf *
4332 m_pullup(struct mbuf *n, int len)
4333 {
4334 struct mbuf *m;
4335 int count;
4336 int space;
4337
4338 /*
4339 * If first mbuf has no cluster, and has room for len bytes
4340 * without shifting current data, pullup into it,
4341 * otherwise allocate a new mbuf to prepend to the chain.
4342 */
4343 if ((n->m_flags & M_EXT) == 0 &&
4344 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4345 if (n->m_len >= len)
4346 return (n);
4347 m = n;
4348 n = n->m_next;
4349 len -= m->m_len;
4350 } else {
4351 if (len > MHLEN)
4352 goto bad;
4353 _MGET(m, M_DONTWAIT, n->m_type);
4354 if (m == 0)
4355 goto bad;
4356 m->m_len = 0;
4357 if (n->m_flags & M_PKTHDR) {
4358 M_COPY_PKTHDR(m, n);
4359 n->m_flags &= ~M_PKTHDR;
4360 }
4361 }
4362 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4363 do {
4364 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4365 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4366 (unsigned)count);
4367 len -= count;
4368 m->m_len += count;
4369 n->m_len -= count;
4370 space -= count;
4371 if (n->m_len)
4372 n->m_data += count;
4373 else
4374 n = m_free(n);
4375 } while (len > 0 && n);
4376 if (len > 0) {
4377 (void) m_free(m);
4378 goto bad;
4379 }
4380 m->m_next = n;
4381 return (m);
4382 bad:
4383 m_freem(n);
4384 MPFail++;
4385 return (0);
4386 }
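/*
 * Usage sketch (a minimal, illustrative example only): making the first
 * bytes of a chain contiguous before overlaying a header structure on
 * them.  The header type ("struct example_hdr") is hypothetical; the
 * failure convention (chain freed, NULL returned) is m_pullup()'s own.
 */
#if 0
struct example_hdr {
        u_int16_t eh_type;
        u_int16_t eh_len;
};

static struct example_hdr *
example_parse(struct mbuf **mp)
{
        struct mbuf *m = *mp;

        if (m->m_len < (int)sizeof (struct example_hdr)) {
                m = m_pullup(m, sizeof (struct example_hdr));
                *mp = m;
                if (m == NULL)
                        return (NULL);  /* original chain already freed */
        }
        return (mtod(m, struct example_hdr *));
}
#endif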
4387
4388 /*
4389 * Partition an mbuf chain into two pieces, returning the tail --
4390 * all but the first len0 bytes. In case of failure, it returns NULL and
4391 * attempts to restore the chain to its original state.
4392 */
4393 struct mbuf *
4394 m_split(struct mbuf *m0, int len0, int wait)
4395 {
4396 return (m_split0(m0, len0, wait, 1));
4397 }
4398
4399 static struct mbuf *
4400 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4401 {
4402 struct mbuf *m, *n;
4403 unsigned len = len0, remain;
4404
4405 for (m = m0; m && len > m->m_len; m = m->m_next)
4406 len -= m->m_len;
4407 if (m == NULL)
4408 return (NULL);
4409 remain = m->m_len - len;
4410 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
4411 _MGETHDR(n, wait, m0->m_type);
4412 if (n == NULL)
4413 return (NULL);
4414 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4415 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4416 m0->m_pkthdr.len = len0;
4417 if (m->m_flags & M_EXT)
4418 goto extpacket;
4419 if (remain > MHLEN) {
4420 /* m can't be the lead packet */
4421 MH_ALIGN(n, 0);
4422 n->m_next = m_split(m, len, wait);
4423 if (n->m_next == NULL) {
4424 (void) m_free(n);
4425 return (NULL);
4426 } else
4427 return (n);
4428 } else
4429 MH_ALIGN(n, remain);
4430 } else if (remain == 0) {
4431 n = m->m_next;
4432 m->m_next = NULL;
4433 return (n);
4434 } else {
4435 _MGET(n, wait, m->m_type);
4436 if (n == NULL)
4437 return (NULL);
4438 M_ALIGN(n, remain);
4439 }
4440 extpacket:
4441 if (m->m_flags & M_EXT) {
4442 n->m_flags |= M_EXT;
4443 n->m_ext = m->m_ext;
4444 m_incref(m);
4445 n->m_data = m->m_data + len;
4446 } else {
4447 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4448 }
4449 n->m_len = remain;
4450 m->m_len = len;
4451 n->m_next = m->m_next;
4452 m->m_next = NULL;
4453 return (n);
4454 }
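/*
 * Usage sketch (a minimal, illustrative example only): cutting a packet
 * into a leading piece of "frag_len" bytes and a tail chain with
 * m_split(); both halves end up with consistent m_pkthdr.len values.
 */
#if 0
static struct mbuf *
example_cut(struct mbuf *m, int frag_len)
{
        struct mbuf *tail;

        tail = m_split(m, frag_len, M_DONTWAIT);
        if (tail == NULL)
                return (NULL);  /* "m" is restored as far as possible */

        return (tail);          /* caller now owns two packets: m and tail */
}
#endif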
4455
4456 /*
4457 * Routine to copy from device local memory into mbufs.
4458 */
4459 struct mbuf *
4460 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4461 void (*copy)(const void *, void *, size_t))
4462 {
4463 struct mbuf *m;
4464 struct mbuf *top = NULL, **mp = &top;
4465 int off = off0, len;
4466 char *cp;
4467 char *epkt;
4468
4469 cp = buf;
4470 epkt = cp + totlen;
4471 if (off) {
4472 /*
4473 * If 'off' is non-zero, packet is trailer-encapsulated,
4474 * so we have to skip the type and length fields.
4475 */
4476 cp += off + 2 * sizeof (u_int16_t);
4477 totlen -= 2 * sizeof (u_int16_t);
4478 }
4479 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4480 if (m == NULL)
4481 return (NULL);
4482 m->m_pkthdr.rcvif = ifp;
4483 m->m_pkthdr.len = totlen;
4484 m->m_len = MHLEN;
4485
4486 while (totlen > 0) {
4487 if (top != NULL) {
4488 _MGET(m, M_DONTWAIT, MT_DATA);
4489 if (m == NULL) {
4490 m_freem(top);
4491 return (NULL);
4492 }
4493 m->m_len = MLEN;
4494 }
4495 len = MIN(totlen, epkt - cp);
4496 if (len >= MINCLSIZE) {
4497 MCLGET(m, M_DONTWAIT);
4498 if (m->m_flags & M_EXT) {
4499 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4500 } else {
4501 /* give up when it's out of cluster mbufs */
4502 if (top != NULL)
4503 m_freem(top);
4504 m_freem(m);
4505 return (NULL);
4506 }
4507 } else {
4508 /*
4509 * Place initial small packet/header at end of mbuf.
4510 */
4511 if (len < m->m_len) {
4512 if (top == NULL &&
4513 len + max_linkhdr <= m->m_len)
4514 m->m_data += max_linkhdr;
4515 m->m_len = len;
4516 } else {
4517 len = m->m_len;
4518 }
4519 }
4520 if (copy)
4521 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4522 else
4523 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4524 cp += len;
4525 *mp = m;
4526 mp = &m->m_next;
4527 totlen -= len;
4528 if (cp == epkt)
4529 cp = buf;
4530 }
4531 return (top);
4532 }
4533
4534 void
4535 mbuf_growth_aggressive(void)
4536 {
4537 lck_mtx_lock(mbuf_mlock);
4538 /*
4539 * Don't start to grow the pool until we are at least
4540 * 1/2 (50%) of current total capacity.
4541 */
4542 mbuf_gscale = MB_GROWTH_AGGRESSIVE;
4543 lck_mtx_unlock(mbuf_mlock);
4544 }
4545
4546 void
4547 mbuf_growth_normal(void)
4548 {
4549 lck_mtx_lock(mbuf_mlock);
4550 /*
4551 * Don't start to grow the pool until we are at least
4552 * 15/16 (93.75%) of current total capacity.
4553 */
4554 mbuf_gscale = MB_GROWTH_NORMAL;
4555 lck_mtx_unlock(mbuf_mlock);
4556 }
4557
4558 /*
4559 * Cluster freelist allocation check.
4560 */
4561 static int
4562 m_howmany(int num, size_t bufsize)
4563 {
4564 int i = 0, j = 0;
4565 u_int32_t m_clusters, m_bigclusters, m_16kclusters;
4566 u_int32_t m_clfree, m_bigclfree, m_16kclfree;
4567 u_int32_t s = mbuf_gscale;
4568
4569 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4570
4571 m_clusters = m_total(MC_CL);
4572 m_bigclusters = m_total(MC_BIGCL);
4573 m_16kclusters = m_total(MC_16KCL);
4574 m_clfree = m_infree(MC_CL);
4575 m_bigclfree = m_infree(MC_BIGCL);
4576 m_16kclfree = m_infree(MC_16KCL);
4577
4578 /* Bail if we've maxed out the mbuf memory map */
4579 if ((bufsize != m_maxsize(MC_16KCL) &&
4580 (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
4581 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
4582 (m_16kclusters << 3) >= njcl)) {
4583 #if DEBUG
4584 if (bufsize == MCLBYTES && num > m_clfree) {
4585 printf("m_howmany - out of small clusters, "
4586 "%d short\n", num - mbstat.m_clfree);
4587 }
4588 #endif /* DEBUG */
4589 return (0);
4590 }
4591
4592 if (bufsize == m_maxsize(MC_CL)) {
4593 /* Under minimum */
4594 if (m_clusters < MINCL)
4595 return (MINCL - m_clusters);
4596 /* Too few (free < threshold) and not over maximum */
4597 if (m_clusters < m_maxlimit(MC_CL)) {
4598 if (m_clfree >= MCL_LOWAT)
4599 return (0);
4600 if (num >= m_clfree)
4601 i = num - m_clfree;
4602 if (((m_clusters + num) >> s) > m_clfree)
4603 j = ((m_clusters + num) >> s) - m_clfree;
4604 i = MAX(i, j);
4605 if (i + m_clusters >= m_maxlimit(MC_CL))
4606 i = m_maxlimit(MC_CL) - m_clusters;
4607 }
4608 VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
4609 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4610 /* Under minimum */
4611 if (m_bigclusters < MINBIGCL)
4612 return (MINBIGCL - m_bigclusters);
4613 /* Too few (free < 1/16 total) and not over maximum */
4614 if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
4615 if (m_bigclfree >= MBIGCL_LOWAT)
4616 return (0);
4617 if (num >= m_bigclfree)
4618 i = num - m_bigclfree;
4619 if (((m_bigclusters + num) >> 4) > m_bigclfree)
4620 j = ((m_bigclusters + num) >> 4) - m_bigclfree;
4621 i = MAX(i, j);
4622 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
4623 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
4624 }
4625 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
4626 } else {
4627 VERIFY(njcl > 0);
4628 /* Under minimum */
4629 if (m_16kclusters < MIN16KCL)
4630 return (MIN16KCL - m_16kclusters);
4631 /* Too few (free < 1/16 total) and not over maximum */
4632 if (m_16kclusters < m_maxlimit(MC_16KCL)) {
4633 if (m_16kclfree >= M16KCL_LOWAT)
4634 return (0);
4635 if (num >= m_16kclfree)
4636 i = num - m_16kclfree;
4637 if (((m_16kclusters + num) >> 4) > m_16kclfree)
4638 j = ((m_16kclusters + num) >> 4) - m_16kclfree;
4639 i = MAX(i, j);
4640 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
4641 i = m_maxlimit(MC_16KCL) - m_16kclusters;
4642 }
4643 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
4644 }
4645
4646 return (i);
4647 }
4648
4649 /*
4650 * Return the number of bytes in the mbuf chain, m.
4651 */
4652 static unsigned int
4653 m_length(struct mbuf *m)
4654 {
4655 struct mbuf *m0;
4656 unsigned int pktlen;
4657
4658 if (m->m_flags & M_PKTHDR)
4659 return (m->m_pkthdr.len);
4660
4661 pktlen = 0;
4662 for (m0 = m; m0 != NULL; m0 = m0->m_next)
4663 pktlen += m0->m_len;
4664 return (pktlen);
4665 }
4666
4667 /*
4668 * Copy data from a buffer back into the indicated mbuf chain,
4669 * starting "off" bytes from the beginning, extending the mbuf
4670 * chain if necessary.
4671 */
4672 void
4673 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
4674 {
4675 #if DEBUG
4676 struct mbuf *origm = m0;
4677 int error;
4678 #endif /* DEBUG */
4679
4680 if (m0 == NULL)
4681 return;
4682
4683 #if DEBUG
4684 error =
4685 #endif /* DEBUG */
4686 m_copyback0(&m0, off, len, cp,
4687 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
4688
4689 #if DEBUG
4690 if (error != 0 || (m0 != NULL && origm != m0))
4691 panic("m_copyback");
4692 #endif /* DEBUG */
4693 }
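/*
 * Usage sketch (a minimal, illustrative example only): rewriting a
 * small field inside a packet with m_copyback(), which walks the chain
 * for the caller and extends it if the range runs past the end.  The
 * offset is hypothetical.
 */
#if 0
static void
example_rewrite(struct mbuf *m, int off, u_int16_t value)
{
        /* Copies sizeof (value) bytes back into the chain at "off" */
        m_copyback(m, off, sizeof (value), &value);
}
#endif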
4694
4695 struct mbuf *
4696 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
4697 {
4698 int error;
4699
4700 /* don't support chain expansion */
4701 VERIFY(off + len <= m_length(m0));
4702
4703 error = m_copyback0(&m0, off, len, cp,
4704 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
4705 if (error) {
4706 /*
4707 * no way to recover from partial success.
4708 * just free the chain.
4709 */
4710 m_freem(m0);
4711 return (NULL);
4712 }
4713 return (m0);
4714 }
4715
4716 /*
4717 * m_makewritable: ensure the specified range is writable.
4718 */
4719 int
4720 m_makewritable(struct mbuf **mp, int off, int len, int how)
4721 {
4722 int error;
4723 #if DEBUG
4724 struct mbuf *n;
4725 int origlen, reslen;
4726
4727 origlen = m_length(*mp);
4728 #endif /* DEBUG */
4729
4730 #if 0 /* M_COPYALL is large enough */
4731 if (len == M_COPYALL)
4732 len = m_length(*mp) - off; /* XXX */
4733 #endif
4734
4735 error = m_copyback0(mp, off, len, NULL,
4736 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
4737
4738 #if DEBUG
4739 reslen = 0;
4740 for (n = *mp; n; n = n->m_next)
4741 reslen += n->m_len;
4742 if (origlen != reslen)
4743 panic("m_makewritable: length changed");
4744 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
4745 panic("m_makewritable: inconsist");
4746 #endif /* DEBUG */
4747
4748 return (error);
4749 }
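/*
 * Usage sketch (a minimal, illustrative example only): forcing a byte
 * range to be writable (copy-on-write if the backing cluster is shared)
 * before patching it in place.  Assumes the range lies within the
 * packet; the offset and field are hypothetical.
 */
#if 0
static int
example_patch(struct mbuf **mp, int off, u_int8_t newval)
{
        int error;

        error = m_makewritable(mp, off, sizeof (newval), M_DONTWAIT);
        if (error != 0)
                return (error);

        /* The byte at "off" can now be modified safely */
        m_copyback(*mp, off, sizeof (newval), &newval);
        return (0);
}
#endif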
4750
4751 static int
4752 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
4753 int how)
4754 {
4755 int mlen;
4756 struct mbuf *m, *n;
4757 struct mbuf **mp;
4758 int totlen = 0;
4759 const char *cp = vp;
4760
4761 VERIFY(mp0 != NULL);
4762 VERIFY(*mp0 != NULL);
4763 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
4764 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
4765
4766 /*
4767 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
4768 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
4769 */
4770
4771 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
4772
4773 mp = mp0;
4774 m = *mp;
4775 while (off > (mlen = m->m_len)) {
4776 off -= mlen;
4777 totlen += mlen;
4778 if (m->m_next == NULL) {
4779 int tspace;
4780 extend:
4781 if (!(flags & M_COPYBACK0_EXTEND))
4782 goto out;
4783
4784 /*
4785 * try to make some space at the end of "m".
4786 */
4787
4788 mlen = m->m_len;
4789 if (off + len >= MINCLSIZE &&
4790 !(m->m_flags & M_EXT) && m->m_len == 0) {
4791 MCLGET(m, how);
4792 }
4793 tspace = M_TRAILINGSPACE(m);
4794 if (tspace > 0) {
4795 tspace = MIN(tspace, off + len);
4796 VERIFY(tspace > 0);
4797 bzero(mtod(m, char *) + m->m_len,
4798 MIN(off, tspace));
4799 m->m_len += tspace;
4800 off += mlen;
4801 totlen -= mlen;
4802 continue;
4803 }
4804
4805 /*
4806 * need to allocate an mbuf.
4807 */
4808
4809 if (off + len >= MINCLSIZE) {
4810 n = m_getcl(how, m->m_type, 0);
4811 } else {
4812 n = _M_GET(how, m->m_type);
4813 }
4814 if (n == NULL) {
4815 goto out;
4816 }
4817 n->m_len = 0;
4818 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
4819 bzero(mtod(n, char *), MIN(n->m_len, off));
4820 m->m_next = n;
4821 }
4822 mp = &m->m_next;
4823 m = m->m_next;
4824 }
4825 while (len > 0) {
4826 mlen = m->m_len - off;
4827 if (mlen != 0 && m_mclhasreference(m)) {
4828 char *datap;
4829 int eatlen;
4830
4831 /*
4832 * this mbuf is read-only.
4833 * allocate a new writable mbuf and try again.
4834 */
4835
4836 #if defined(DIAGNOSTIC)
4837 if (!(flags & M_COPYBACK0_COW))
4838 panic("m_copyback0: read-only");
4839 #endif /* defined(DIAGNOSTIC) */
4840
4841 /*
4842 * if we're going to write into the middle of
4843 * a mbuf, split it first.
4844 */
4845 if (off > 0 && len < mlen) {
4846 n = m_split0(m, off, how, 0);
4847 if (n == NULL)
4848 goto enobufs;
4849 m->m_next = n;
4850 mp = &m->m_next;
4851 m = n;
4852 off = 0;
4853 continue;
4854 }
4855
4856 /*
4857 * XXX TODO coalesce into the trailingspace of
4858 * the previous mbuf when possible.
4859 */
4860
4861 /*
4862 * allocate a new mbuf. copy packet header if needed.
4863 */
4864 n = _M_GET(how, m->m_type);
4865 if (n == NULL)
4866 goto enobufs;
4867 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4868 M_COPY_PKTHDR(n, m);
4869 n->m_len = MHLEN;
4870 } else {
4871 if (len >= MINCLSIZE)
4872 MCLGET(n, M_DONTWAIT);
4873 n->m_len =
4874 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
4875 }
4876 if (n->m_len > len)
4877 n->m_len = len;
4878
4879 /*
4880 * free the region which has been overwritten,
4881 * copying data from old mbufs if requested.
4882 */
4883 if (flags & M_COPYBACK0_PRESERVE)
4884 datap = mtod(n, char *);
4885 else
4886 datap = NULL;
4887 eatlen = n->m_len;
4888 VERIFY(off == 0 || eatlen >= mlen);
4889 if (off > 0) {
4890 VERIFY(len >= mlen);
4891 m->m_len = off;
4892 m->m_next = n;
4893 if (datap) {
4894 m_copydata(m, off, mlen, datap);
4895 datap += mlen;
4896 }
4897 eatlen -= mlen;
4898 mp = &m->m_next;
4899 m = m->m_next;
4900 }
4901 while (m != NULL && m_mclhasreference(m) &&
4902 n->m_type == m->m_type && eatlen > 0) {
4903 mlen = MIN(eatlen, m->m_len);
4904 if (datap) {
4905 m_copydata(m, 0, mlen, datap);
4906 datap += mlen;
4907 }
4908 m->m_data += mlen;
4909 m->m_len -= mlen;
4910 eatlen -= mlen;
4911 if (m->m_len == 0)
4912 *mp = m = m_free(m);
4913 }
4914 if (eatlen > 0)
4915 n->m_len -= eatlen;
4916 n->m_next = m;
4917 *mp = m = n;
4918 continue;
4919 }
4920 mlen = MIN(mlen, len);
4921 if (flags & M_COPYBACK0_COPYBACK) {
4922 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
4923 cp += mlen;
4924 }
4925 len -= mlen;
4926 mlen += off;
4927 off = 0;
4928 totlen += mlen;
4929 if (len == 0)
4930 break;
4931 if (m->m_next == NULL) {
4932 goto extend;
4933 }
4934 mp = &m->m_next;
4935 m = m->m_next;
4936 }
4937 out:
4938 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
4939 VERIFY(flags & M_COPYBACK0_EXTEND);
4940 m->m_pkthdr.len = totlen;
4941 }
4942
4943 return (0);
4944
4945 enobufs:
4946 return (ENOBUFS);
4947 }
4948
4949 char *
4950 mcl_to_paddr(char *addr)
4951 {
4952 vm_offset_t base_phys;
4953
4954 if (!MBUF_IN_MAP(addr))
4955 return (NULL);
4956 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
4957
4958 if (base_phys == 0)
4959 return (NULL);
4960 return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
4961 }
4962
4963 /*
4964 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
4965 * And really copy the thing. That way, we don't "precompute" checksums
4966 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
4967 * small packets, don't dup into a cluster. That way received packets
4968 * don't take up too much room in the sockbuf (cf. sbspace()).
4969 */
4970 int MDFail;
4971
4972 struct mbuf *
4973 m_dup(struct mbuf *m, int how)
4974 {
4975 struct mbuf *n, **np;
4976 struct mbuf *top;
4977 int copyhdr = 0;
4978
4979 np = &top;
4980 top = NULL;
4981 if (m->m_flags & M_PKTHDR)
4982 copyhdr = 1;
4983
4984 /*
4985 * Quick check: if we have one mbuf and its data fits in an
4986 * mbuf with packet header, just copy and go.
4987 */
4988 if (m->m_next == NULL) {
4989 /* Then just move the data into an mbuf and be done... */
4990 if (copyhdr) {
4991 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
4992 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
4993 return (NULL);
4994 n->m_len = m->m_len;
4995 m_dup_pkthdr(n, m, how);
4996 bcopy(m->m_data, n->m_data, m->m_len);
4997 return (n);
4998 }
4999 } else if (m->m_len <= MLEN) {
5000 if ((n = _M_GET(how, m->m_type)) == NULL)
5001 return (NULL);
5002 bcopy(m->m_data, n->m_data, m->m_len);
5003 n->m_len = m->m_len;
5004 return (n);
5005 }
5006 }
5007 while (m != NULL) {
5008 #if BLUE_DEBUG
5009 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5010 m->m_data);
5011 #endif
5012 if (copyhdr)
5013 n = _M_GETHDR(how, m->m_type);
5014 else
5015 n = _M_GET(how, m->m_type);
5016 if (n == NULL)
5017 goto nospace;
5018 if (m->m_flags & M_EXT) {
5019 if (m->m_len <= m_maxsize(MC_CL))
5020 MCLGET(n, how);
5021 else if (m->m_len <= m_maxsize(MC_BIGCL))
5022 n = m_mbigget(n, how);
5023 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5024 n = m_m16kget(n, how);
5025 if (!(n->m_flags & M_EXT)) {
5026 (void) m_free(n);
5027 goto nospace;
5028 }
5029 }
5030 *np = n;
5031 if (copyhdr) {
5032 /* Don't use M_COPY_PKTHDR: preserve m_data */
5033 m_dup_pkthdr(n, m, how);
5034 copyhdr = 0;
5035 if (!(n->m_flags & M_EXT))
5036 n->m_data = n->m_pktdat;
5037 }
5038 n->m_len = m->m_len;
5039 /*
5040 * Get the dup on the same boundary as the original.
5041 * Assume that the two mbufs have the same offset to the data area
5042 * (up to word boundaries).
5043 */
5044 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5045 m = m->m_next;
5046 np = &n->m_next;
5047 #if BLUE_DEBUG
5048 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5049 n->m_data);
5050 #endif
5051 }
5052
5053 if (top == NULL)
5054 MDFail++;
5055 return (top);
5056
5057 nospace:
5058 m_freem(top);
5059 MDFail++;
5060 return (NULL);
5061 }
5062
5063 #define MBUF_MULTIPAGES(m) \
5064 (((m)->m_flags & M_EXT) && \
5065 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5066 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5067 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5068
5069 static struct mbuf *
5070 m_expand(struct mbuf *m, struct mbuf **last)
5071 {
5072 struct mbuf *top = NULL;
5073 struct mbuf **nm = &top;
5074 uintptr_t data0, data;
5075 unsigned int len0, len;
5076
5077 VERIFY(MBUF_MULTIPAGES(m));
5078 VERIFY(m->m_next == NULL);
5079 data0 = (uintptr_t)m->m_data;
5080 len0 = m->m_len;
5081 *last = top;
5082
5083 for (;;) {
5084 struct mbuf *n;
5085
5086 data = data0;
5087 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5088 len = NBPG;
5089 else if (!IS_P2ALIGNED(data, NBPG) &&
5090 P2ROUNDUP(data, NBPG) < (data + len0))
5091 len = P2ROUNDUP(data, NBPG) - data;
5092 else
5093 len = len0;
5094
5095 VERIFY(len > 0);
5096 VERIFY(m->m_flags & M_EXT);
5097 m->m_data = (void *)data;
5098 m->m_len = len;
5099
5100 *nm = *last = m;
5101 nm = &m->m_next;
5102 m->m_next = NULL;
5103
5104 data0 += len;
5105 len0 -= len;
5106 if (len0 == 0)
5107 break;
5108
5109 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5110 if (n == NULL) {
5111 m_freem(top);
5112 top = *last = NULL;
5113 break;
5114 }
5115
5116 n->m_ext = m->m_ext;
5117 m_incref(m);
5118 n->m_flags |= M_EXT;
5119 m = n;
5120 }
5121 return (top);
5122 }
5123
5124 struct mbuf *
5125 m_normalize(struct mbuf *m)
5126 {
5127 struct mbuf *top = NULL;
5128 struct mbuf **nm = &top;
5129 boolean_t expanded = FALSE;
5130
5131 while (m != NULL) {
5132 struct mbuf *n;
5133
5134 n = m->m_next;
5135 m->m_next = NULL;
5136
5137 /* Does the data cross one or more page boundaries? */
5138 if (MBUF_MULTIPAGES(m)) {
5139 struct mbuf *last;
5140 if ((m = m_expand(m, &last)) == NULL) {
5141 m_freem(n);
5142 m_freem(top);
5143 top = NULL;
5144 break;
5145 }
5146 *nm = m;
5147 nm = &last->m_next;
5148 expanded = TRUE;
5149 } else {
5150 *nm = m;
5151 nm = &m->m_next;
5152 }
5153 m = n;
5154 }
5155 if (expanded)
5156 atomic_add_32(&mb_normalized, 1);
5157 return (top);
5158 }
5159
5160 void
5161 m_mchtype(struct mbuf *m, int t)
5162 {
5163 mtype_stat_inc(t);
5164 mtype_stat_dec(m->m_type);
5165 (m)->m_type = t;
5166 }
5167
5168 void *
5169 m_mtod(struct mbuf *m)
5170 {
5171 return (MTOD(m, void *));
5172 }
5173
5174 struct mbuf *
5175 m_dtom(void *x)
5176 {
5177 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5178 }
5179
5180 void
5181 m_mcheck(struct mbuf *m)
5182 {
5183 _MCHECK(m);
5184 }
5185
5186 /*
5187 * Inform the corresponding mcache(s) that there's a waiter below.
5188 */
5189 static void
5190 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5191 {
5192 mcache_waiter_inc(m_cache(class));
5193 if (comp) {
5194 if (class == MC_CL) {
5195 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5196 } else if (class == MC_BIGCL) {
5197 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5198 } else if (class == MC_16KCL) {
5199 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
5200 } else {
5201 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5202 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5203 }
5204 }
5205 }
5206
5207 /*
5208 * Inform the corresponding mcache(s) that there's no more waiter below.
5209 */
5210 static void
5211 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
5212 {
5213 mcache_waiter_dec(m_cache(class));
5214 if (comp) {
5215 if (class == MC_CL) {
5216 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5217 } else if (class == MC_BIGCL) {
5218 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5219 } else if (class == MC_16KCL) {
5220 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
5221 } else {
5222 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5223 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5224 }
5225 }
5226 }
5227
5228 /*
5229 * Called during blocking allocation. Returns TRUE if one or more objects
5230 * are available at the per-CPU cache layer and that the allocation should be
5231 * retried at that level.
5232 */
5233 static boolean_t
5234 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
5235 {
5236 boolean_t mcache_retry = FALSE;
5237
5238 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5239
5240 /* Check if there's anything at the cache layer */
5241 if (mbuf_cached_above(class, wait)) {
5242 mcache_retry = TRUE;
5243 goto done;
5244 }
5245
5246 /* Nothing? Then try hard to get it from somewhere */
5247 m_reclaim(class, num, (wait & MCR_COMP));
5248
5249 /* We tried hard and got something? */
5250 if (m_infree(class) > 0) {
5251 mbstat.m_wait++;
5252 goto done;
5253 } else if (mbuf_cached_above(class, wait)) {
5254 mbstat.m_wait++;
5255 mcache_retry = TRUE;
5256 goto done;
5257 } else if (wait & MCR_TRYHARD) {
5258 mcache_retry = TRUE;
5259 goto done;
5260 }
5261
5262 /*
5263 * There's really nothing for us right now; inform the
5264 * cache(s) that there is a waiter below and go to sleep.
5265 */
5266 mbuf_waiter_inc(class, (wait & MCR_COMP));
5267
5268 VERIFY(!(wait & MCR_NOSLEEP));
5269 mb_waiters++;
5270 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
5271
5272 /* We are now up; stop getting notified until next round */
5273 mbuf_waiter_dec(class, (wait & MCR_COMP));
5274
5275 /* We waited and got something */
5276 if (m_infree(class) > 0) {
5277 mbstat.m_wait++;
5278 goto done;
5279 } else if (mbuf_cached_above(class, wait)) {
5280 mbstat.m_wait++;
5281 mcache_retry = TRUE;
5282 }
5283 done:
5284 return (mcache_retry);
5285 }
5286
5287 static void
5288 mbuf_worker_thread(void)
5289 {
5290 int mbuf_expand;
5291
5292 while (1) {
5293 lck_mtx_lock(mbuf_mlock);
5294
5295 mbuf_expand = 0;
5296 if (mbuf_expand_mcl) {
5297 int n;
5298
5299 /* Adjust to the current number of clusters in use */
5300 n = mbuf_expand_mcl -
5301 (m_total(MC_CL) - m_infree(MC_CL));
5302 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
5303 n = m_maxlimit(MC_CL) - m_total(MC_CL);
5304 mbuf_expand_mcl = 0;
5305
5306 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
5307 mbuf_expand++;
5308 }
5309 if (mbuf_expand_big) {
5310 int n;
5311
5312 /* Adjust to the current number of 4 KB clusters in use */
5313 n = mbuf_expand_big -
5314 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
5315 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
5316 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
5317 mbuf_expand_big = 0;
5318
5319 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
5320 mbuf_expand++;
5321 }
5322 if (mbuf_expand_16k) {
5323 int n;
5324
5325 /* Adjust to the current number of 16 KB clusters in use */
5326 n = mbuf_expand_16k -
5327 (m_total(MC_16KCL) - m_infree(MC_16KCL));
5328 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
5329 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
5330 mbuf_expand_16k = 0;
5331
5332 if (n > 0)
5333 (void) freelist_populate(MC_16KCL, n, M_WAIT);
5334 }
5335
5336 /*
5337 * Because we can run out of memory before filling the mbuf
5338 * map, we should not allocate more clusters than there are
5339 * mbufs -- otherwise we could have a large number of useless
5340 * clusters allocated.
5341 */
5342 if (mbuf_expand) {
5343 while (m_total(MC_MBUF) <
5344 (m_total(MC_BIGCL) + m_total(MC_CL))) {
5345 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
5346 break;
5347 }
5348 }
5349
5350 lck_mtx_unlock(mbuf_mlock);
5351
5352 assert_wait(&mbuf_worker_run, THREAD_UNINT);
5353 (void) thread_block((thread_continue_t)mbuf_worker_thread);
5354 }
5355 }
5356
5357 static void
5358 mbuf_worker_thread_init(void)
5359 {
5360 mbuf_worker_ready++;
5361 mbuf_worker_thread();
5362 }
5363
5364 static mcl_slab_t *
5365 slab_get(void *buf)
5366 {
5367 mcl_slabg_t *slg;
5368 unsigned int ix, k;
5369
5370 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5371
5372 VERIFY(MBUF_IN_MAP(buf));
5373 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
5374 VERIFY(ix < maxslabgrp);
5375
5376 if ((slg = slabstbl[ix]) == NULL) {
5377 /*
5378 * In the current implementation, we never shrink the memory
5379 * pool (hence the cluster map); if we attempt to reallocate
5380 * a cluster group when it's already allocated, panic since
5381 * this is a sign of a memory corruption (slabstbl[ix] got
5382 * nullified). This also means that there shouldn't be any
5383 * hole in the kernel sub-map for the mbuf pool.
5384 */
5385 ++slabgrp;
5386 VERIFY(ix < slabgrp);
5387 /*
5388 * Slabs expansion can only be done single threaded; when
5389 * we get here, it must be as a result of m_clalloc() which
5390 * is serialized and therefore mb_clalloc_busy must be set.
5391 */
5392 VERIFY(mb_clalloc_busy);
5393 lck_mtx_unlock(mbuf_mlock);
5394
5395 /* This is a new buffer; create the slabs group for it */
5396 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
5397 M_WAITOK | M_ZERO);
5398 VERIFY(slg != NULL);
5399
5400 lck_mtx_lock(mbuf_mlock);
5401 /*
5402 * No other thread could have gone into m_clalloc() after
5403 * we dropped the lock above, so verify that it's true.
5404 */
5405 VERIFY(mb_clalloc_busy);
5406
5407 slabstbl[ix] = slg;
5408
5409 /* Chain each slab in the group to its forward neighbor */
5410 for (k = 1; k < NSLABSPMB; k++)
5411 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
5412 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
5413
5414 /* And chain the last slab in the previous group to this */
5415 if (ix > 0) {
5416 VERIFY(slabstbl[ix - 1]->
5417 slg_slab[NSLABSPMB - 1].sl_next == NULL);
5418 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
5419 &slg->slg_slab[0];
5420 }
5421 }
5422
5423 ix = MTOCL(buf) % NSLABSPMB;
5424 VERIFY(ix < NSLABSPMB);
5425
5426 return (&slg->slg_slab[ix]);
5427 }
5428
5429 static void
5430 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
5431 void *base, void *head, unsigned int len, int refcnt, int chunks)
5432 {
5433 sp->sl_class = class;
5434 sp->sl_flags = flags;
5435 sp->sl_base = base;
5436 sp->sl_head = head;
5437 sp->sl_len = len;
5438 sp->sl_refcnt = refcnt;
5439 sp->sl_chunks = chunks;
5440 slab_detach(sp);
5441 }
5442
5443 static void
5444 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
5445 {
5446 VERIFY(slab_is_detached(sp));
5447 m_slab_cnt(class)++;
5448 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
5449 sp->sl_flags &= ~SLF_DETACHED;
5450 if (class == MC_BIGCL) {
5451 sp = sp->sl_next;
5452 /* Next slab must already be present */
5453 VERIFY(sp != NULL);
5454 VERIFY(slab_is_detached(sp));
5455 sp->sl_flags &= ~SLF_DETACHED;
5456 } else if (class == MC_16KCL) {
5457 int k;
5458 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5459 sp = sp->sl_next;
5460 /* Next slab must already be present */
5461 VERIFY(sp != NULL);
5462 VERIFY(slab_is_detached(sp));
5463 sp->sl_flags &= ~SLF_DETACHED;
5464 }
5465 }
5466 }
5467
5468 static void
5469 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
5470 {
5471 VERIFY(!slab_is_detached(sp));
5472 VERIFY(m_slab_cnt(class) > 0);
5473 m_slab_cnt(class)--;
5474 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
5475 slab_detach(sp);
5476 if (class == MC_BIGCL) {
5477 sp = sp->sl_next;
5478 /* Next slab must already be present */
5479 VERIFY(sp != NULL);
5480 VERIFY(!slab_is_detached(sp));
5481 slab_detach(sp);
5482 } else if (class == MC_16KCL) {
5483 int k;
5484 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5485 sp = sp->sl_next;
5486 /* Next slab must already be present */
5487 VERIFY(sp != NULL);
5488 VERIFY(!slab_is_detached(sp));
5489 slab_detach(sp);
5490 }
5491 }
5492 }
5493
5494 static boolean_t
5495 slab_inrange(mcl_slab_t *sp, void *buf)
5496 {
5497 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
5498 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
5499 }
5500
5501 #undef panic
5502
5503 static void
5504 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
5505 {
5506 int i;
5507 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
5508 uintptr_t buf = (uintptr_t)sp->sl_base;
5509
5510 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
5511 void *next = ((mcache_obj_t *)buf)->obj_next;
5512 if (next != addr)
5513 continue;
5514 if (mclaudit == NULL) {
5515 if (next != NULL && !MBUF_IN_MAP(next)) {
5516 mcache_t *cp = m_cache(sp->sl_class);
5517 panic("%s: %s buffer %p in slab %p modified "
5518 "after free at offset 0: %p out of range "
5519 "[%p-%p)\n", __func__, cp->mc_name,
5520 (void *)buf, sp, next, mbutl, embutl);
5521 /* NOTREACHED */
5522 }
5523 } else {
5524 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
5525 (mcache_obj_t *)buf);
5526 mcl_audit_verify_nextptr(next, mca);
5527 }
5528 }
5529 }
5530
5531 static void
5532 slab_detach(mcl_slab_t *sp)
5533 {
5534 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
5535 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
5536 sp->sl_flags |= SLF_DETACHED;
5537 }
5538
5539 static boolean_t
5540 slab_is_detached(mcl_slab_t *sp)
5541 {
5542 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
5543 (intptr_t)sp->sl_link.tqe_prev == -1 &&
5544 (sp->sl_flags & SLF_DETACHED));
5545 }
5546
5547 static void
5548 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
5549 mcache_obj_t **con_list, size_t con_size, unsigned int num)
5550 {
5551 mcache_audit_t *mca, *mca_tail;
5552 mcache_obj_t *con = NULL;
5553 boolean_t save_contents = (con_list != NULL);
5554 unsigned int i, ix;
5555
5556 ASSERT(num <= NMBPCL);
5557 ASSERT(con_list == NULL || con_size != 0);
5558
5559 ix = MTOCL(buf);
5560 /* Make sure we haven't been here before */
5561 for (i = 0; i < NMBPCL; i++)
5562 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
5563
5564 mca = mca_tail = *mca_list;
5565 if (save_contents)
5566 con = *con_list;
5567
5568 for (i = 0; i < num; i++) {
5569 mcache_audit_t *next;
5570
5571 next = mca->mca_next;
5572 bzero(mca, sizeof (*mca));
5573 mca->mca_next = next;
5574 mclaudit[ix].cl_audit[i] = mca;
5575
5576 /* Attach the contents buffer if requested */
5577 if (save_contents) {
5578 VERIFY(con != NULL);
5579 mca->mca_contents_size = con_size;
5580 mca->mca_contents = con;
5581 con = con->obj_next;
5582 bzero(mca->mca_contents, mca->mca_contents_size);
5583 }
5584
5585 mca_tail = mca;
5586 mca = mca->mca_next;
5587 }
5588
5589 if (save_contents)
5590 *con_list = con;
5591
5592 *mca_list = mca_tail->mca_next;
5593 mca_tail->mca_next = NULL;
5594 }
5595
5596 /*
5597 * Given an address of a buffer (mbuf/cluster/big cluster), return
5598 * the corresponding audit structure for that buffer.
5599 */
5600 static mcache_audit_t *
5601 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
5602 {
5603 mcache_audit_t *mca = NULL;
5604 int ix = MTOCL(o);
5605
5606 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
5607
5608 switch (class) {
5609 case MC_MBUF:
5610 /*
5611 * For the mbuf case, find the index of the cluster
5612 * used by the mbuf and use that index to locate the
5613 * base address of the cluster. Then find out the
5614 * mbuf index relative to the cluster base and use
5615 * it to locate the audit structure.
5616 */
5617 VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
5618 mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
5619 break;
5620
5621 case MC_CL:
5622 case MC_BIGCL:
5623 case MC_16KCL:
5624 /*
5625 * Same as above, but only return the first element.
5626 */
5627 mca = mclaudit[ix].cl_audit[0];
5628 break;
5629
5630 default:
5631 VERIFY(0);
5632 /* NOTREACHED */
5633 }
5634
5635 return (mca);
5636 }
5637
5638 static void
5639 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
5640 boolean_t alloc)
5641 {
5642 struct mbuf *m = addr;
5643 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
5644
5645 VERIFY(mca->mca_contents != NULL &&
5646 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
5647
5648 mcl_audit_verify_nextptr(next, mca);
5649
5650 if (!alloc) {
5651 /* Save constructed mbuf fields */
5652 mcl_audit_save_mbuf(m, mca);
5653 mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
5654 ((mcache_obj_t *)m)->obj_next = next;
5655 return;
5656 }
5657
5658 /* Check if the buffer has been corrupted while in freelist */
5659 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
5660
5661 /* Restore constructed mbuf fields */
5662 mcl_audit_restore_mbuf(m, mca, composite);
5663 }
5664
5665 static void
5666 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
5667 {
5668 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
5669
5670 if (composite) {
5671 struct mbuf *next = m->m_next;
5672 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
5673 MBUF_IS_COMPOSITE(ms));
5674 /*
5675 * We could have hand-picked the mbuf fields and restored
5676 * them individually, but that would be a maintenance
5677 * headache. Instead, restore everything that was saved;
5678 * the mbuf layer will recheck and reinitialize anyway.
5679 */
5680 bcopy(ms, m, mca->mca_contents_size);
5681 m->m_next = next;
5682 } else {
5683 /*
5684 * For a regular mbuf (no cluster attached) there's nothing
5685 * to restore other than the type field, which is expected
5686 * to be MT_FREE.
5687 */
5688 m->m_type = ms->m_type;
5689 }
5690 _MCHECK(m);
5691 }
5692
5693 static void
5694 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
5695 {
5696 _MCHECK(m);
5697 bcopy(m, mca->mca_contents, mca->mca_contents_size);
5698 }
5699
5700 static void
5701 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
5702 boolean_t save_next)
5703 {
5704 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
5705
5706 if (!alloc) {
5707 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
5708 if (save_next) {
5709 mcl_audit_verify_nextptr(next, mca);
5710 ((mcache_obj_t *)addr)->obj_next = next;
5711 }
5712 } else {
5713 /* Check if the buffer has been corrupted while in freelist */
5714 mcl_audit_verify_nextptr(next, mca);
5715 mcache_audit_free_verify_set(mca, addr, 0, size);
5716 }
5717 }
5718
5719 static void
5720 mcl_audit_mcheck_panic(struct mbuf *m)
5721 {
5722 mcache_audit_t *mca;
5723
5724 MRANGE(m);
5725 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
5726
5727 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
5728 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
5729 /* NOTREACHED */
5730 }
5731
5732 static void
5733 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
5734 {
5735 if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
5736 !MBUF_IN_MAP(next)) {
5737 panic("mcl_audit: buffer %p modified after free at offset 0: "
5738 "%p out of range [%p-%p)\n%s\n",
5739 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
5740 /* NOTREACHED */
5741 }
5742 }
5743
5744 SYSCTL_DECL(_kern_ipc);
5745 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
5746 0, 0, mbstat_sysctl, "S,mbstat", "");
5747 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
5748 0, 0, mb_stat_sysctl, "S,mb_stat", "");
5749 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
5750 &mb_normalized, 0, "");