1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80
81 #include <kern/kern_types.h>
82 #include <kern/simple_lock.h>
83 #include <kern/queue.h>
84 #include <kern/sched_prim.h>
85 #include <kern/cpu_number.h>
86
87 #include <libkern/OSAtomic.h>
88 #include <libkern/libkern.h>
89
90 #include <IOKit/IOMapper.h>
91
92 #include <machine/limits.h>
93 #include <machine/machine_routines.h>
94
95 #if CONFIG_MACF_NET
96 #include <security/mac_framework.h>
97 #endif /* CONFIG_MACF_NET */
98
99 #include <sys/mcache.h>
100
101 /*
102 * MBUF IMPLEMENTATION NOTES.
103 *
104 * There is a total of 5 per-CPU caches:
105 *
106 * MC_MBUF:
107 * This is a cache of rudimentary objects of MSIZE in size; each
108 * object represents an mbuf structure. This cache preserves only
109 * the m_type field of the mbuf during its transactions.
110 *
111 * MC_CL:
112 * This is a cache of rudimentary objects of MCLBYTES in size; each
113 * object represents an mcluster structure. This cache does not
114 * preserve the contents of the objects during its transactions.
115 *
116 * MC_BIGCL:
117 * This is a cache of rudimentary objects of NBPG in size; each
118 * object represents an mbigcluster structure. This cache does not
119 * preserve the contents of the objects during its transactions.
120 *
121 * MC_MBUF_CL:
122 * This is a cache of mbufs each having a cluster attached to it.
123 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
124 * fields of the mbuf related to the external cluster are preserved
125 * during transactions.
126 *
127 * MC_MBUF_BIGCL:
128 * This is a cache of mbufs each having a big cluster attached to it.
129 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
130 * fields of the mbuf related to the external cluster are preserved
131 * during transactions.
132 *
133 * OBJECT ALLOCATION:
134 *
135 * Allocation requests are handled first at the per-CPU (mcache) layer
136 * before falling back to the slab layer. Performance is optimal when
137 * the request is satisfied at the CPU layer because global data/lock
138 * never gets accessed. When the slab layer is entered for allocation,
139 * the slab freelist will be checked first for available objects before
140 * the VM backing store is invoked. Slab layer operations are serialized
141 * for all of the caches as the mbuf global lock is held most of the time.
142 * Allocation paths are different depending on the class of objects:
143 *
144 * a. Rudimentary object:
145 *
146 * { m_get_common(), m_clattach(), m_mclget(),
147 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
148 * composite object allocation }
149 * | ^
150 * | |
151 * | +-----------------------+
152 * v |
153 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
154 * | ^
155 * v |
156 * [CPU cache] -------> (found?) -------+
157 * | |
158 * v |
159 * mbuf_slab_alloc() |
160 * | |
161 * v |
162 * +---------> [freelist] -------> (found?) -------+
163 * | |
164 * | v
165 * | m_clalloc()
166 * | |
167 * | v
168 * +---<<---- kmem_mb_alloc()
169 *
170 * b. Composite object:
171 *
172 * { m_getpackets_internal(), m_allocpacket_internal() }
173 * | ^
174 * | |
175 * | +------ (done) ---------+
176 * v |
177 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
178 * | ^
179 * v |
180 * [CPU cache] -------> (found?) -------+
181 * | |
182 * v |
183 * mbuf_cslab_alloc() |
184 * | |
185 * v |
186 * [freelist] -------> (found?) -------+
187 * | |
188 * v |
189 * (rudimentary object) |
190 * mcache_alloc/mcache_alloc_ext() ------>>-----+
191 *
192 * Auditing notes: If auditing is enabled, buffers will be subjected to
193 * integrity checks by the audit routine. This is done by verifying their
194 * contents against DEADBEEF (free) pattern before returning them to caller.
195 * As part of this step, the routine will also record the transaction and
196 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
197 * also restore any constructed data structure fields if necessary.
198 *
199 * OBJECT DEALLOCATION:
200 *
201 * Freeing an object simply involves placing it into the CPU cache; this
202 * pollutes the cache to benefit subsequent allocations. The slab layer
203 * will only be entered if the object is to be purged out of the cache.
204 * During normal operations, this happens only when the CPU layer resizes
205 * its bucket while it's adjusting to the allocation load. Deallocation
206 * paths are different depending on the class of objects:
207 *
208 * a. Rudimentary object:
209 *
210 * { m_free(), m_freem_list(), composite object deallocation }
211 * | ^
212 * | |
213 * | +------ (done) ---------+
214 * v |
215 * mcache_free/mcache_free_ext() |
216 * | |
217 * v |
218 * mbuf_slab_audit() |
219 * | |
220 * v |
221 * [CPU cache] ---> (not purging?) -----+
222 * | |
223 * v |
224 * mbuf_slab_free() |
225 * | |
226 * v |
227 * [freelist] ----------->>------------+
228 * (objects never get purged to VM)
229 *
230 * b. Composite object:
231 *
232 * { m_free(), m_freem_list() }
233 * | ^
234 * | |
235 * | +------ (done) ---------+
236 * v |
237 * mcache_free/mcache_free_ext() |
238 * | |
239 * v |
240 * mbuf_cslab_audit() |
241 * | |
242 * v |
243 * [CPU cache] ---> (not purging?) -----+
244 * | |
245 * v |
246 * mbuf_cslab_free() |
247 * | |
248 * v |
249 * [freelist] ---> (not purging?) -----+
250 * | |
251 * v |
252 * (rudimentary object) |
253 * mcache_free/mcache_free_ext() ------->>------+
254 *
255 * Auditing notes: If auditing is enabled, the audit routine will save
256 * any constructed data structure fields (if necessary) before filling the
257 * contents of the buffers with DEADBEEF (free) pattern and recording the
258 * transaction. Buffers that are freed (whether at CPU or slab layer) are
259 * expected to contain the free pattern.
260 *
261 * DEBUGGING:
262 *
263 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
264 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
265 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
266 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Note
267 * that debugging consumes more CPU and memory.
268 *
269 * Each object is associated with exactly one mcache_audit_t structure that
270 * contains the information related to its last buffer transaction. Given
271 * an address of an object, the audit structure can be retrieved by finding
272 * the position of the object relative to the base address of the cluster:
273 *
274 * +------------+ +=============+
275 * | mbuf addr | | mclaudit[i] |
276 * +------------+ +=============+
277 * | | cl_audit[0] |
278 * i = MTOCL(addr) +-------------+
279 * | +-----> | cl_audit[1] | -----> mcache_audit_t
280 * b = CLTOM(i) | +-------------+
281 * | | | ... |
282 * x = MCLIDX(b, addr) | +-------------+
283 * | | | cl_audit[7] |
284 * +-----------------+ +-------------+
285 * (e.g. x == 1)
286 *
287 * The mclaudit[] array is allocated at initialization time, but its contents
288 * get populated when the corresponding cluster is created. Because a cluster
289 * can be turned into NMBPCL mbufs, we reserve enough space for the mbufs
290 * so that there is a 1-to-1 mapping between them. A cluster that never gets
291 * (or has not yet been) turned into mbufs will use only cl_audit[0], with the
292 * remaining entries unused. For big clusters, only one entry is allocated
293 * and used for the entire cluster pair.
294 */
295
296 /* TODO: should be in header file */
297 /* kernel translator */
298 extern vm_offset_t kmem_mb_alloc(vm_map_t, int);
299 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
300 extern vm_map_t mb_map; /* special map */
301
302 /* Global lock */
303 static lck_mtx_t *mbuf_mlock;
304 static lck_attr_t *mbuf_mlock_attr;
305 static lck_grp_t *mbuf_mlock_grp;
306 static lck_grp_attr_t *mbuf_mlock_grp_attr;
307
308 /* Back-end (common) layer */
309 static void *mbuf_worker_run; /* wait channel for worker thread */
310 static int mbuf_worker_ready; /* worker thread is runnable */
311 static int mbuf_expand_mcl; /* number of cluster creation requests */
312 static int mbuf_expand_big; /* number of big cluster creation requests */
313 static int mbuf_expand_16k; /* number of 16K cluster creation requests */
314 static int ncpu; /* number of CPUs */
315 static int *mcl_paddr; /* Array of cluster physical addresses */
316 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
317 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
318 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
319 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
320 static unsigned int mb_normalized; /* number of packets "normalized" */
321
322 typedef enum {
323 MC_MBUF = 0, /* Regular mbuf */
324 MC_CL, /* Cluster */
325 MC_BIGCL, /* Large (4K) cluster */
326 MC_16KCL, /* Jumbo (16K) cluster */
327 MC_MBUF_CL, /* mbuf + cluster */
328 MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */
329 MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */
330 } mbuf_class_t;
331
332 #define MBUF_CLASS_MIN MC_MBUF
333 #define MBUF_CLASS_MAX MC_MBUF_16KCL
334 #define MBUF_CLASS_LAST MC_16KCL
335 #define MBUF_CLASS_VALID(c) \
336 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
337 #define MBUF_CLASS_COMPOSITE(c) \
338 ((int)(c) > MBUF_CLASS_LAST)
339
340
341 /*
342 * mbuf specific mcache allocation request flags.
343 */
344 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
345
346 /*
347 * Per-cluster slab structure.
348 *
349 * A slab is a cluster control structure that contains one or more object
350 * chunks; the available chunks are chained in the slab's freelist (sl_head).
351 * Each time a chunk is taken out of the slab, the slab's reference count
352 * gets incremented. When all chunks have been taken out, the empty slab
353 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
354 * returned to a slab causes the slab's reference count to be decremented;
355 * it also causes the slab to be reinserted into the class's slab list, if
356 * it is not already there.
357 *
358 * Compartmentalizing the object chunks into slabs allows us to easily
359 * merge one or more slabs together when the adjacent slabs are idle, as
360 * well as to convert or move a slab from one class to another; e.g. the
361 * mbuf cluster slab can be converted to a regular cluster slab when all
362 * mbufs in the slab have been freed.
363 *
364 * A slab may also span multiple clusters for chunks larger than
365 * a cluster's size. In this case, only the slab of the first cluster is
366 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
367 * that they are part of the larger slab.
368 */
369 typedef struct mcl_slab {
370 struct mcl_slab *sl_next; /* neighboring slab */
371 u_int8_t sl_class; /* controlling mbuf class */
372 int8_t sl_refcnt; /* outstanding allocations */
373 int8_t sl_chunks; /* chunks (bufs) in this slab */
374 u_int16_t sl_flags; /* slab flags (see below) */
375 u_int16_t sl_len; /* slab length */
376 void *sl_base; /* base of allocated memory */
377 void *sl_head; /* first free buffer */
378 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
379 } mcl_slab_t;
380
381 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
382 #define SLF_PARTIAL 0x0002 /* part of another slab */
383 #define SLF_DETACHED 0x0004 /* not in slab freelist */
384
385 /*
386 * The array of slabs is broken into groups of arrays per 1MB of kernel
387 * memory to reduce the footprint. Each group is allocated on demand
388 * whenever a new piece of memory mapped in from the VM crosses the 1MB
389 * boundary.
390 */
391 #define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */
392
393 typedef struct mcl_slabg {
394 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
395 } mcl_slabg_t;
396
397 /*
398 * Per-cluster audit structure.
399 */
400 typedef struct {
401 mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */
402 } mcl_audit_t;
403
404 #if CONFIG_MBUF_NOEXPAND
405 static unsigned int maxmbufcl;
406 #endif /* CONFIG_MBUF_NOEXPAND */
407
408 /*
409 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
410 * and m_ext structures. If auditing is enabled, we allocate a shadow
411 * mbuf structure of this size inside each audit structure, and the
412 * contents of the real mbuf gets copied into it when the mbuf is freed.
413 * This allows us to pattern-fill the mbuf for integrity check, and to
414 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
415 * Note that we don't save the contents of clusters when they are freed;
416 * we simply pattern-fill them.
417 */
418 #if defined(__LP64__)
419 #define AUDIT_CONTENTS_SIZE 160
420 #else
421 #define AUDIT_CONTENTS_SIZE 80
422 #endif /* __LP64__ */
423
424 /*
425 * mbuf specific mcache audit flags
426 */
427 #define MB_INUSE 0x01 /* object has not been returned to slab */
428 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
429 #define MB_SCVALID 0x04 /* object has valid saved contents */
430
431 /*
432 * Each of the following two arrays holds up to nmbclusters elements.
433 */
434 static mcl_audit_t *mclaudit; /* array of cluster audit information */
435 static mcl_slabg_t **slabstbl; /* cluster slabs table */
436 static unsigned int maxslabgrp; /* max # of entries in slabs table */
437 static unsigned int slabgrp; /* # of entries in slabs table */
438
439 /* Globals */
440 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
441 int njcl; /* # of clusters for jumbo sizes */
442 int njclbytes; /* size of a jumbo cluster */
443 union mcluster *mbutl; /* first mapped cluster address */
444 union mcluster *embutl; /* ending virtual address of mclusters */
445 int max_linkhdr; /* largest link-level header */
446 int max_protohdr; /* largest protocol header */
447 int max_hdr; /* largest link+protocol header */
448 int max_datalen; /* MHLEN - max_hdr */
449
450 /* TODO: should be in header file */
451 int do_reclaim = 0;
452
453 /* The minimum number of objects that are allocated, to start. */
454 #define MINCL 32
455 #define MINBIGCL (MINCL >> 1)
456 #define MIN16KCL (MINCL >> 2)
457
458 /* Low watermarks (only map in pages once free counts go below) */
459 #define MCL_LOWAT MINCL
460 #define MBIGCL_LOWAT MINBIGCL
461 #define M16KCL_LOWAT MIN16KCL
462
463 typedef struct {
464 mbuf_class_t mtbl_class; /* class type */
465 mcache_t *mtbl_cache; /* mcache for this buffer class */
466 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
467 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
468 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
469 u_int32_t mtbl_maxsize; /* maximum buffer size */
470 int mtbl_minlimit; /* minimum allowed */
471 int mtbl_maxlimit; /* maximum allowed */
472 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
473 } mbuf_table_t;
474
475 #define m_class(c) mbuf_table[c].mtbl_class
476 #define m_cache(c) mbuf_table[c].mtbl_cache
477 #define m_slablist(c) mbuf_table[c].mtbl_slablist
478 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
479 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
480 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
481 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
482 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
483 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
484 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
485 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
486 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
487 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
488 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
489 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
490 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
491 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
492 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
493 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
494 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
495
496 static mbuf_table_t mbuf_table[] = {
497 /*
498 * The caches for mbufs, regular clusters and big clusters.
499 */
500 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
501 NULL, NULL, 0, 0, 0, 0 },
502 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
503 NULL, NULL, 0, 0, 0, 0 },
504 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
505 NULL, NULL, 0, 0, 0, 0 },
506 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
507 NULL, NULL, 0, 0, 0, 0 },
508 /*
509 * The following are special caches; they serve as intermediate
510 * caches backed by the above rudimentary caches. Each object
511 * in the cache is an mbuf with a cluster attached to it. Unlike
512 * the above caches, these intermediate caches do not directly
513 * deal with the slab structures; instead, the constructed
514 * cached elements are simply stored in the freelists.
515 */
516 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
517 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
518 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
519 };
520
521 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
522
523 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
524 static int mb_waiters; /* number of sleepers */
525
526 /* The following are used to serialize m_clalloc() */
527 static boolean_t mb_clalloc_busy;
528 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
529 static int mb_clalloc_waiters;
530
531 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
532 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
533 static void mbuf_table_init(void);
534 static inline void m_incref(struct mbuf *);
535 static inline u_int32_t m_decref(struct mbuf *);
536 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
537 static void mbuf_worker_thread_init(void);
538 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
539 static void slab_free(mbuf_class_t, mcache_obj_t *);
540 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
541 unsigned int, int);
542 static void mbuf_slab_free(void *, mcache_obj_t *, int);
543 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
544 static void mbuf_slab_notify(void *, u_int32_t);
545 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
546 unsigned int);
547 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
548 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
549 unsigned int, int);
550 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
551 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
552 static int freelist_populate(mbuf_class_t, unsigned int, int);
553 static boolean_t mbuf_cached_above(mbuf_class_t, int);
554 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
555 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
556 static int m_howmany(int, size_t);
557 static void mbuf_worker_thread(void);
558 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
559
560 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
561 size_t, unsigned int);
562 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
563 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
564 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
565 boolean_t);
566 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
567 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
568 static void mcl_audit_mcheck_panic(struct mbuf *);
569 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
570
571 static mcl_slab_t *slab_get(void *);
572 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
573 void *, void *, unsigned int, int, int);
574 static void slab_insert(mcl_slab_t *, mbuf_class_t);
575 static void slab_remove(mcl_slab_t *, mbuf_class_t);
576 static boolean_t slab_inrange(mcl_slab_t *, void *);
577 static void slab_nextptr_panic(mcl_slab_t *, void *);
578 static void slab_detach(mcl_slab_t *);
579 static boolean_t slab_is_detached(mcl_slab_t *);
580
581 /*
582 * This flag is set for all mbufs that come out of and into the composite
583 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
584 * are marked with such a flag have clusters attached to them, and will be
585 * treated differently when they are freed; instead of being placed back
586 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
587 * are placed back into the appropriate composite cache's freelist, and the
588 * actual freeing is deferred until the composite objects are purged. At
589 * such a time, this flag will be cleared from the mbufs and the objects
590 * will be freed into their own separate freelists.
591 */
592 #define EXTF_COMPOSITE 0x1
593
594 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
595 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
596 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
597 #define MBUF_IS_COMPOSITE(m) \
598 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
599
600 /*
601 * Macros used to verify the integrity of the mbuf.
602 */
603 #define _MCHECK(m) { \
604 if ((m)->m_type != MT_FREE) { \
605 if (mclaudit == NULL) \
606 panic("MCHECK: m_type=%d m=%p", \
607 (u_int16_t)(m)->m_type, m); \
608 else \
609 mcl_audit_mcheck_panic(m); \
610 } \
611 }
612
613 #define MBUF_IN_MAP(addr) \
614 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
615
616 #define MRANGE(addr) { \
617 if (!MBUF_IN_MAP(addr)) \
618 panic("MRANGE: address out of range 0x%p", addr); \
619 }
620
621 /*
622 * Macro version of mtod.
623 */
624 #define MTOD(m, t) ((t)((m)->m_data))
625
626 /*
627 * Macros to obtain cluster index and base cluster address.
628 */
629 #define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT)
630 #define CLTOM(x) ((union mcluster *)(mbutl + (x)))
631
632 /*
633 * Macro to find the mbuf index relative to the cluster base.
634 */
635 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8)
636
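/*
 * Illustrative sketch (not part of the original file): locating the
 * audit structure for a given mbuf address with the macros above, as
 * described in the implementation notes at the top of this file.  The
 * real lookup is performed by mcl_audit_buf2mca() further below; this
 * helper only walks through the index math and assumes auditing is
 * enabled (mclaudit != NULL).
 */
static inline mcache_audit_t *
mcl_audit_lookup_sketch(void *addr)
{
	int i = MTOCL(addr);		/* index of the owning 2K cluster */
	union mcluster *b = CLTOM(i);	/* base address of that cluster */
	int x = MCLIDX(b, addr);	/* mbuf slot within that cluster */

	return (mclaudit[i].cl_audit[x]);
}
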
637 /*
638 * Macros used during mbuf and cluster initialization.
639 */
640 #define MBUF_INIT(m, pkthdr, type) { \
641 _MCHECK(m); \
642 (m)->m_next = (m)->m_nextpkt = NULL; \
643 (m)->m_len = 0; \
644 (m)->m_type = type; \
645 if ((pkthdr) == 0) { \
646 (m)->m_data = (m)->m_dat; \
647 (m)->m_flags = 0; \
648 } else { \
649 (m)->m_data = (m)->m_pktdat; \
650 (m)->m_flags = M_PKTHDR; \
651 (m)->m_pkthdr.rcvif = NULL; \
652 (m)->m_pkthdr.len = 0; \
653 (m)->m_pkthdr.header = NULL; \
654 (m)->m_pkthdr.csum_flags = 0; \
655 (m)->m_pkthdr.csum_data = 0; \
656 (m)->m_pkthdr.reserved0 = NULL; \
657 (m)->m_pkthdr.vlan_tag = 0; \
658 (m)->m_pkthdr.socket_id = 0; \
659 m_tag_init(m); \
660 } \
661 }
662
663 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
664 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
665 (m)->m_flags |= M_EXT; \
666 (m)->m_ext.ext_size = (size); \
667 (m)->m_ext.ext_free = (free); \
668 (m)->m_ext.ext_arg = (arg); \
669 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
670 &(m)->m_ext.ext_refs; \
671 MEXT_RFA(m) = (rfa); \
672 MEXT_REF(m) = (ref); \
673 MEXT_FLAGS(m) = (flag); \
674 }
675
676 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
677 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
678
679 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
680 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
681
682 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
683 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
684
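/*
 * Illustrative sketch (not part of the original file): how MBUF_INIT
 * and MBUF_CL_INIT above combine to construct an mbuf with a 2K
 * cluster attached.  It assumes the caller has already obtained the
 * mbuf (still MT_FREE, as preserved by the MC_MBUF cache), the cluster
 * buffer and the ext_ref from their respective caches; the composite
 * cache constructor further below performs the equivalent steps.
 */
static inline void
mbuf_cl_construct_sketch(struct mbuf *m, void *buf, struct ext_ref *rfa)
{
	MBUF_INIT(m, 1, MT_DATA);		/* set up as a packet-header mbuf */
	MBUF_CL_INIT(m, buf, rfa, 1, 0);	/* attach the 2K cluster, refcnt 1 */
}
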
685 /*
686 * Macro to convert BSD malloc sleep flag to mcache's
687 */
688 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
689
690 /*
691 * The structure that holds all mbuf class statistics exportable via sysctl.
692 * Similar to mbstat structure, the mb_stat structure is protected by the
693 * global mbuf lock. It contains additional information about the classes
694 * that allows for a more accurate view of the state of the allocator.
695 */
696 struct mb_stat *mb_stat;
697
698 #define MB_STAT_SIZE(n) \
699 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
700
701 /*
702 * The legacy structure holding all of the mbuf allocation statistics.
703 * The actual statistics used by the kernel are stored in the mbuf_table
704 * instead, and are updated atomically while the global mbuf lock is held.
705 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
706 * Unlike before, the kernel no longer relies on the contents of mbstat for
707 * its operations (e.g. cluster expansion) because the structure is exposed
708 * to the outside and could possibly be modified, making it unsafe to rely on.
709 * With the exception of the mbstat.m_mtypes array (see below), all of the
710 * statistics are updated as they change.
711 */
712 struct mbstat mbstat;
713
714 #define MBSTAT_MTYPES_MAX \
715 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
716
717 /*
718 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
719 * atomically and stored in a per-CPU structure which is lock-free; this is
720 * done in order to avoid writing to the global mbstat data structure which
721 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
722 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
723 * array and returned to the application. Any updates for types greater than
724 * or equal to MT_MAX are done atomically on mbstat; this slows down
725 * performance but is okay since the kernel uses only up to MT_MAX-1 while
726 * anything beyond that (up to type 255) is considered a corner case.
727 */
728 typedef struct {
729 unsigned int cpu_mtypes[MT_MAX];
730 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
731
732 typedef struct {
733 mtypes_cpu_t mbs_cpu[1];
734 } mbuf_mtypes_t;
735
736 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
737
738 #define MBUF_MTYPES_SIZE(n) \
739 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
740
741 #define MTYPES_CPU(p) \
742 ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
743
744 /* This should be in a header file */
745 #define atomic_add_32(a, n) ((void) OSAddAtomic(n, (volatile SInt32 *)a))
746
747 #define mtype_stat_add(type, n) { \
748 if ((unsigned)(type) < MT_MAX) { \
749 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
750 atomic_add_32(&mbs->cpu_mtypes[type], n); \
751 } else if ((unsigned)(type) < MBSTAT_MTYPES_MAX) { \
752 atomic_add_32(&mbstat.m_mtypes[type], n); \
753 } \
754 }
755
756 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
757 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
758 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
759
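/*
 * Usage sketch (illustrative, not part of the original file): a typical
 * mbuf type change updates the lock-free per-CPU counters through the
 * macros above, e.g. when a freshly allocated MT_FREE mbuf is handed
 * out as another type.  The allocation and free paths below do this
 * same kind of bookkeeping; mbuf_mtypes must have been set up by
 * mbinit() before these macros are used.
 */
static inline void
mtype_stat_retype_sketch(struct mbuf *m, short newtype)
{
	mtype_stat_dec(m->m_type);	/* one fewer object of the old type */
	mtype_stat_inc(newtype);	/* one more object of the new type */
	m->m_type = newtype;
}
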
760 static int
761 mbstat_sysctl SYSCTL_HANDLER_ARGS
762 {
763 #pragma unused(oidp, arg1, arg2)
764 int m, n;
765 mtypes_cpu_t mtc;
766
767 bzero(&mtc, sizeof (mtc));
768 for (m = 0; m < ncpu; m++) {
769 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
770 mtypes_cpu_t temp;
771
772 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
773 sizeof (temp.cpu_mtypes));
774
775 for (n = 0; n < MT_MAX; n++)
776 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
777 }
778 lck_mtx_lock(mbuf_mlock);
779 for (n = 0; n < MT_MAX; n++)
780 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
781 lck_mtx_unlock(mbuf_mlock);
782
783 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
784 }
785
786 static int
787 mb_stat_sysctl SYSCTL_HANDLER_ARGS
788 {
789 #pragma unused(oidp, arg1, arg2)
790 mcache_t *cp;
791 mcache_cpu_t *ccp;
792 mb_class_stat_t *sp;
793 int k, m, bktsize;
794
795 lck_mtx_lock(mbuf_mlock);
796 for (k = 0; k < NELEM(mbuf_table); k++) {
797 cp = m_cache(k);
798 ccp = &cp->mc_cpu[0];
799 bktsize = ccp->cc_bktsize;
800 sp = mbuf_table[k].mtbl_stats;
801
802 if (cp->mc_flags & MCF_NOCPUCACHE)
803 sp->mbcl_mc_state = MCS_DISABLED;
804 else if (cp->mc_purge_cnt > 0)
805 sp->mbcl_mc_state = MCS_PURGING;
806 else if (bktsize == 0)
807 sp->mbcl_mc_state = MCS_OFFLINE;
808 else
809 sp->mbcl_mc_state = MCS_ONLINE;
810
811 sp->mbcl_mc_cached = 0;
812 for (m = 0; m < ncpu; m++) {
813 ccp = &cp->mc_cpu[m];
814 if (ccp->cc_objs > 0)
815 sp->mbcl_mc_cached += ccp->cc_objs;
816 if (ccp->cc_pobjs > 0)
817 sp->mbcl_mc_cached += ccp->cc_pobjs;
818 }
819 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
820 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
821 sp->mbcl_infree;
822
823 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
824 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
825 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
826
827 /* Calculate total count specific to each class */
828 sp->mbcl_ctotal = sp->mbcl_total;
829 switch (m_class(k)) {
830 case MC_MBUF:
831 /* Deduct mbufs used in composite caches */
832 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
833 m_total(MC_MBUF_BIGCL));
834 break;
835
836 case MC_CL:
837 /* Deduct clusters used in composite cache and mbufs */
838 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
839 (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
840 break;
841
842 case MC_BIGCL:
843 /* Deduct clusters used in composite cache */
844 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
845 break;
846
847 case MC_16KCL:
848 /* Deduct clusters used in composite cache */
849 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
850 break;
851
852 default:
853 break;
854 }
855 }
856 lck_mtx_unlock(mbuf_mlock);
857
858 return (SYSCTL_OUT(req, mb_stat, MB_STAT_SIZE(NELEM(mbuf_table))));
859 }
860
861 static inline void
862 m_incref(struct mbuf *m)
863 {
864 UInt32 old, new;
865 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
866
867 do {
868 old = *addr;
869 new = old + 1;
870 ASSERT(new != 0);
871 } while (!OSCompareAndSwap(old, new, addr));
872 }
873
874 static inline u_int32_t
875 m_decref(struct mbuf *m)
876 {
877 UInt32 old, new;
878 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
879
880 do {
881 old = *addr;
882 new = old - 1;
883 ASSERT(old != 0);
884 } while (!OSCompareAndSwap(old, new, addr));
885
886 return (new);
887 }
888
889 static void
890 mbuf_table_init(void)
891 {
892 int m;
893
894 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
895 M_TEMP, M_WAITOK | M_ZERO);
896 VERIFY(mb_stat != NULL);
897
898 mb_stat->mbs_cnt = NELEM(mbuf_table);
899 for (m = 0; m < NELEM(mbuf_table); m++)
900 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
901
902 #if CONFIG_MBUF_JUMBO
903 /*
904 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
905 * this only on platforms where the jumbo cluster pool is enabled.
906 */
907 njcl = nmbclusters / 3;
908 njclbytes = M16KCLBYTES;
909 #endif /* CONFIG_MBUF_JUMBO */
910
911 /*
912 * nclusters is going to be split in 2 to hold both the 2K
913 * and the 4K pools, so make sure each half is even.
914 */
915 nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);
916 if (njcl > 0) {
917 /*
918 * Each jumbo cluster takes 8 2K clusters, so make
919 * sure that the pool size is evenly divisible by 8.
920 */
921 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
922 }
923
924 #if CONFIG_MBUF_NOEXPAND
925 /* Only use 4k clusters if we're setting aside more than 256k */
926 if (nmbclusters <= 128) {
927 maxmbufcl = nmbclusters / 4;
928 } else {
929 /* Half to big clusters, half to small */
930 maxmbufcl = (nmbclusters / 4) * 3;
931 }
932 #endif /* CONFIG_MBUF_NOEXPAND */
933
934 /*
935 * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th
936 * of the total number of 2K clusters allocated is reserved and cannot
937 * be turned into mbufs. It can only be used for pure cluster objects.
938 */
939 m_minlimit(MC_CL) = (nclusters >> 5);
940 m_maxlimit(MC_CL) = (nclusters >> 1);
941 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
942 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
943
944 /*
945 * The remaining (15/16th) can be turned into mbufs.
946 */
947 m_minlimit(MC_MBUF) = 0;
948 m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
949 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
950 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
951
952 /*
953 * The other 1/2 of the map is reserved for 4K clusters.
954 */
955 m_minlimit(MC_BIGCL) = 0;
956 m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
957 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
958 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
959
960 /*
961 * Set limits for the composite classes.
962 */
963 m_minlimit(MC_MBUF_CL) = 0;
964 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
965 m_maxsize(MC_MBUF_CL) = MCLBYTES;
966 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
967 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
968
969 m_minlimit(MC_MBUF_BIGCL) = 0;
970 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
971 m_maxsize(MC_MBUF_BIGCL) = NBPG;
972 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
973 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
974
975 /*
976 * And for jumbo classes.
977 */
978 m_minlimit(MC_16KCL) = 0;
979 m_maxlimit(MC_16KCL) = (njcl >> 3);
980 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
981 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
982
983 m_minlimit(MC_MBUF_16KCL) = 0;
984 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
985 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
986 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
987 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
988
989 /*
990 * Initialize the legacy mbstat structure.
991 */
992 bzero(&mbstat, sizeof (mbstat));
993 mbstat.m_msize = m_maxsize(MC_MBUF);
994 mbstat.m_mclbytes = m_maxsize(MC_CL);
995 mbstat.m_minclsize = MINCLSIZE;
996 mbstat.m_mlen = MLEN;
997 mbstat.m_mhlen = MHLEN;
998 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
999 }
1000
1001 __private_extern__ void
1002 mbinit(void)
1003 {
1004 unsigned int m;
1005 int initmcl = MINCL;
1006 int mcl_pages;
1007 void *buf;
1008
1009 if (nmbclusters == 0)
1010 nmbclusters = NMBCLUSTERS;
1011
1012 /* Setup the mbuf table */
1013 mbuf_table_init();
1014
1015 /* Global lock for common layer */
1016 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1017 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1018 mbuf_mlock_attr = lck_attr_alloc_init();
1019 mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);
1020
1021 /* Allocate cluster slabs table */
1022 maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
1023 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1024 M_TEMP, M_WAITOK | M_ZERO);
1025 VERIFY(slabstbl != NULL);
1026
1027 /* Allocate audit structures if needed */
1028 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1029 mbuf_debug |= mcache_getflags();
1030 if (mbuf_debug & MCF_AUDIT) {
1031 MALLOC(mclaudit, mcl_audit_t *,
1032 nmbclusters * sizeof (*mclaudit), M_TEMP,
1033 M_WAITOK | M_ZERO);
1034 VERIFY(mclaudit != NULL);
1035
1036 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1037 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1038 VERIFY(mcl_audit_con_cache != NULL);
1039 }
1040
1041 /* Calculate the number of pages assigned to the cluster pool */
1042 mcl_pages = nmbclusters/(NBPG/CLBYTES);
1043 MALLOC(mcl_paddr, int *, mcl_pages * sizeof (int), M_TEMP, M_WAITOK);
1044 VERIFY(mcl_paddr != NULL);
1045
1046 /* Register with the I/O Bus mapper */
1047 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1048 bzero((char *)mcl_paddr, mcl_pages * sizeof (int));
1049
1050 embutl = (union mcluster *)
1051 ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));
1052
1053 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1054
1055 lck_mtx_lock(mbuf_mlock);
1056
1057 if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
1058 panic("mbinit: m_clalloc failed\n");
1059
1060 lck_mtx_unlock(mbuf_mlock);
1061
1062 (void) kernel_thread(kernel_task, mbuf_worker_thread_init);
1063
1064 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1065 0, 0, MCR_SLEEP);
1066
1067 /* Create the cache for each class */
1068 for (m = 0; m < NELEM(mbuf_table); m++) {
1069 void *allocfunc, *freefunc, *auditfunc;
1070 u_int32_t flags;
1071
1072 flags = mbuf_debug;
1073 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1074 m_class(m) == MC_MBUF_16KCL) {
1075 allocfunc = mbuf_cslab_alloc;
1076 freefunc = mbuf_cslab_free;
1077 auditfunc = mbuf_cslab_audit;
1078 } else {
1079 allocfunc = mbuf_slab_alloc;
1080 freefunc = mbuf_slab_free;
1081 auditfunc = mbuf_slab_audit;
1082 }
1083
1084 /*
1085 * Disable per-CPU caches for jumbo classes if there
1086 * is no jumbo cluster pool available in the system.
1087 * The cache itself is still created (but will never
1088 * be populated) since it simplifies the code.
1089 */
1090 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1091 njcl == 0)
1092 flags |= MCF_NOCPUCACHE;
1093
1094 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1095 allocfunc, freefunc, auditfunc, mbuf_slab_notify,
1096 (void *)m, flags, MCR_SLEEP);
1097 }
1098
1099 /*
1100 * Allocate structure for per-CPU statistics that's aligned
1101 * on the CPU cache boundary; this code assumes that we never
1102 * uninitialize this framework, since the original address
1103 * before alignment is not saved.
1104 */
1105 ncpu = ml_get_max_cpus();
1106 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1107 M_TEMP, M_WAITOK);
1108 VERIFY(buf != NULL);
1109
1110 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1111 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1112
1113 printf("mbinit: done\n");
1114 }
1115
1116 /*
1117 * Obtain a slab of object(s) from the class's freelist.
1118 */
1119 static mcache_obj_t *
1120 slab_alloc(mbuf_class_t class, int wait)
1121 {
1122 mcl_slab_t *sp;
1123 mcache_obj_t *buf;
1124
1125 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1126
1127 VERIFY(class != MC_16KCL || njcl > 0);
1128
1129 /* This should always be NULL for us */
1130 VERIFY(m_cobjlist(class) == NULL);
1131
1132 /*
1133 * Treat composite objects as having a longer lifespan by using
1134 * a slab from the reverse direction, in the hope that this could
1135 * reduce the probability of fragmentation for slabs that hold
1136 * more than one buffer chunk (e.g. mbuf slabs). For other
1137 * slabs, this probably doesn't make much of a difference.
1138 */
1139 if (class == MC_MBUF && (wait & MCR_COMP))
1140 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1141 else
1142 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1143
1144 if (sp == NULL) {
1145 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1146 /* The slab list for this class is empty */
1147 return (NULL);
1148 }
1149
1150 VERIFY(m_infree(class) > 0);
1151 VERIFY(!slab_is_detached(sp));
1152 VERIFY(sp->sl_class == class &&
1153 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1154 buf = sp->sl_head;
1155 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1156
1157 if (class == MC_MBUF) {
1158 sp->sl_head = buf->obj_next;
1159 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
1160 } else {
1161 sp->sl_head = NULL;
1162 }
1163 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1164 slab_nextptr_panic(sp, sp->sl_head);
1165 /* In case sl_head is in the map but not in the slab */
1166 VERIFY(slab_inrange(sp, sp->sl_head));
1167 /* NOTREACHED */
1168 }
1169
1170 /* Increment slab reference */
1171 sp->sl_refcnt++;
1172
1173 if (mclaudit != NULL) {
1174 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1175 mca->mca_uflags = 0;
1176 /* Save contents on mbuf objects only */
1177 if (class == MC_MBUF)
1178 mca->mca_uflags |= MB_SCVALID;
1179 }
1180
1181 if (class == MC_CL) {
1182 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1183 /*
1184 * A 2K cluster slab can have at most 1 reference.
1185 */
1186 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1187 sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
1188 } else if (class == MC_BIGCL) {
1189 mcl_slab_t *nsp = sp->sl_next;
1190 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1191 m_infree(MC_MBUF_BIGCL);
1192 /*
1193 * Increment 2nd slab. A 4K big cluster takes
1194 * 2 slabs, each having at most 1 reference.
1195 */
1196 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1197 sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
1198 /* Next slab must already be present */
1199 VERIFY(nsp != NULL);
1200 nsp->sl_refcnt++;
1201 VERIFY(!slab_is_detached(nsp));
1202 VERIFY(nsp->sl_class == MC_BIGCL &&
1203 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1204 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1205 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1206 nsp->sl_head == NULL);
1207 } else if (class == MC_16KCL) {
1208 mcl_slab_t *nsp;
1209 int k;
1210
1211 --m_infree(MC_16KCL);
1212 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1213 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1214 /*
1215 * Increment 2nd-8th slab. A 16K big cluster takes
1216 * 8 cluster slabs, each having at most 1 reference.
1217 */
1218 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1219 nsp = nsp->sl_next;
1220 /* Next slab must already be present */
1221 VERIFY(nsp != NULL);
1222 nsp->sl_refcnt++;
1223 VERIFY(!slab_is_detached(nsp));
1224 VERIFY(nsp->sl_class == MC_16KCL &&
1225 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1226 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1227 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1228 nsp->sl_head == NULL);
1229 }
1230 } else {
1231 ASSERT(class == MC_MBUF);
1232 --m_infree(MC_MBUF);
1233 /*
1234 * If auditing is turned on, this check is
1235 * deferred until later in mbuf_slab_audit().
1236 */
1237 if (mclaudit == NULL)
1238 _MCHECK((struct mbuf *)buf);
1239 /*
1240 * Since we have incremented the reference count above,
1241 * an mbuf slab (formerly a 2K cluster slab that was cut
1242 * up into mbufs) must have a reference count between 1
1243 * and NMBPCL at this point.
1244 */
1245 VERIFY(sp->sl_refcnt >= 1 &&
1246 (unsigned short)sp->sl_refcnt <= NMBPCL &&
1247 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1248 VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
1249 sp->sl_head == NULL);
1250 }
1251
1252 /* If empty, remove this slab from the class's freelist */
1253 if (sp->sl_head == NULL) {
1254 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
1255 slab_remove(sp, class);
1256 }
1257
1258 return (buf);
1259 }
1260
1261 /*
1262 * Place a slab of object(s) back into a class's slab list.
1263 */
1264 static void
1265 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1266 {
1267 mcl_slab_t *sp;
1268
1269 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1270
1271 VERIFY(class != MC_16KCL || njcl > 0);
1272 VERIFY(buf->obj_next == NULL);
1273 sp = slab_get(buf);
1274 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1275 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1276
1277 /* Decrement slab reference */
1278 sp->sl_refcnt--;
1279
1280 if (class == MC_CL || class == MC_BIGCL) {
1281 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1282 /*
1283 * A 2K cluster slab can have at most 1 reference
1284 * which must be 0 at this point.
1285 */
1286 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1287 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1288 VERIFY(slab_is_detached(sp));
1289 if (class == MC_BIGCL) {
1290 mcl_slab_t *nsp = sp->sl_next;
1291 VERIFY(IS_P2ALIGNED(buf, NBPG));
1292 /* Next slab must already be present */
1293 VERIFY(nsp != NULL);
1294 /* Decrement 2nd slab reference */
1295 nsp->sl_refcnt--;
1296 /*
1297 * A 4K big cluster takes 2 slabs, both
1298 * must now have 0 reference.
1299 */
1300 VERIFY(slab_is_detached(nsp));
1301 VERIFY(nsp->sl_class == MC_BIGCL &&
1302 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1303 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1304 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1305 nsp->sl_head == NULL);
1306 }
1307 } else if (class == MC_16KCL) {
1308 mcl_slab_t *nsp;
1309 int k;
1310 /*
1311 * A 16K cluster takes 8 cluster slabs, all must
1312 * now have 0 reference.
1313 */
1314 VERIFY(IS_P2ALIGNED(buf, NBPG));
1315 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1316 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1317 VERIFY(slab_is_detached(sp));
1318 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1319 nsp = nsp->sl_next;
1320 /* Next slab must already be present */
1321 VERIFY(nsp != NULL);
1322 nsp->sl_refcnt--;
1323 VERIFY(slab_is_detached(nsp));
1324 VERIFY(nsp->sl_class == MC_16KCL &&
1325 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1326 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1327 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1328 nsp->sl_head == NULL);
1329 }
1330 } else {
1331 /*
1332 * An mbuf slab has a total of NMBPCL reference counts.
1333 * Since we have decremented the reference above, it
1334 * must now be between 0 and NMBPCL-1.
1335 */
1336 VERIFY(sp->sl_refcnt >= 0 &&
1337 (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
1338 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1339 VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
1340 (slab_is_detached(sp) && sp->sl_head == NULL));
1341 }
1342
1343 /*
1344 * When auditing is enabled, ensure that the buffer still
1345 * contains the free pattern. Otherwise it got corrupted
1346 * while at the CPU cache layer.
1347 */
1348 if (mclaudit != NULL) {
1349 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1350 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1351 mca->mca_uflags &= ~MB_SCVALID;
1352 }
1353
1354 if (class == MC_CL) {
1355 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1356 } else if (class == MC_BIGCL) {
1357 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1358 m_infree(MC_MBUF_BIGCL);
1359 } else if (class == MC_16KCL) {
1360 ++m_infree(MC_16KCL);
1361 } else {
1362 ++m_infree(MC_MBUF);
1363 buf->obj_next = sp->sl_head;
1364 }
1365 sp->sl_head = buf;
1366
1367 /* All mbufs are freed; return the cluster that we stole earlier */
1368 if (sp->sl_refcnt == 0 && class == MC_MBUF) {
1369 int i = NMBPCL;
1370
1371 m_total(MC_MBUF) -= NMBPCL;
1372 mbstat.m_mbufs = m_total(MC_MBUF);
1373 m_infree(MC_MBUF) -= NMBPCL;
1374 mtype_stat_add(MT_FREE, -NMBPCL);
1375
1376 while (i--) {
1377 struct mbuf *m = sp->sl_head;
1378 VERIFY(m != NULL);
1379 sp->sl_head = m->m_next;
1380 m->m_next = NULL;
1381 }
1382 VERIFY(sp->sl_head == NULL);
1383
1384 /* Remove the slab from the mbuf class's slab list */
1385 slab_remove(sp, class);
1386
1387 /* Reinitialize it as a 2K cluster slab */
1388 slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
1389 sp->sl_len, 0, 1);
1390
1391 if (mclaudit != NULL)
1392 mcache_set_pattern(MCACHE_FREE_PATTERN,
1393 (caddr_t)sp->sl_head, m_maxsize(MC_CL));
1394
1395 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1396
1397 VERIFY(slab_is_detached(sp));
1398 /* And finally switch class */
1399 class = MC_CL;
1400 }
1401
1402 /* Reinsert the slab to the class's slab list */
1403 if (slab_is_detached(sp))
1404 slab_insert(sp, class);
1405 }
1406
1407 /*
1408 * Common allocator for rudimentary objects called by the CPU cache layer
1409 * during an allocation request whenever there is no available element in the
1410 * bucket layer. It returns one or more elements from the appropriate global
1411 * freelist. If the freelist is empty, it will attempt to populate it and
1412 * retry the allocation.
1413 */
1414 static unsigned int
1415 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1416 {
1417 mbuf_class_t class = (mbuf_class_t)arg;
1418 unsigned int need = num;
1419 mcache_obj_t **list = *plist;
1420
1421 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1422 ASSERT(need > 0);
1423
1424 lck_mtx_lock(mbuf_mlock);
1425
1426 for (;;) {
1427 if ((*list = slab_alloc(class, wait)) != NULL) {
1428 (*list)->obj_next = NULL;
1429 list = *plist = &(*list)->obj_next;
1430
1431 if (--need == 0) {
1432 /*
1433 * If the number of elements in freelist has
1434 * dropped below low watermark, asynchronously
1435 * populate the freelist now rather than doing
1436 * it later when we run out of elements.
1437 */
1438 if (!mbuf_cached_above(class, wait) &&
1439 m_infree(class) < m_total(class) >> 5) {
1440 (void) freelist_populate(class, 1,
1441 M_DONTWAIT);
1442 }
1443 break;
1444 }
1445 } else {
1446 VERIFY(m_infree(class) == 0 || class == MC_CL);
1447
1448 (void) freelist_populate(class, 1,
1449 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1450
1451 if (m_infree(class) > 0)
1452 continue;
1453
1454 /* Check if there's anything at the cache layer */
1455 if (mbuf_cached_above(class, wait))
1456 break;
1457
1458 /* We have nothing and cannot block; give up */
1459 if (wait & MCR_NOSLEEP) {
1460 if (!(wait & MCR_TRYHARD)) {
1461 m_fail_cnt(class)++;
1462 mbstat.m_drops++;
1463 break;
1464 }
1465 }
1466
1467 /*
1468 * If the freelist is still empty and the caller is
1469 * willing to be blocked, sleep on the wait channel
1470 * until an element is available. Otherwise, if
1471 * MCR_TRYHARD is set, do our best to satisfy the
1472 * request without having to go to sleep.
1473 */
1474 if (mbuf_worker_ready &&
1475 mbuf_sleep(class, need, wait))
1476 break;
1477
1478 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1479 }
1480 }
1481
1482 m_alloc_cnt(class) += num - need;
1483 lck_mtx_unlock(mbuf_mlock);
1484
1485 return (num - need);
1486 }
1487
1488 /*
1489 * Common de-allocator for rudimentary objects called by the CPU cache
1490 * layer when one or more elements need to be returned to the appropriate
1491 * global freelist.
1492 */
1493 static void
1494 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1495 {
1496 mbuf_class_t class = (mbuf_class_t)arg;
1497 mcache_obj_t *nlist;
1498 unsigned int num = 0;
1499 int w;
1500
1501 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1502
1503 lck_mtx_lock(mbuf_mlock);
1504
1505 for (;;) {
1506 nlist = list->obj_next;
1507 list->obj_next = NULL;
1508 slab_free(class, list);
1509 ++num;
1510 if ((list = nlist) == NULL)
1511 break;
1512 }
1513 m_free_cnt(class) += num;
1514
1515 if ((w = mb_waiters) > 0)
1516 mb_waiters = 0;
1517
1518 lck_mtx_unlock(mbuf_mlock);
1519
1520 if (w != 0)
1521 wakeup(mb_waitchan);
1522 }
1523
1524 /*
1525 * Common auditor for rudimentary objects called by the CPU cache layer
1526 * during an allocation or free request. For the former, this is called
1527 * after the objects are obtained from either the bucket or slab layer
1528 * and before they are returned to the caller. For the latter, this is
1529 * called immediately during free and before placing the objects into
1530 * the bucket or slab layer.
1531 */
1532 static void
1533 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1534 {
1535 mbuf_class_t class = (mbuf_class_t)arg;
1536 mcache_audit_t *mca;
1537
1538 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1539
1540 while (list != NULL) {
1541 lck_mtx_lock(mbuf_mlock);
1542 mca = mcl_audit_buf2mca(class, list);
1543
1544 /* Do the sanity checks */
1545 if (class == MC_MBUF) {
1546 mcl_audit_mbuf(mca, list, FALSE, alloc);
1547 ASSERT(mca->mca_uflags & MB_SCVALID);
1548 } else {
1549 mcl_audit_cluster(mca, list, m_maxsize(class),
1550 alloc, TRUE);
1551 ASSERT(!(mca->mca_uflags & MB_SCVALID));
1552 }
1553 /* Record this transaction */
1554 mcache_buffer_log(mca, list, m_cache(class));
1555 if (alloc)
1556 mca->mca_uflags |= MB_INUSE;
1557 else
1558 mca->mca_uflags &= ~MB_INUSE;
1559 /* Unpair the object (unconditionally) */
1560 mca->mca_uptr = NULL;
1561 lck_mtx_unlock(mbuf_mlock);
1562
1563 list = list->obj_next;
1564 }
1565 }
1566
1567 /*
1568 * Common notify routine for all caches. It is called by mcache when
1569 * one or more objects get freed. We use this indication to trigger
1570 * the wakeup of any sleeping threads so that they can retry their
1571 * allocation requests.
1572 */
1573 static void
1574 mbuf_slab_notify(void *arg, u_int32_t reason)
1575 {
1576 mbuf_class_t class = (mbuf_class_t)arg;
1577 int w;
1578
1579 ASSERT(MBUF_CLASS_VALID(class));
1580
1581 if (reason != MCN_RETRYALLOC)
1582 return;
1583
1584 lck_mtx_lock(mbuf_mlock);
1585 if ((w = mb_waiters) > 0) {
1586 m_notified(class)++;
1587 mb_waiters = 0;
1588 }
1589 lck_mtx_unlock(mbuf_mlock);
1590
1591 if (w != 0)
1592 wakeup(mb_waitchan);
1593 }
1594
1595 /*
1596 * Obtain object(s) from the composite class's freelist.
1597 */
1598 static unsigned int
1599 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
1600 {
1601 unsigned int need = num;
1602 mcl_slab_t *sp, *clsp, *nsp;
1603 struct mbuf *m;
1604 mcache_obj_t **list = *plist;
1605 void *cl;
1606
1607 VERIFY(need > 0);
1608 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1609 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1610
1611 /* Get what we can from the freelist */
1612 while ((*list = m_cobjlist(class)) != NULL) {
1613 MRANGE(*list);
1614
1615 m = (struct mbuf *)*list;
1616 sp = slab_get(m);
1617 cl = m->m_ext.ext_buf;
1618 clsp = slab_get(cl);
1619 VERIFY(m->m_flags == M_EXT && cl != NULL);
1620 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
1621 VERIFY(clsp->sl_refcnt == 1);
1622 if (class == MC_MBUF_BIGCL) {
1623 nsp = clsp->sl_next;
1624 /* Next slab must already be present */
1625 VERIFY(nsp != NULL);
1626 VERIFY(nsp->sl_refcnt == 1);
1627 } else if (class == MC_MBUF_16KCL) {
1628 int k;
1629 for (nsp = clsp, k = 1;
1630 k < (M16KCLBYTES / MCLBYTES); k++) {
1631 nsp = nsp->sl_next;
1632 /* Next slab must already be present */
1633 VERIFY(nsp != NULL);
1634 VERIFY(nsp->sl_refcnt == 1);
1635 }
1636 }
1637
1638 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
1639 !MBUF_IN_MAP(m_cobjlist(class))) {
1640 slab_nextptr_panic(sp, m_cobjlist(class));
1641 /* NOTREACHED */
1642 }
1643 (*list)->obj_next = NULL;
1644 list = *plist = &(*list)->obj_next;
1645
1646 if (--need == 0)
1647 break;
1648 }
1649 m_infree(class) -= (num - need);
1650
1651 return (num - need);
1652 }
1653
1654 /*
1655 * Place object(s) back into a composite class's freelist.
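 * When the objects are being purged, they are instead torn apart: the mbuf
 * and its cluster go back to their rudimentary slab freelists, and the
 * ext_ref structures are returned to ref_cache.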
1656 */
1657 static unsigned int
1658 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
1659 {
1660 mcache_obj_t *o, *tail;
1661 unsigned int num = 0;
1662 struct mbuf *m, *ms;
1663 mcache_audit_t *mca = NULL;
1664 mcache_obj_t *ref_list = NULL;
1665 mcl_slab_t *clsp, *nsp;
1666 void *cl;
1667
1668 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1669 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1670 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1671
1672 o = tail = list;
1673
1674 while ((m = ms = (struct mbuf *)o) != NULL) {
1675 mcache_obj_t *rfa, *nexto = o->obj_next;
1676
1677 /* Do the mbuf sanity checks */
1678 if (mclaudit != NULL) {
1679 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1680 mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF));
1681 ms = (struct mbuf *)mca->mca_contents;
1682 }
1683
1684 /* Do the cluster sanity checks */
1685 cl = ms->m_ext.ext_buf;
1686 clsp = slab_get(cl);
1687 if (mclaudit != NULL) {
1688 size_t size;
1689 if (class == MC_MBUF_CL)
1690 size = m_maxsize(MC_CL);
1691 else if (class == MC_MBUF_BIGCL)
1692 size = m_maxsize(MC_BIGCL);
1693 else
1694 size = m_maxsize(MC_16KCL);
1695 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL,
1696 (mcache_obj_t *)cl), cl, 0, size);
1697 }
1698 VERIFY(ms->m_type == MT_FREE);
1699 VERIFY(ms->m_flags == M_EXT);
1700 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1701 VERIFY(clsp->sl_refcnt == 1);
1702 if (class == MC_MBUF_BIGCL) {
1703 nsp = clsp->sl_next;
1704 /* Next slab must already be present */
1705 VERIFY(nsp != NULL);
1706 VERIFY(nsp->sl_refcnt == 1);
1707 } else if (class == MC_MBUF_16KCL) {
1708 int k;
1709 for (nsp = clsp, k = 1;
1710 k < (M16KCLBYTES / MCLBYTES); k++) {
1711 nsp = nsp->sl_next;
1712 /* Next slab must already be present */
1713 VERIFY(nsp != NULL);
1714 VERIFY(nsp->sl_refcnt == 1);
1715 }
1716 }
1717
1718 /*
1719 * If we're asked to purge, restore the actual mbuf using the
1720 * contents of the shadow structure (if auditing is enabled)
1721 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
1722 * about to free it and the attached cluster into their caches.
1723 */
1724 if (purged) {
1725 /* Restore constructed mbuf fields */
1726 if (mclaudit != NULL)
1727 mcl_audit_restore_mbuf(m, mca, TRUE);
1728
1729 MEXT_REF(m) = 0;
1730 MEXT_FLAGS(m) = 0;
1731
1732 rfa = (mcache_obj_t *)MEXT_RFA(m);
1733 rfa->obj_next = ref_list;
1734 ref_list = rfa;
1735 MEXT_RFA(m) = NULL;
1736
1737 m->m_type = MT_FREE;
1738 m->m_flags = m->m_len = 0;
1739 m->m_next = m->m_nextpkt = NULL;
1740
1741 /* Save mbuf fields and make auditing happy */
1742 if (mclaudit != NULL)
1743 mcl_audit_mbuf(mca, o, FALSE, FALSE);
1744
1745 VERIFY(m_total(class) > 0);
1746 m_total(class)--;
1747
1748 /* Free the mbuf */
1749 o->obj_next = NULL;
1750 slab_free(MC_MBUF, o);
1751
1752 /* And free the cluster */
1753 ((mcache_obj_t *)cl)->obj_next = NULL;
1754 if (class == MC_MBUF_CL)
1755 slab_free(MC_CL, cl);
1756 else if (class == MC_MBUF_BIGCL)
1757 slab_free(MC_BIGCL, cl);
1758 else
1759 slab_free(MC_16KCL, cl);
1760 }
1761
1762 ++num;
1763 tail = o;
1764 o = nexto;
1765 }
1766
1767 if (!purged) {
1768 tail->obj_next = m_cobjlist(class);
1769 m_cobjlist(class) = list;
1770 m_infree(class) += num;
1771 } else if (ref_list != NULL) {
1772 mcache_free_ext(ref_cache, ref_list);
1773 }
1774
1775 return (num);
1776 }
1777
1778 /*
1779 * Common allocator for composite objects called by the CPU cache layer
1780 * during an allocation request whenever there is no available element in
1781 * the bucket layer. It returns one or more composite elements from the
1782 * appropriate global freelist. If the freelist is empty, it will attempt
1783 * to obtain the rudimentary objects from their caches and construct them
1784 * into composite mbuf + cluster objects.
1785 */
1786 static unsigned int
1787 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
1788 int wait)
1789 {
1790 mbuf_class_t class = (mbuf_class_t)arg;
1791 mcache_t *cp = NULL;
1792 unsigned int num = 0, cnum = 0, want = needed;
1793 mcache_obj_t *ref_list = NULL;
1794 mcache_obj_t *mp_list = NULL;
1795 mcache_obj_t *clp_list = NULL;
1796 mcache_obj_t **list;
1797 struct ext_ref *rfa;
1798 struct mbuf *m;
1799 void *cl;
1800
1801 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1802 ASSERT(needed > 0);
1803
1804 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1805
1806 /* There should not be any slab for this class */
1807 VERIFY(m_slab_cnt(class) == 0 &&
1808 m_slablist(class).tqh_first == NULL &&
1809 m_slablist(class).tqh_last == NULL);
1810
1811 lck_mtx_lock(mbuf_mlock);
1812
1813 /* Try using the freelist first */
1814 num = cslab_alloc(class, plist, needed);
1815 list = *plist;
1816 if (num == needed) {
1817 m_alloc_cnt(class) += num;
1818 lck_mtx_unlock(mbuf_mlock);
1819 return (needed);
1820 }
1821
1822 lck_mtx_unlock(mbuf_mlock);
1823
1824 /*
1825 * We could not satisfy the request using the freelist alone;
1826 * allocate from the appropriate rudimentary caches and use
1827 * whatever we can get to construct the composite objects.
1828 */
1829 needed -= num;
1830
1831 /*
1832 * Mark these allocation requests as coming from a composite cache.
1833 * Also, if the caller is willing to be blocked, mark the request
1834 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
1835 * slab layer waiting for the individual object when one or more
1836 * of the already-constructed composite objects are available.
1837 */
1838 wait |= MCR_COMP;
1839 if (!(wait & MCR_NOSLEEP))
1840 wait |= MCR_FAILOK;
1841
1842 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
1843 if (needed == 0) {
1844 ASSERT(mp_list == NULL);
1845 goto fail;
1846 }
1847 if (class == MC_MBUF_CL)
1848 cp = m_cache(MC_CL);
1849 else if (class == MC_MBUF_BIGCL)
1850 cp = m_cache(MC_BIGCL);
1851 else
1852 cp = m_cache(MC_16KCL);
1853 needed = mcache_alloc_ext(cp, &clp_list, needed, wait);
1854 if (needed == 0) {
1855 ASSERT(clp_list == NULL);
1856 goto fail;
1857 }
1858 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
1859 if (needed == 0) {
1860 ASSERT(ref_list == NULL);
1861 goto fail;
1862 }
1863
1864 /*
1865 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
1866 * overs will get freed accordingly before we return to caller.
1867 */
1868 for (cnum = 0; cnum < needed; cnum++) {
1869 struct mbuf *ms;
1870
1871 m = ms = (struct mbuf *)mp_list;
1872 mp_list = mp_list->obj_next;
1873
1874 cl = clp_list;
1875 clp_list = clp_list->obj_next;
1876 ((mcache_obj_t *)cl)->obj_next = NULL;
1877
1878 rfa = (struct ext_ref *)ref_list;
1879 ref_list = ref_list->obj_next;
1880 ((mcache_obj_t *)rfa)->obj_next = NULL;
1881
1882 /*
1883 * If auditing is enabled, construct the shadow mbuf
1884 * in the audit structure instead of in the actual one.
1885 * mbuf_cslab_audit() will take care of restoring the
1886 * contents after the integrity check.
1887 */
1888 if (mclaudit != NULL) {
1889 mcache_audit_t *mca, *cl_mca;
1890 size_t size;
1891
1892 lck_mtx_lock(mbuf_mlock);
1893 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1894 ms = ((struct mbuf *)mca->mca_contents);
1895 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
1896
1897 /*
1898 * Pair them up. Note that this is done at the time
1899 * the mbuf+cluster objects are constructed. This
1900 * information should be treated as a "best effort"
1901 * debugging hint, since more than one mbuf can refer
1902 * to a cluster. In that case, the cluster might not
1903 * be freed along with the mbuf it was paired with.
1904 */
1905 mca->mca_uptr = cl_mca;
1906 cl_mca->mca_uptr = mca;
1907
1908 ASSERT(mca->mca_uflags & MB_SCVALID);
1909 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
1910 lck_mtx_unlock(mbuf_mlock);
1911
1912 /* Technically, they are in the freelist */
1913 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
1914 m_maxsize(MC_MBUF));
1915 if (class == MC_MBUF_CL)
1916 size = m_maxsize(MC_CL);
1917 else if (class == MC_MBUF_BIGCL)
1918 size = m_maxsize(MC_BIGCL);
1919 else
1920 size = m_maxsize(MC_16KCL);
1921 mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size);
1922 }
1923
1924 MBUF_INIT(ms, 0, MT_FREE);
1925 if (class == MC_MBUF_16KCL) {
1926 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1927 } else if (class == MC_MBUF_BIGCL) {
1928 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1929 } else {
1930 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1931 }
1932 VERIFY(ms->m_flags == M_EXT);
1933 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1934
1935 *list = (mcache_obj_t *)m;
1936 (*list)->obj_next = NULL;
1937 list = *plist = &(*list)->obj_next;
1938 }
1939
1940 fail:
1941 /*
1942 * Free up what's left of the above.
1943 */
1944 if (mp_list != NULL)
1945 mcache_free_ext(m_cache(MC_MBUF), mp_list);
1946 if (clp_list != NULL)
1947 mcache_free_ext(cp, clp_list);
1948 if (ref_list != NULL)
1949 mcache_free_ext(ref_cache, ref_list);
1950
1951 lck_mtx_lock(mbuf_mlock);
1952 if (num > 0 || cnum > 0) {
1953 m_total(class) += cnum;
1954 VERIFY(m_total(class) <= m_maxlimit(class));
1955 m_alloc_cnt(class) += num + cnum;
1956 }
1957 if ((num + cnum) < want)
1958 m_fail_cnt(class) += (want - (num + cnum));
1959 lck_mtx_unlock(mbuf_mlock);
1960
1961 return (num + cnum);
1962 }
1963
1964 /*
1965 * Common de-allocator for composite objects called by the CPU cache
1966 * layer when one or more elements need to be returned to the appropriate
1967 * global freelist.
1968 */
1969 static void
1970 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
1971 {
1972 mbuf_class_t class = (mbuf_class_t)arg;
1973 unsigned int num;
1974 int w;
1975
1976 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1977
1978 lck_mtx_lock(mbuf_mlock);
1979
1980 num = cslab_free(class, list, purged);
1981 m_free_cnt(class) += num;
1982
1983 if ((w = mb_waiters) > 0)
1984 mb_waiters = 0;
1985
1986 lck_mtx_unlock(mbuf_mlock);
1987
1988 if (w != 0)
1989 wakeup(mb_waitchan);
1990 }
1991
1992 /*
1993 * Common auditor for composite objects called by the CPU cache layer
1994 * during an allocation or free request. For the former, this is called
1995 * after the objects are obtained from either the bucket or slab layer
1996 * and before they are returned to the caller. For the latter, this is
1997 * called immediately during free and before placing the objects into
1998 * the bucket or slab layer.
1999 */
2000 static void
2001 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2002 {
2003 mbuf_class_t class = (mbuf_class_t)arg;
2004 mcache_audit_t *mca;
2005 struct mbuf *m, *ms;
2006 mcl_slab_t *clsp, *nsp;
2007 size_t size;
2008 void *cl;
2009
2010 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2011
2012 while ((m = ms = (struct mbuf *)list) != NULL) {
2013 lck_mtx_lock(mbuf_mlock);
2014 /* Do the mbuf sanity checks and record its transaction */
2015 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2016 mcl_audit_mbuf(mca, m, TRUE, alloc);
2017 mcache_buffer_log(mca, m, m_cache(class));
2018 if (alloc)
2019 mca->mca_uflags |= MB_COMP_INUSE;
2020 else
2021 mca->mca_uflags &= ~MB_COMP_INUSE;
2022
2023 /*
2024 * Use the shadow mbuf in the audit structure if we are
2025 * freeing, since the contents of the actual mbuf have been
2026 * pattern-filled by the above call to mcl_audit_mbuf().
2027 */
2028 if (!alloc)
2029 ms = (struct mbuf *)mca->mca_contents;
2030
2031 /* Do the cluster sanity checks and record its transaction */
2032 cl = ms->m_ext.ext_buf;
2033 clsp = slab_get(cl);
2034 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2035 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2036 VERIFY(clsp->sl_refcnt == 1);
2037 if (class == MC_MBUF_BIGCL) {
2038 nsp = clsp->sl_next;
2039 /* Next slab must already be present */
2040 VERIFY(nsp != NULL);
2041 VERIFY(nsp->sl_refcnt == 1);
2042 } else if (class == MC_MBUF_16KCL) {
2043 int k;
2044 for (nsp = clsp, k = 1;
2045 k < (M16KCLBYTES / MCLBYTES); k++) {
2046 nsp = nsp->sl_next;
2047 /* Next slab must already be present */
2048 VERIFY(nsp != NULL);
2049 VERIFY(nsp->sl_refcnt == 1);
2050 }
2051 }
2052
2053 mca = mcl_audit_buf2mca(MC_CL, cl);
2054 if (class == MC_MBUF_CL)
2055 size = m_maxsize(MC_CL);
2056 else if (class == MC_MBUF_BIGCL)
2057 size = m_maxsize(MC_BIGCL);
2058 else
2059 size = m_maxsize(MC_16KCL);
2060 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2061 mcache_buffer_log(mca, cl, m_cache(class));
2062 if (alloc)
2063 mca->mca_uflags |= MB_COMP_INUSE;
2064 else
2065 mca->mca_uflags &= ~MB_COMP_INUSE;
2066 lck_mtx_unlock(mbuf_mlock);
2067
2068 list = list->obj_next;
2069 }
2070 }
2071
2072 /*
2073 * Allocate some number of mbuf clusters and place them on the cluster freelist.
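 * In the blocking case this returns the number of buffers of the requested
 * class that were added; otherwise (no growth needed, or a non-blocking
 * caller) it returns 1 if at least num buffers are already free and 0 if not.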
2074 */
2075 static int
2076 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2077 {
2078 int i;
2079 vm_size_t size = 0;
2080 int numpages = 0;
2081 vm_offset_t page = 0;
2082 mcache_audit_t *mca_list = NULL;
2083 mcache_obj_t *con_list = NULL;
2084 mcl_slab_t *sp;
2085
2086 VERIFY(bufsize == m_maxsize(MC_CL) ||
2087 bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));
2088
2089 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2090
2091 /*
2092 * Multiple threads may attempt to populate the cluster map one
2093 * after another. Since we drop the lock below prior to acquiring
2094 * the physical page(s), our view of the cluster map may no longer
2095 * be accurate, and we could end up over-committing the pages beyond
2096 * the maximum allowed for each class. To prevent it, this entire
2097 * operation (including the page mapping) is serialized.
2098 */
2099 while (mb_clalloc_busy) {
2100 mb_clalloc_waiters++;
2101 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2102 (PZERO-1), "m_clalloc", NULL);
2103 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2104 }
2105
2106 /* We are busy now; tell everyone else to go away */
2107 mb_clalloc_busy = TRUE;
2108
2109 /*
2110 * Honor the caller's wish to block or not block. We have a way
2111 * to grow the pool asynchronously using the mbuf worker thread.
2112 */
2113 i = m_howmany(num, bufsize);
2114 if (i == 0 || (wait & M_DONTWAIT))
2115 goto out;
2116
2117 lck_mtx_unlock(mbuf_mlock);
2118
2119 size = round_page_32(i * bufsize);
2120 page = kmem_mb_alloc(mb_map, size);
2121
2122 if (page == 0) {
2123 if (bufsize <= m_maxsize(MC_BIGCL)) {
2124 /* Try for 1 page if failed, only for 2KB/4KB request */
2125 size = NBPG;
2126 page = kmem_mb_alloc(mb_map, size);
2127 }
2128
2129 if (page == 0) {
2130 lck_mtx_lock(mbuf_mlock);
2131 goto out;
2132 }
2133 }
2134
2135 VERIFY(IS_P2ALIGNED(page, NBPG));
2136 numpages = size / NBPG;
2137
2138 /* If auditing is enabled, allocate the audit structures now */
2139 if (mclaudit != NULL) {
2140 int needed;
2141
2142 /*
2143 * Yes, I realize this is a waste of memory for clusters
2144 * that never get transformed into mbufs, as we may end
2145 * up with NMBPCL-1 unused audit structures per cluster.
2146 * But doing so tremendously simplifies the allocation
2147 * strategy, since at this point we are not holding the
2148 * mbuf lock and the caller is okay to be blocked. For
2149 * the case of big clusters, we allocate one structure
2150 * for each as we never turn them into mbufs.
2151 */
2152 if (bufsize == m_maxsize(MC_CL)) {
2153 needed = numpages * 2 * NMBPCL;
2154
2155 i = mcache_alloc_ext(mcl_audit_con_cache,
2156 &con_list, needed, MCR_SLEEP);
2157
2158 VERIFY(con_list != NULL && i == needed);
2159 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2160 needed = numpages;
2161 } else {
2162 needed = numpages / (M16KCLBYTES / NBPG);
2163 }
2164
2165 i = mcache_alloc_ext(mcache_audit_cache,
2166 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2167
2168 VERIFY(mca_list != NULL && i == needed);
2169 }
2170
2171 lck_mtx_lock(mbuf_mlock);
2172
2173 for (i = 0; i < numpages; i++, page += NBPG) {
2174 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2175 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2176 (vm_address_t)page);
2177
2178 /*
2179 * If no mapper is available, the following code is a no-op
2180 * and returns the input page; if there is a mapper, the
2181 * appropriate I/O page is returned.
2182 */
2183 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2184 mcl_paddr[offset] = new_page << PGSHIFT;
2185
2186 /* Pattern-fill this fresh page */
2187 if (mclaudit != NULL)
2188 mcache_set_pattern(MCACHE_FREE_PATTERN,
2189 (caddr_t)page, NBPG);
2190
2191 if (bufsize == m_maxsize(MC_CL)) {
2192 union mcluster *mcl = (union mcluster *)page;
2193
2194 /* 1st cluster in the page */
2195 sp = slab_get(mcl);
2196 if (mclaudit != NULL)
2197 mcl_audit_init(mcl, &mca_list, &con_list,
2198 AUDIT_CONTENTS_SIZE, NMBPCL);
2199
2200 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2201 slab_init(sp, MC_CL, SLF_MAPPED,
2202 mcl, mcl, bufsize, 0, 1);
2203
2204 /* Insert this slab */
2205 slab_insert(sp, MC_CL);
2206
2207 /* Update stats now since slab_get() drops the lock */
2208 mbstat.m_clfree = ++m_infree(MC_CL) +
2209 m_infree(MC_MBUF_CL);
2210 mbstat.m_clusters = ++m_total(MC_CL);
2211 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2212
2213 /* 2nd cluster in the page */
2214 sp = slab_get(++mcl);
2215 if (mclaudit != NULL)
2216 mcl_audit_init(mcl, &mca_list, &con_list,
2217 AUDIT_CONTENTS_SIZE, NMBPCL);
2218
2219 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2220 slab_init(sp, MC_CL, SLF_MAPPED,
2221 mcl, mcl, bufsize, 0, 1);
2222
2223 /* Insert this slab */
2224 slab_insert(sp, MC_CL);
2225
2226 /* Update stats now since slab_get() drops the lock */
2227 mbstat.m_clfree = ++m_infree(MC_CL) +
2228 m_infree(MC_MBUF_CL);
2229 mbstat.m_clusters = ++m_total(MC_CL);
2230 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2231 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2232 union mbigcluster *mbc = (union mbigcluster *)page;
2233 mcl_slab_t *nsp;
2234
2235 /* One for the entire page */
2236 sp = slab_get(mbc);
2237 if (mclaudit != NULL)
2238 mcl_audit_init(mbc, &mca_list, NULL, 0, 1);
2239
2240 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2241 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2242 mbc, mbc, bufsize, 0, 1);
2243
2244 /* 2nd cluster's slab is part of the previous one */
2245 nsp = slab_get(((union mcluster *)page) + 1);
2246 slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL,
2247 mbc, NULL, 0, 0, 0);
2248
2249 /* Insert this slab */
2250 slab_insert(sp, MC_BIGCL);
2251
2252 /* Update stats now since slab_get() drops the lock */
2253 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2254 m_infree(MC_MBUF_BIGCL);
2255 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2256 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2257 } else if ((i % (M16KCLBYTES / NBPG)) == 0) {
2258 union m16kcluster *m16kcl = (union m16kcluster *)page;
2259 mcl_slab_t *nsp;
2260 int k;
2261
2262 VERIFY(njcl > 0);
2263 /* One for the entire 16KB */
2264 sp = slab_get(m16kcl);
2265 if (mclaudit != NULL)
2266 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2267
2268 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2269 slab_init(sp, MC_16KCL, SLF_MAPPED,
2270 m16kcl, m16kcl, bufsize, 0, 1);
2271
2272 /* 2nd-8th clusters' slabs are part of the first one */
2273 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
2274 nsp = slab_get(((union mcluster *)page) + k);
2275 VERIFY(nsp->sl_refcnt == 0 &&
2276 nsp->sl_flags == 0);
2277 slab_init(nsp, MC_16KCL,
2278 SLF_MAPPED | SLF_PARTIAL,
2279 m16kcl, NULL, 0, 0, 0);
2280 }
2281
2282 /* Insert this slab */
2283 slab_insert(sp, MC_16KCL);
2284
2285 /* Update stats now since slab_get() drops the lock */
2286 m_infree(MC_16KCL)++;
2287 m_total(MC_16KCL)++;
2288 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2289 }
2290 }
2291 VERIFY(mca_list == NULL && con_list == NULL);
2292
2293 /* We're done; let others enter */
2294 mb_clalloc_busy = FALSE;
2295 if (mb_clalloc_waiters > 0) {
2296 mb_clalloc_waiters = 0;
2297 wakeup(mb_clalloc_waitchan);
2298 }
2299
2300 if (bufsize == m_maxsize(MC_CL))
2301 return (numpages << 1);
2302 else if (bufsize == m_maxsize(MC_BIGCL))
2303 return (numpages);
2304
2305 VERIFY(bufsize == m_maxsize(MC_16KCL));
2306 return (numpages / (M16KCLBYTES / NBPG));
2307
2308 out:
2309 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2310
2311 /* We're done; let others enter */
2312 mb_clalloc_busy = FALSE;
2313 if (mb_clalloc_waiters > 0) {
2314 mb_clalloc_waiters = 0;
2315 wakeup(mb_clalloc_waitchan);
2316 }
2317
2318 /*
2319 * When non-blocking, we kick the worker thread if we have to grow
2320 * the pool or if the number of free clusters is less than requested.
2321 */
2322 if (bufsize == m_maxsize(MC_CL)) {
2323 if (i > 0) {
2324 /*
2325 * Remember total number of clusters needed
2326 * at this time.
2327 */
2328 i += m_total(MC_CL);
2329 if (i > mbuf_expand_mcl) {
2330 mbuf_expand_mcl = i;
2331 if (mbuf_worker_ready)
2332 wakeup((caddr_t)&mbuf_worker_run);
2333 }
2334 }
2335
2336 if (m_infree(MC_CL) >= num)
2337 return (1);
2338 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2339 if (i > 0) {
2340 /*
2341 * Remember total number of 4KB clusters needed
2342 * at this time.
2343 */
2344 i += m_total(MC_BIGCL);
2345 if (i > mbuf_expand_big) {
2346 mbuf_expand_big = i;
2347 if (mbuf_worker_ready)
2348 wakeup((caddr_t)&mbuf_worker_run);
2349 }
2350 }
2351
2352 if (m_infree(MC_BIGCL) >= num)
2353 return (1);
2354 } else {
2355 if (i > 0) {
2356 /*
2357 * Remember total number of 16KB clusters needed
2358 * at this time.
2359 */
2360 i += m_total(MC_16KCL);
2361 if (i > mbuf_expand_16k) {
2362 mbuf_expand_16k = i;
2363 if (mbuf_worker_ready)
2364 wakeup((caddr_t)&mbuf_worker_run);
2365 }
2366 }
2367
2368 if (m_infree(MC_16KCL) >= num)
2369 return (1);
2370 }
2371 return (0);
2372 }
2373
2374 /*
2375 * Populate the global freelist of the corresponding buffer class.
2376 */
2377 static int
2378 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2379 {
2380 mcache_obj_t *o = NULL;
2381 int i;
2382
2383 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2384 class == MC_16KCL);
2385
2386 #if CONFIG_MBUF_NOEXPAND
2387 if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) {
2388 #if DEBUG
2389 static int printonce = 1;
2390 if (printonce == 1) {
2391 printonce = 0;
2392 printf("m_expand failed, allocated %ld out of %d "
2393 "clusters\n", mbstat.m_mbufs / NMBPCL,
2394 nmbclusters);
2395 }
2396 #endif /* DEBUG */
2397 return (0);
2398 }
2399 #endif /* CONFIG_MBUF_NOEXPAND */
2400
2401 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2402
2403 switch (class) {
2404 case MC_MBUF:
2405 case MC_CL:
2406 i = m_clalloc(num, wait, m_maxsize(MC_CL));
2407
2408 /* Respect the 2K clusters minimum limit */
2409 if (m_total(MC_CL) == m_maxlimit(MC_CL) &&
2410 m_infree(MC_CL) <= m_minlimit(MC_CL)) {
2411 if (class != MC_CL || (wait & MCR_COMP))
2412 return (0);
2413 }
2414 if (class == MC_CL)
2415 return (i != 0);
2416 break;
2417
2418 case MC_BIGCL:
2419 case MC_16KCL:
2420 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2421 /* NOTREACHED */
2422
2423 default:
2424 VERIFY(0);
2425 /* NOTREACHED */
2426 }
2427
2428 /* Steal a cluster and cut it up to create NMBPCL mbufs */
2429 if ((o = slab_alloc(MC_CL, wait)) != NULL) {
2430 struct mbuf *m = (struct mbuf *)o;
2431 mcache_audit_t *mca = NULL;
2432 mcl_slab_t *sp = slab_get(o);
2433
2434 VERIFY(slab_is_detached(sp) &&
2435 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2436
2437 /* Make sure that the cluster is unmolested while in freelist */
2438 if (mclaudit != NULL) {
2439 mca = mcl_audit_buf2mca(MC_CL, o);
2440 mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL));
2441 }
2442
2443 /* Reinitialize it as an mbuf slab */
2444 slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL,
2445 sp->sl_len, 0, NMBPCL);
2446
2447 VERIFY(m == (struct mbuf *)sp->sl_base);
2448 VERIFY(sp->sl_head == NULL);
2449
2450 m_total(MC_MBUF) += NMBPCL;
2451 mbstat.m_mbufs = m_total(MC_MBUF);
2452 m_infree(MC_MBUF) += NMBPCL;
2453 mtype_stat_add(MT_FREE, NMBPCL);
2454
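/*
 * Thread the NMBPCL mbufs carved out of this cluster onto the slab's
 * free list (sl_head), marking each one MT_FREE; when auditing is
 * enabled, the type is set in the shadow mbuf instead.
 */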
2455 i = NMBPCL;
2456 while (i--) {
2457 /*
2458 * If auditing is enabled, construct the shadow mbuf
2459 * in the audit structure instead of the actual one.
2460 * mbuf_slab_audit() will take care of restoring the
2461 * contents after the integrity check.
2462 */
2463 if (mclaudit != NULL) {
2464 struct mbuf *ms;
2465 mca = mcl_audit_buf2mca(MC_MBUF,
2466 (mcache_obj_t *)m);
2467 ms = ((struct mbuf *)mca->mca_contents);
2468 ms->m_type = MT_FREE;
2469 } else {
2470 m->m_type = MT_FREE;
2471 }
2472 m->m_next = sp->sl_head;
2473 sp->sl_head = (void *)m++;
2474 }
2475
2476 /* Insert it into the mbuf class's slab list */
2477 slab_insert(sp, MC_MBUF);
2478
2479 if ((i = mb_waiters) > 0)
2480 mb_waiters = 0;
2481 if (i != 0)
2482 wakeup(mb_waitchan);
2483
2484 return (1);
2485 }
2486
2487 return (0);
2488 }
2489
2490 /*
2491 * (Inaccurately) check if it might be worth a trip back to the
2492 * mcache layer due to the availability of objects there. We'll
2493 * end up back here if there's nothing up there.
2494 */
2495 static boolean_t
2496 mbuf_cached_above(mbuf_class_t class, int wait)
2497 {
2498 switch (class) {
2499 case MC_MBUF:
2500 if (wait & MCR_COMP)
2501 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2502 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2503 break;
2504
2505 case MC_CL:
2506 if (wait & MCR_COMP)
2507 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
2508 break;
2509
2510 case MC_BIGCL:
2511 if (wait & MCR_COMP)
2512 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2513 break;
2514
2515 case MC_16KCL:
2516 if (wait & MCR_COMP)
2517 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
2518 break;
2519
2520 case MC_MBUF_CL:
2521 case MC_MBUF_BIGCL:
2522 case MC_MBUF_16KCL:
2523 break;
2524
2525 default:
2526 VERIFY(0);
2527 /* NOTREACHED */
2528 }
2529
2530 return (!mcache_bkt_isempty(m_cache(class)));
2531 }
2532
2533 /*
2534 * If possible, convert constructed objects to raw ones.
2535 */
2536 static boolean_t
2537 mbuf_steal(mbuf_class_t class, unsigned int num)
2538 {
2539 mcache_obj_t *top = NULL;
2540 mcache_obj_t **list = &top;
2541 unsigned int tot = 0;
2542
2543 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2544
2545 switch (class) {
2546 case MC_MBUF:
2547 case MC_CL:
2548 case MC_BIGCL:
2549 case MC_16KCL:
2550 return (FALSE);
2551
2552 case MC_MBUF_CL:
2553 case MC_MBUF_BIGCL:
2554 case MC_MBUF_16KCL:
2555 /* Get the required number of constructed objects if possible */
2556 if (m_infree(class) > m_minlimit(class)) {
2557 tot = cslab_alloc(class, &list,
2558 MIN(num, m_infree(class)));
2559 }
2560
2561 /* And destroy them to get back the raw objects */
2562 if (top != NULL)
2563 (void) cslab_free(class, top, 1);
2564 break;
2565
2566 default:
2567 VERIFY(0);
2568 /* NOTREACHED */
2569 }
2570
2571 return (tot == num);
2572 }
2573
2574 static void
2575 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
2576 {
2577 int m, bmap = 0;
2578
2579 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2580
2581 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2582 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2583 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2584
2585 /*
2586 * This logic can be made smarter; for now, simply mark
2587 * all other related classes as potential victims.
2588 */
2589 switch (class) {
2590 case MC_MBUF:
2591 m_wantpurge(MC_CL)++;
2592 m_wantpurge(MC_MBUF_CL)++;
2593 m_wantpurge(MC_MBUF_BIGCL)++;
2594 break;
2595
2596 case MC_CL:
2597 m_wantpurge(MC_MBUF)++;
2598 if (!comp)
2599 m_wantpurge(MC_MBUF_CL)++;
2600 break;
2601
2602 case MC_BIGCL:
2603 if (!comp)
2604 m_wantpurge(MC_MBUF_BIGCL)++;
2605 break;
2606
2607 case MC_16KCL:
2608 if (!comp)
2609 m_wantpurge(MC_MBUF_16KCL)++;
2610 break;
2611
2612 default:
2613 VERIFY(0);
2614 /* NOTREACHED */
2615 }
2616
2617 /*
2618 * Run through each marked class and check if we really need to
2619 * purge (and therefore temporarily disable) the per-CPU caches
2620 * layer used by the class. If so, remember the classes since
2621 * we are going to drop the lock below prior to purging.
2622 */
2623 for (m = 0; m < NELEM(mbuf_table); m++) {
2624 if (m_wantpurge(m) > 0) {
2625 m_wantpurge(m) = 0;
2626 /*
2627 * Try hard to steal the required number of objects
2628 * from the freelist of other mbuf classes. Only
2629 * purge and disable the per-CPU caches layer when
2630 * we don't have enough; it's the last resort.
2631 */
2632 if (!mbuf_steal(m, num))
2633 bmap |= (1 << m);
2634 }
2635 }
2636
2637 lck_mtx_unlock(mbuf_mlock);
2638
2639 if (bmap != 0) {
2640 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2641 do_reclaim = 1;
2642
2643 /* Sigh; we have no other choices but to ask mcache to purge */
2644 for (m = 0; m < NELEM(mbuf_table); m++) {
2645 if ((bmap & (1 << m)) &&
2646 mcache_purge_cache(m_cache(m))) {
2647 lck_mtx_lock(mbuf_mlock);
2648 m_purge_cnt(m)++;
2649 mbstat.m_drain++;
2650 lck_mtx_unlock(mbuf_mlock);
2651 }
2652 }
2653 } else {
2654 /*
2655 * Request mcache to reap extra elements from all of its caches;
2656 * note that all reaps are serialized and happen only at a fixed
2657 * interval.
2658 */
2659 mcache_reap();
2660 }
2661 lck_mtx_lock(mbuf_mlock);
2662 }
2663
2664 static inline struct mbuf *
2665 m_get_common(int wait, short type, int hdr)
2666 {
2667 struct mbuf *m;
2668 int mcflags = MSLEEPF(wait);
2669
2670 /* Is this due to a non-blocking retry? If so, then try harder */
2671 if (mcflags & MCR_NOSLEEP)
2672 mcflags |= MCR_TRYHARD;
2673
2674 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
2675 if (m != NULL) {
2676 MBUF_INIT(m, hdr, type);
2677 mtype_stat_inc(type);
2678 mtype_stat_dec(MT_FREE);
2679 #if CONFIG_MACF_NET
2680 if (hdr && mac_init_mbuf(m, wait) != 0) {
2681 m_free(m);
2682 return (NULL);
2683 }
2684 #endif /* MAC_NET */
2685 }
2686 return (m);
2687 }
2688
2689 /*
2690 * Space allocation routines; these are also available as macros
2691 * for critical paths.
2692 */
2693 #define _M_GET(wait, type) m_get_common(wait, type, 0)
2694 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
2695 #define _M_RETRY(wait, type) _M_GET(wait, type)
2696 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
2697 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
2698 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
2699
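/*
 * Illustrative sketch of typical usage (not part of the build): grab a
 * packet-header mbuf without blocking, and release it with m_free();
 * up to MHLEN bytes of data fit in the mbuf itself.
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m->m_len = m->m_pkthdr.len = 0;
 *		(void) m_free(m);
 *	}
 *
 * A NULL return here only means a non-blocking request could not be
 * satisfied; an M_WAIT caller may block until memory becomes available.
 */
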
2700 struct mbuf *
2701 m_get(int wait, int type)
2702 {
2703 return (_M_GET(wait, type));
2704 }
2705
2706 struct mbuf *
2707 m_gethdr(int wait, int type)
2708 {
2709 return (_M_GETHDR(wait, type));
2710 }
2711
2712 struct mbuf *
2713 m_retry(int wait, int type)
2714 {
2715 return (_M_RETRY(wait, type));
2716 }
2717
2718 struct mbuf *
2719 m_retryhdr(int wait, int type)
2720 {
2721 return (_M_RETRYHDR(wait, type));
2722 }
2723
2724 struct mbuf *
2725 m_getclr(int wait, int type)
2726 {
2727 struct mbuf *m;
2728
2729 _MGET(m, wait, type);
2730 if (m != NULL)
2731 bzero(MTOD(m, caddr_t), MLEN);
2732 return (m);
2733 }
2734
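/*
 * Free a single mbuf, releasing its cluster (or dropping a reference on a
 * shared one) if M_EXT is set; returns the next mbuf in the m_next chain.
 */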
2735 struct mbuf *
2736 m_free(struct mbuf *m)
2737 {
2738 struct mbuf *n = m->m_next;
2739
2740 if (m->m_type == MT_FREE)
2741 panic("m_free: freeing an already freed mbuf");
2742
2743 /* Free the aux data and tags if there are any */
2744 if (m->m_flags & M_PKTHDR) {
2745 m_tag_delete_chain(m, NULL);
2746 }
2747
2748 if (m->m_flags & M_EXT) {
2749 u_int32_t refcnt;
2750 u_int32_t flags;
2751
2752 refcnt = m_decref(m);
2753 flags = MEXT_FLAGS(m);
2754 if (refcnt == 0 && flags == 0) {
2755 if (m->m_ext.ext_free == NULL) {
2756 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2757 } else if (m->m_ext.ext_free == m_bigfree) {
2758 mcache_free(m_cache(MC_BIGCL),
2759 m->m_ext.ext_buf);
2760 } else if (m->m_ext.ext_free == m_16kfree) {
2761 mcache_free(m_cache(MC_16KCL),
2762 m->m_ext.ext_buf);
2763 } else {
2764 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2765 m->m_ext.ext_size, m->m_ext.ext_arg);
2766 }
2767 mcache_free(ref_cache, MEXT_RFA(m));
2768 MEXT_RFA(m) = NULL;
2769 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2770 VERIFY(m->m_type != MT_FREE);
2771
2772 mtype_stat_dec(m->m_type);
2773 mtype_stat_inc(MT_FREE);
2774
2775 m->m_type = MT_FREE;
2776 m->m_flags = M_EXT;
2777 m->m_len = 0;
2778 m->m_next = m->m_nextpkt = NULL;
2779
2780 /* "Free" into the intermediate cache */
2781 if (m->m_ext.ext_free == NULL) {
2782 mcache_free(m_cache(MC_MBUF_CL), m);
2783 } else if (m->m_ext.ext_free == m_bigfree) {
2784 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2785 } else {
2786 VERIFY(m->m_ext.ext_free == m_16kfree);
2787 mcache_free(m_cache(MC_MBUF_16KCL), m);
2788 }
2789 return (n);
2790 }
2791 }
2792
2793 if (m->m_type != MT_FREE) {
2794 mtype_stat_dec(m->m_type);
2795 mtype_stat_inc(MT_FREE);
2796 }
2797
2798 m->m_type = MT_FREE;
2799 m->m_flags = m->m_len = 0;
2800 m->m_next = m->m_nextpkt = NULL;
2801
2802 mcache_free(m_cache(MC_MBUF), m);
2803
2804 return (n);
2805 }
2806
2807 __private_extern__ struct mbuf *
2808 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
2809 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
2810 int wait)
2811 {
2812 struct ext_ref *rfa = NULL;
2813
2814 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
2815 return (NULL);
2816
2817 if (m->m_flags & M_EXT) {
2818 u_int32_t refcnt;
2819 u_int32_t flags;
2820
2821 refcnt = m_decref(m);
2822 flags = MEXT_FLAGS(m);
2823 if (refcnt == 0 && flags == 0) {
2824 if (m->m_ext.ext_free == NULL) {
2825 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2826 } else if (m->m_ext.ext_free == m_bigfree) {
2827 mcache_free(m_cache(MC_BIGCL),
2828 m->m_ext.ext_buf);
2829 } else if (m->m_ext.ext_free == m_16kfree) {
2830 mcache_free(m_cache(MC_16KCL),
2831 m->m_ext.ext_buf);
2832 } else {
2833 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2834 m->m_ext.ext_size, m->m_ext.ext_arg);
2835 }
2836 /* Re-use the reference structure */
2837 rfa = MEXT_RFA(m);
2838 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2839 VERIFY(m->m_type != MT_FREE);
2840
2841 mtype_stat_dec(m->m_type);
2842 mtype_stat_inc(MT_FREE);
2843
2844 m->m_type = MT_FREE;
2845 m->m_flags = M_EXT;
2846 m->m_len = 0;
2847 m->m_next = m->m_nextpkt = NULL;
2848 /* "Free" into the intermediate cache */
2849 if (m->m_ext.ext_free == NULL) {
2850 mcache_free(m_cache(MC_MBUF_CL), m);
2851 } else if (m->m_ext.ext_free == m_bigfree) {
2852 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2853 } else {
2854 VERIFY(m->m_ext.ext_free == m_16kfree);
2855 mcache_free(m_cache(MC_MBUF_16KCL), m);
2856 }
2857 /*
2858 * Allocate a new mbuf, since we didn't divorce
2859 * the composite mbuf + cluster pair above.
2860 */
2861 if ((m = _M_GETHDR(wait, type)) == NULL)
2862 return (NULL);
2863 }
2864 }
2865
2866 if (rfa == NULL &&
2867 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
2868 m_free(m);
2869 return (NULL);
2870 }
2871
2872 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
2873
2874 return (m);
2875 }
2876
2877 /* m_mclget() adds an mbuf cluster to a normal mbuf */
2878 struct mbuf *
2879 m_mclget(struct mbuf *m, int wait)
2880 {
2881 struct ext_ref *rfa;
2882
2883 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2884 return (m);
2885
2886 m->m_ext.ext_buf = m_mclalloc(wait);
2887 if (m->m_ext.ext_buf != NULL) {
2888 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2889 } else {
2890 mcache_free(ref_cache, rfa);
2891 }
2892 return (m);
2893 }
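/*
 * Illustrative sketch (not part of the build): m_mclget() returns the mbuf
 * whether or not a cluster could be attached, so callers conventionally
 * test M_EXT afterwards:
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m = m_mclget(m, M_DONTWAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);
 *			m = NULL;
 *		}
 *	}
 */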
2894
2895 /* Allocate an mbuf cluster */
2896 caddr_t
2897 m_mclalloc(int wait)
2898 {
2899 int mcflags = MSLEEPF(wait);
2900
2901 /* Is this due to a non-blocking retry? If so, then try harder */
2902 if (mcflags & MCR_NOSLEEP)
2903 mcflags |= MCR_TRYHARD;
2904
2905 return (mcache_alloc(m_cache(MC_CL), mcflags));
2906 }
2907
2908 /* Free an mbuf cluster */
2909 void
2910 m_mclfree(caddr_t p)
2911 {
2912 mcache_free(m_cache(MC_CL), p);
2913 }
2914
2915 /*
2916 * m_mclhasreference() checks whether the cluster of an mbuf is
2917 * referenced by another mbuf.
2918 */
2919 int
2920 m_mclhasreference(struct mbuf *m)
2921 {
2922 if (!(m->m_flags & M_EXT))
2923 return (0);
2924
2925 ASSERT(MEXT_RFA(m) != NULL);
2926
2927 return (MEXT_REF(m) > 1);
2928 }
2929
2930 __private_extern__ caddr_t
2931 m_bigalloc(int wait)
2932 {
2933 int mcflags = MSLEEPF(wait);
2934
2935 /* Is this due to a non-blocking retry? If so, then try harder */
2936 if (mcflags & MCR_NOSLEEP)
2937 mcflags |= MCR_TRYHARD;
2938
2939 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
2940 }
2941
2942 __private_extern__ void
2943 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
2944 {
2945 mcache_free(m_cache(MC_BIGCL), p);
2946 }
2947
2948 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
2949 __private_extern__ struct mbuf *
2950 m_mbigget(struct mbuf *m, int wait)
2951 {
2952 struct ext_ref *rfa;
2953
2954 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2955 return (m);
2956
2957 m->m_ext.ext_buf = m_bigalloc(wait);
2958 if (m->m_ext.ext_buf != NULL) {
2959 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2960 } else {
2961 mcache_free(ref_cache, rfa);
2962 }
2963 return (m);
2964 }
2965
2966 __private_extern__ caddr_t
2967 m_16kalloc(int wait)
2968 {
2969 int mcflags = MSLEEPF(wait);
2970
2971 /* Is this due to a non-blocking retry? If so, then try harder */
2972 if (mcflags & MCR_NOSLEEP)
2973 mcflags |= MCR_TRYHARD;
2974
2975 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
2976 }
2977
2978 __private_extern__ void
2979 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
2980 {
2981 mcache_free(m_cache(MC_16KCL), p);
2982 }
2983
2984 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
2985 __private_extern__ struct mbuf *
2986 m_m16kget(struct mbuf *m, int wait)
2987 {
2988 struct ext_ref *rfa;
2989
2990 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2991 return (m);
2992
2993 m->m_ext.ext_buf = m_16kalloc(wait);
2994 if (m->m_ext.ext_buf != NULL) {
2995 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2996 } else {
2997 mcache_free(ref_cache, rfa);
2998 }
2999 return (m);
3000 }
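/*
 * As with m_mclget() above, m_mbigget() and m_m16kget() return the mbuf
 * even when no cluster could be attached; callers should test M_EXT in
 * m_flags to detect that case.
 */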
3001
3002 /* Move the packet header (and its tags) from "from" to "to". */
3003 void
3004 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3005 {
3006 #if CONFIG_MACF_NET
3007 /* We will be taking over the tags of 'to' */
3008 if (to->m_flags & M_PKTHDR)
3009 m_tag_delete_chain(to, NULL);
3010 #endif /* MAC_NET */
3011 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3012 m_tag_init(from); /* purge tags from src */
3013 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3014 if ((to->m_flags & M_EXT) == 0)
3015 to->m_data = to->m_pktdat;
3016 }
3017
3018 /*
3019 * Duplicate "from"'s mbuf pkthdr in "to".
3020 * "from" must have M_PKTHDR set, and "to" must be empty.
3021 * In particular, this does a deep copy of the packet tags.
3022 */
3023 static int
3024 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3025 {
3026 #if CONFIG_MACF_NET
3027 if (to->m_flags & M_PKTHDR)
3028 m_tag_delete_chain(to, NULL);
3029 #endif /* MAC_NET */
3030 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3031 if ((to->m_flags & M_EXT) == 0)
3032 to->m_data = to->m_pktdat;
3033 to->m_pkthdr = from->m_pkthdr;
3034 m_tag_init(to);
3035 return (m_tag_copy_chain(to, from, how));
3036 }
3037
3038 /*
3039 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3040 * if wantall is not set, return whatever number is available. Set up the
3041 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3042 * are chained on the m_nextpkt field. Any packets requested beyond this
3043 * are chained onto the last packet header's m_next field. The size of
3044 * the cluster is controlled by the parameter bufsize.
3045 */
3046 __private_extern__ struct mbuf *
3047 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3048 int wait, int wantall, size_t bufsize)
3049 {
3050 struct mbuf *m;
3051 struct mbuf **np, *top;
3052 unsigned int pnum, needed = *num_needed;
3053 mcache_obj_t *mp_list = NULL;
3054 int mcflags = MSLEEPF(wait);
3055 u_int32_t flag;
3056 struct ext_ref *rfa;
3057 mcache_t *cp;
3058 void *cl;
3059
3060 ASSERT(bufsize == m_maxsize(MC_CL) ||
3061 bufsize == m_maxsize(MC_BIGCL) ||
3062 bufsize == m_maxsize(MC_16KCL));
3063
3064 /*
3065 * Caller must first check for njcl because this
3066 * routine is internal and not exposed/used via KPI.
3067 */
3068 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3069
3070 top = NULL;
3071 np = &top;
3072 pnum = 0;
3073
3074 /*
3075 * The caller doesn't want all the requested buffers; only some.
3076 * Try hard to get what we can, but don't block. This effectively
3077 * overrides MCR_SLEEP, since this thread will not go to sleep
3078 * if we can't get all the buffers.
3079 */
3080 if (!wantall || (mcflags & MCR_NOSLEEP))
3081 mcflags |= MCR_TRYHARD;
3082
3083 /* Allocate the composite mbuf + cluster elements from the cache */
3084 if (bufsize == m_maxsize(MC_CL))
3085 cp = m_cache(MC_MBUF_CL);
3086 else if (bufsize == m_maxsize(MC_BIGCL))
3087 cp = m_cache(MC_MBUF_BIGCL);
3088 else
3089 cp = m_cache(MC_MBUF_16KCL);
3090 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3091
3092 for (pnum = 0; pnum < needed; pnum++) {
3093 m = (struct mbuf *)mp_list;
3094 mp_list = mp_list->obj_next;
3095
3096 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3097 cl = m->m_ext.ext_buf;
3098 rfa = MEXT_RFA(m);
3099
3100 ASSERT(cl != NULL && rfa != NULL);
3101 VERIFY(MBUF_IS_COMPOSITE(m));
3102
3103 flag = MEXT_FLAGS(m);
3104
3105 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3106 if (bufsize == m_maxsize(MC_16KCL)) {
3107 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3108 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3109 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3110 } else {
3111 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3112 }
3113
3114 if (num_with_pkthdrs > 0) {
3115 --num_with_pkthdrs;
3116 #if CONFIG_MACF_NET
3117 if (mac_mbuf_label_init(m, wait) != 0) {
3118 m_free(m);
3119 break;
3120 }
3121 #endif /* MAC_NET */
3122 }
3123
3124 *np = m;
3125 if (num_with_pkthdrs > 0)
3126 np = &m->m_nextpkt;
3127 else
3128 np = &m->m_next;
3129 }
3130 ASSERT(pnum != *num_needed || mp_list == NULL);
3131 if (mp_list != NULL)
3132 mcache_free_ext(cp, mp_list);
3133
3134 if (pnum > 0) {
3135 mtype_stat_add(MT_DATA, pnum);
3136 mtype_stat_sub(MT_FREE, pnum);
3137 }
3138
3139 if (wantall && (pnum != *num_needed)) {
3140 if (top != NULL)
3141 m_freem_list(top);
3142 return (NULL);
3143 }
3144
3145 *num_needed = pnum;
3146 return (top);
3147 }
3148
3149 /*
3150 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3151 * wantall is not set, return whatever number is available. The size of
3152 * each mbuf in the list is controlled by the parameter packetlen. Each
3153 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3154 * in the chain is called a segment. If maxsegments is not NULL and the
3155 * value pointed to is not zero, it specifies the maximum number of segments
3156 * for a chain of mbufs. If maxsegments is NULL or the value pointed to
3157 * is zero, the caller has no restriction on the number of segments.
3158 * The actual number of segments of an mbuf chain is returned in the value
3159 * pointed to by maxsegments.
3160 */
3161 __private_extern__ struct mbuf *
3162 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3163 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3164 {
3165 struct mbuf **np, *top, *first = NULL;
3166 size_t bufsize, r_bufsize;
3167 unsigned int num = 0;
3168 unsigned int nsegs = 0;
3169 unsigned int needed, resid;
3170 int mcflags = MSLEEPF(wait);
3171 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3172 mcache_t *cp = NULL, *rcp = NULL;
3173
3174 if (*numlist == 0)
3175 return (NULL);
3176
3177 top = NULL;
3178 np = &top;
3179
3180 if (wantsize == 0) {
3181 if (packetlen <= MINCLSIZE) {
3182 bufsize = packetlen;
3183 } else if (packetlen > m_maxsize(MC_CL)) {
3184 /* Use 4KB if jumbo cluster pool isn't available */
3185 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3186 bufsize = m_maxsize(MC_BIGCL);
3187 else
3188 bufsize = m_maxsize(MC_16KCL);
3189 } else {
3190 bufsize = m_maxsize(MC_CL);
3191 }
3192 } else if (wantsize == m_maxsize(MC_CL) ||
3193 wantsize == m_maxsize(MC_BIGCL) ||
3194 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3195 bufsize = wantsize;
3196 } else {
3197 return (NULL);
3198 }
3199
3200 if (bufsize <= MHLEN) {
3201 nsegs = 1;
3202 } else if (bufsize <= MINCLSIZE) {
3203 if (maxsegments != NULL && *maxsegments == 1) {
3204 bufsize = m_maxsize(MC_CL);
3205 nsegs = 1;
3206 } else {
3207 nsegs = 2;
3208 }
3209 } else if (bufsize == m_maxsize(MC_16KCL)) {
3210 VERIFY(njcl > 0);
3211 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3212 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3213 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3214 } else {
3215 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3216 }
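/*
 * For the cluster-backed cases above, nsegs works out to the number of
 * bufsize-sized clusters needed to cover packetlen, i.e. packetlen
 * divided by the chosen cluster size and rounded up.
 */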
3217 if (maxsegments != NULL) {
3218 if (*maxsegments && nsegs > *maxsegments) {
3219 *maxsegments = nsegs;
3220 return (NULL);
3221 }
3222 *maxsegments = nsegs;
3223 }
3224
3225 /*
3226 * The caller doesn't want all the requested buffers; only some.
3227 * Try hard to get what we can, but don't block. This effectively
3228 * overrides MCR_SLEEP, since this thread will not go to sleep
3229 * if we can't get all the buffers.
3230 */
3231 if (!wantall || (mcflags & MCR_NOSLEEP))
3232 mcflags |= MCR_TRYHARD;
3233
3234 /*
3235 * Simple case where all elements in the lists/chains are mbufs.
3236 * Unless bufsize is greater than MHLEN, each segment chain is made
3237 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3238 * of 2 mbufs; the second one is used for the residual data, i.e.
3239 * the remaining data that cannot fit into the first mbuf.
3240 */
3241 if (bufsize <= MINCLSIZE) {
3242 /* Allocate the elements in one shot from the mbuf cache */
3243 ASSERT(bufsize <= MHLEN || nsegs == 2);
3244 cp = m_cache(MC_MBUF);
3245 needed = mcache_alloc_ext(cp, &mp_list,
3246 (*numlist) * nsegs, mcflags);
3247
3248 /*
3249 * The number of elements must be even if we are to use an
3250 * mbuf (instead of a cluster) to store the residual data.
3251 * If we couldn't allocate the requested number of mbufs,
3252 * trim the number down (if it's odd) in order to avoid
3253 * creating a partial segment chain.
3254 */
3255 if (bufsize > MHLEN && (needed & 0x1))
3256 needed--;
3257
3258 while (num < needed) {
3259 struct mbuf *m;
3260
3261 m = (struct mbuf *)mp_list;
3262 mp_list = mp_list->obj_next;
3263 ASSERT(m != NULL);
3264
3265 MBUF_INIT(m, 1, MT_DATA);
3266 #if CONFIG_MACF_NET
3267 if (mac_init_mbuf(m, wait) != 0) {
3268 m_free(m);
3269 break;
3270 }
3271 #endif /* MAC_NET */
3272 num++;
3273 if (bufsize > MHLEN) {
3274 /* A second mbuf for this segment chain */
3275 m->m_next = (struct mbuf *)mp_list;
3276 mp_list = mp_list->obj_next;
3277 ASSERT(m->m_next != NULL);
3278
3279 MBUF_INIT(m->m_next, 0, MT_DATA);
3280 num++;
3281 }
3282 *np = m;
3283 np = &m->m_nextpkt;
3284 }
3285 ASSERT(num != *numlist || mp_list == NULL);
3286
3287 if (num > 0) {
3288 mtype_stat_add(MT_DATA, num);
3289 mtype_stat_sub(MT_FREE, num);
3290 }
3291 num /= nsegs;
3292
3293 /* We've got them all; return to caller */
3294 if (num == *numlist)
3295 return (top);
3296
3297 goto fail;
3298 }
3299
3300 /*
3301 * Complex cases where elements are made up of one or more composite
3302 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3303 * be illustrated as follows:
3304 *
3305 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3306 *
3307 * Every composite mbuf + cluster element comes from the intermediate
3308 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3309 * the last composite element will come from the MC_MBUF_CL cache,
3310 * unless the residual data is larger than 2KB where we use the
3311 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3312 * data is defined as extra data beyond the first element that cannot
3313 * fit into the previous element, i.e. there is no residual data if
3314 * the chain only has 1 segment.
3315 */
3316 r_bufsize = bufsize;
3317 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3318 if (resid > 0) {
3319 /* There is residual data; figure out the cluster size */
3320 if (wantsize == 0 && packetlen > MINCLSIZE) {
3321 /*
3322 * Caller didn't request that all of the segments
3323 * in the chain use the same cluster size; use the
3324 * smaller of the cluster sizes.
3325 */
3326 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3327 r_bufsize = m_maxsize(MC_16KCL);
3328 else if (resid > m_maxsize(MC_CL))
3329 r_bufsize = m_maxsize(MC_BIGCL);
3330 else
3331 r_bufsize = m_maxsize(MC_CL);
3332 } else {
3333 /* Use the same cluster size as the other segments */
3334 resid = 0;
3335 }
3336 }
3337
3338 needed = *numlist;
3339 if (resid > 0) {
3340 /*
3341 * Attempt to allocate composite mbuf + cluster elements for
3342 * the residual data in each chain; record the number of such
3343 * elements that can be allocated so that we know how many
3344 * segment chains we can afford to create.
3345 */
3346 if (r_bufsize <= m_maxsize(MC_CL))
3347 rcp = m_cache(MC_MBUF_CL);
3348 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3349 rcp = m_cache(MC_MBUF_BIGCL);
3350 else
3351 rcp = m_cache(MC_MBUF_16KCL);
3352 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3353
3354 if (needed == 0)
3355 goto fail;
3356
3357 /* This is temporarily reduced for calculation */
3358 ASSERT(nsegs > 1);
3359 nsegs--;
3360 }
3361
3362 /*
3363 * Attempt to allocate the rest of the composite mbuf + cluster
3364 * elements for the number of segment chains that we need.
3365 */
3366 if (bufsize <= m_maxsize(MC_CL))
3367 cp = m_cache(MC_MBUF_CL);
3368 else if (bufsize <= m_maxsize(MC_BIGCL))
3369 cp = m_cache(MC_MBUF_BIGCL);
3370 else
3371 cp = m_cache(MC_MBUF_16KCL);
3372 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3373
3374 /* Round it down to avoid creating a partial segment chain */
3375 needed = (needed / nsegs) * nsegs;
3376 if (needed == 0)
3377 goto fail;
3378
3379 if (resid > 0) {
3380 /*
3381 * We're about to construct the chain(s); take into account
3382 * the number of segments we have created above to hold the
3383 * residual data for each chain, as well as restore the
3384 * original count of segments per chain.
3385 */
3386 ASSERT(nsegs > 0);
3387 needed += needed / nsegs;
3388 nsegs++;
3389 }
3390
3391 for (;;) {
3392 struct mbuf *m;
3393 u_int32_t flag;
3394 struct ext_ref *rfa;
3395 void *cl;
3396 int pkthdr;
3397
3398 ++num;
3399 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3400 m = (struct mbuf *)mp_list;
3401 mp_list = mp_list->obj_next;
3402 } else {
3403 m = (struct mbuf *)rmp_list;
3404 rmp_list = rmp_list->obj_next;
3405 }
3406 ASSERT(m != NULL);
3407 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3408 VERIFY(m->m_ext.ext_free == NULL ||
3409 m->m_ext.ext_free == m_bigfree ||
3410 m->m_ext.ext_free == m_16kfree);
3411
3412 cl = m->m_ext.ext_buf;
3413 rfa = MEXT_RFA(m);
3414
3415 ASSERT(cl != NULL && rfa != NULL);
3416 VERIFY(MBUF_IS_COMPOSITE(m));
3417
3418 flag = MEXT_FLAGS(m);
3419
3420 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3421 if (pkthdr)
3422 first = m;
3423 MBUF_INIT(m, pkthdr, MT_DATA);
3424 if (m->m_ext.ext_free == m_16kfree) {
3425 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3426 } else if (m->m_ext.ext_free == m_bigfree) {
3427 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3428 } else {
3429 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3430 }
3431 #if CONFIG_MACF_NET
3432 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
3433 --num;
3434 m_free(m);
3435 break;
3436 }
3437 #endif /* MAC_NET */
3438
3439 *np = m;
3440 if ((num % nsegs) == 0)
3441 np = &first->m_nextpkt;
3442 else
3443 np = &m->m_next;
3444
3445 if (num == needed)
3446 break;
3447 }
3448
3449 if (num > 0) {
3450 mtype_stat_add(MT_DATA, num);
3451 mtype_stat_sub(MT_FREE, num);
3452 }
3453
3454 num /= nsegs;
3455
3456 /* We've got them all; return to caller */
3457 if (num == *numlist) {
3458 ASSERT(mp_list == NULL && rmp_list == NULL);
3459 return (top);
3460 }
3461
3462 fail:
3463 /* Free up what's left of the above */
3464 if (mp_list != NULL)
3465 mcache_free_ext(cp, mp_list);
3466 if (rmp_list != NULL)
3467 mcache_free_ext(rcp, rmp_list);
3468 if (wantall && top != NULL) {
3469 m_freem(top);
3470 return (NULL);
3471 }
3472 *numlist = num;
3473 return (top);
3474 }
3475
3476 /*
3477 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3478 * packets on the receive ring.
3479 */
3480 __private_extern__ struct mbuf *
3481 m_getpacket_how(int wait)
3482 {
3483 unsigned int num_needed = 1;
3484
3485 return (m_getpackets_internal(&num_needed, 1, wait, 1,
3486 m_maxsize(MC_CL)));
3487 }
3488
3489 /*
3490 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3491 * packets on the receive ring.
3492 */
3493 struct mbuf *
3494 m_getpacket(void)
3495 {
3496 unsigned int num_needed = 1;
3497
3498 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
3499 m_maxsize(MC_CL)));
3500 }
3501
3502 /*
3503 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3504 * if this can't be met, return whatever number were available. Set up the
3505 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
3506 * are chained on the m_nextpkt field. Any packets requested beyond this are
3507 * chained onto the last packet header's m_next field.
3508 */
3509 struct mbuf *
3510 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
3511 {
3512 unsigned int n = num_needed;
3513
3514 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
3515 m_maxsize(MC_CL)));
3516 }
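/*
 * Illustrative sketch (not part of the build): request a batch of
 * cluster-backed packets and walk the m_nextpkt chain, e.g. to fill a
 * hypothetical receive ring:
 *
 *	struct mbuf *m, *list = m_getpackets(16, 16, M_DONTWAIT);
 *	for (m = list; m != NULL; m = m->m_nextpkt) {
 *		... attach m to a descriptor ...
 *	}
 *
 * Fewer than 16 packets may be returned; an unused chain can be released
 * in one call with m_freem_list().
 */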
3517
3518 /*
3519 * Return a list of mbuf hdrs set up as packet hdrs chained together
3520 * on the m_nextpkt field
3521 */
3522 struct mbuf *
3523 m_getpackethdrs(int num_needed, int how)
3524 {
3525 struct mbuf *m;
3526 struct mbuf **np, *top;
3527
3528 top = NULL;
3529 np = &top;
3530
3531 while (num_needed--) {
3532 m = _M_RETRYHDR(how, MT_DATA);
3533 if (m == NULL)
3534 break;
3535
3536 *np = m;
3537 np = &m->m_nextpkt;
3538 }
3539
3540 return (top);
3541 }
3542
3543 /*
3544 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
3545 * of packets freed. Used by the drivers.
3546 */
3547 int
3548 m_freem_list(struct mbuf *m)
3549 {
3550 struct mbuf *nextpkt;
3551 mcache_obj_t *mp_list = NULL;
3552 mcache_obj_t *mcl_list = NULL;
3553 mcache_obj_t *mbc_list = NULL;
3554 mcache_obj_t *m16k_list = NULL;
3555 mcache_obj_t *m_mcl_list = NULL;
3556 mcache_obj_t *m_mbc_list = NULL;
3557 mcache_obj_t *m_m16k_list = NULL;
3558 mcache_obj_t *ref_list = NULL;
3559 int pktcount = 0;
3560 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
3561
3562 while (m != NULL) {
3563 pktcount++;
3564
3565 nextpkt = m->m_nextpkt;
3566 m->m_nextpkt = NULL;
3567
3568 while (m != NULL) {
3569 struct mbuf *next = m->m_next;
3570 mcache_obj_t *o, *rfa;
3571 u_int32_t refcnt, flags;
3572
3573 if (m->m_type == MT_FREE)
3574 panic("m_free: freeing an already freed mbuf");
3575
3576 if (m->m_type != MT_FREE)
3577 mt_free++;
3578
3579 if (m->m_flags & M_PKTHDR) {
3580 m_tag_delete_chain(m, NULL);
3581 }
3582
3583 if (!(m->m_flags & M_EXT))
3584 goto simple_free;
3585
3586 o = (mcache_obj_t *)m->m_ext.ext_buf;
3587 refcnt = m_decref(m);
3588 flags = MEXT_FLAGS(m);
3589 if (refcnt == 0 && flags == 0) {
3590 if (m->m_ext.ext_free == NULL) {
3591 o->obj_next = mcl_list;
3592 mcl_list = o;
3593 } else if (m->m_ext.ext_free == m_bigfree) {
3594 o->obj_next = mbc_list;
3595 mbc_list = o;
3596 } else if (m->m_ext.ext_free == m_16kfree) {
3597 o->obj_next = m16k_list;
3598 m16k_list = o;
3599 } else {
3600 (*(m->m_ext.ext_free))((caddr_t)o,
3601 m->m_ext.ext_size,
3602 m->m_ext.ext_arg);
3603 }
3604 rfa = (mcache_obj_t *)MEXT_RFA(m);
3605 rfa->obj_next = ref_list;
3606 ref_list = rfa;
3607 MEXT_RFA(m) = NULL;
3608 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
3609 VERIFY(m->m_type != MT_FREE);
3610 /*
3611 * Amortize the costs of atomic operations
3612 * by doing them at the end, if possible.
3613 */
3614 if (m->m_type == MT_DATA)
3615 mt_data++;
3616 else if (m->m_type == MT_HEADER)
3617 mt_header++;
3618 else if (m->m_type == MT_SONAME)
3619 mt_soname++;
3620 else if (m->m_type == MT_TAG)
3621 mt_tag++;
3622 else
3623 mtype_stat_dec(m->m_type);
3624
3625 m->m_type = MT_FREE;
3626 m->m_flags = M_EXT;
3627 m->m_len = 0;
3628 m->m_next = m->m_nextpkt = NULL;
3629
3630 /* "Free" into the intermediate cache */
3631 o = (mcache_obj_t *)m;
3632 if (m->m_ext.ext_free == NULL) {
3633 o->obj_next = m_mcl_list;
3634 m_mcl_list = o;
3635 } else if (m->m_ext.ext_free == m_bigfree) {
3636 o->obj_next = m_mbc_list;
3637 m_mbc_list = o;
3638 } else {
3639 VERIFY(m->m_ext.ext_free == m_16kfree);
3640 o->obj_next = m_m16k_list;
3641 m_m16k_list = o;
3642 }
3643 m = next;
3644 continue;
3645 }
3646 simple_free:
3647 /*
3648 * Amortize the costs of atomic operations
3649 * by doing them at the end, if possible.
3650 */
3651 if (m->m_type == MT_DATA)
3652 mt_data++;
3653 else if (m->m_type == MT_HEADER)
3654 mt_header++;
3655 else if (m->m_type == MT_SONAME)
3656 mt_soname++;
3657 else if (m->m_type == MT_TAG)
3658 mt_tag++;
3659 else if (m->m_type != MT_FREE)
3660 mtype_stat_dec(m->m_type);
3661
3662 m->m_type = MT_FREE;
3663 m->m_flags = m->m_len = 0;
3664 m->m_next = m->m_nextpkt = NULL;
3665
3666 ((mcache_obj_t *)m)->obj_next = mp_list;
3667 mp_list = (mcache_obj_t *)m;
3668
3669 m = next;
3670 }
3671
3672 m = nextpkt;
3673 }
3674
3675 if (mt_free > 0)
3676 mtype_stat_add(MT_FREE, mt_free);
3677 if (mt_data > 0)
3678 mtype_stat_sub(MT_DATA, mt_data);
3679 if (mt_header > 0)
3680 mtype_stat_sub(MT_HEADER, mt_header);
3681 if (mt_soname > 0)
3682 mtype_stat_sub(MT_SONAME, mt_soname);
3683 if (mt_tag > 0)
3684 mtype_stat_sub(MT_TAG, mt_tag);
3685
3686 if (mp_list != NULL)
3687 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3688 if (mcl_list != NULL)
3689 mcache_free_ext(m_cache(MC_CL), mcl_list);
3690 if (mbc_list != NULL)
3691 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
3692 if (m16k_list != NULL)
3693 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
3694 if (m_mcl_list != NULL)
3695 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
3696 if (m_mbc_list != NULL)
3697 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
3698 if (m_m16k_list != NULL)
3699 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
3700 if (ref_list != NULL)
3701 mcache_free_ext(ref_cache, ref_list);
3702
3703 return (pktcount);
3704 }
3705
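/*
 * Illustrative sketch (kept out of the build): freeing a batch of
 * completed packets with a single m_freem_list() call instead of one
 * m_freem() per packet.  tx_ring_reclaim() is a hypothetical placeholder
 * assumed to return packets chained on m_nextpkt.
 */
#if 0
static void
example_tx_complete(void)
{
	struct mbuf *done;
	int freed;

	done = tx_ring_reclaim();	/* hypothetical driver routine */
	if (done != NULL) {
		freed = m_freem_list(done);
		printf("example: freed %d packets\n", freed);
	}
}
#endif
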
3706 void
3707 m_freem(struct mbuf *m)
3708 {
3709 while (m != NULL)
3710 m = m_free(m);
3711 }
3712
3713 /*
3714 * Mbuffer utility routines.
3715 */
3716
3717 /*
3718 * Compute the amount of space available before the current start
3719 * of data in an mbuf.
3720 */
3721 int
3722 m_leadingspace(struct mbuf *m)
3723 {
3724 if (m->m_flags & M_EXT) {
3725 if (MCLHASREFERENCE(m))
3726 return (0);
3727 return (m->m_data - m->m_ext.ext_buf);
3728 }
3729 if (m->m_flags & M_PKTHDR)
3730 return (m->m_data - m->m_pktdat);
3731 return (m->m_data - m->m_dat);
3732 }
3733
3734 /*
3735 * Compute the amount of space available after the end of data in an mbuf.
3736 */
3737 int
3738 m_trailingspace(struct mbuf *m)
3739 {
3740 if (m->m_flags & M_EXT) {
3741 if (MCLHASREFERENCE(m))
3742 return (0);
3743 return (m->m_ext.ext_buf + m->m_ext.ext_size -
3744 (m->m_data + m->m_len));
3745 }
3746 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
3747 }
3748
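/*
 * Illustrative sketch (kept out of the build): using the space routines
 * above to decide whether data can be added in place.  Note that both
 * return 0 for a shared cluster, which also guards against modifying
 * data that another mbuf still references.
 */
#if 0
static int
example_can_prepend(struct mbuf *m, int len)
{
	return (m_leadingspace(m) >= len);
}

static int
example_can_append(struct mbuf *m, int len)
{
	return (m_trailingspace(m) >= len);
}
#endif
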
3749 /*
3750 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
3751 * copy junk along. Does not adjust packet header length.
3752 */
3753 struct mbuf *
3754 m_prepend(struct mbuf *m, int len, int how)
3755 {
3756 struct mbuf *mn;
3757
3758 _MGET(mn, how, m->m_type);
3759 if (mn == NULL) {
3760 m_freem(m);
3761 return (NULL);
3762 }
3763 if (m->m_flags & M_PKTHDR) {
3764 M_COPY_PKTHDR(mn, m);
3765 m->m_flags &= ~M_PKTHDR;
3766 }
3767 mn->m_next = m;
3768 m = mn;
3769 if (len < MHLEN)
3770 MH_ALIGN(m, len);
3771 m->m_len = len;
3772 return (m);
3773 }
3774
3775 /*
3776 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3777 * chain, copy junk along, and adjust length.
3778 */
3779 struct mbuf *
3780 m_prepend_2(struct mbuf *m, int len, int how)
3781 {
3782 if (M_LEADINGSPACE(m) >= len) {
3783 m->m_data -= len;
3784 m->m_len += len;
3785 } else {
3786 m = m_prepend(m, len, how);
3787 }
3788 if ((m) && (m->m_flags & M_PKTHDR))
3789 m->m_pkthdr.len += len;
3790 return (m);
3791 }
3792
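/*
 * Illustrative sketch (kept out of the build): pushing a small header in
 * front of a packet with m_prepend_2() above.  struct example_hdr is a
 * made-up header type, not a real protocol structure.
 */
#if 0
struct example_hdr {
	u_int16_t eh_type;
	u_int16_t eh_len;
};

static struct mbuf *
example_add_header(struct mbuf *m, u_int16_t type)
{
	struct example_hdr *eh;

	/* Grows the front of the chain and bumps m_pkthdr.len */
	m = m_prepend_2(m, sizeof (*eh), M_DONTWAIT);
	if (m == NULL)
		return (NULL);	/* the chain was freed on failure */
	eh = MTOD(m, struct example_hdr *);
	eh->eh_type = type;
	eh->eh_len = sizeof (*eh);
	return (m);
}
#endif
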
3793 /*
3794 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
3795 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
3796 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
3797 */
3798 int MCFail;
3799
3800 struct mbuf *
3801 m_copym(struct mbuf *m, int off0, int len, int wait)
3802 {
3803 struct mbuf *n, *mhdr = NULL, **np;
3804 int off = off0;
3805 struct mbuf *top;
3806 int copyhdr = 0;
3807
3808 if (off < 0 || len < 0)
3809 panic("m_copym: invalid offset %d or len %d", off, len);
3810
3811 if (off == 0 && (m->m_flags & M_PKTHDR)) {
3812 mhdr = m;
3813 copyhdr = 1;
3814 }
3815
3816 while (off >= m->m_len) {
3817 if (m->m_next == NULL)
3818 panic("m_copym: invalid mbuf chain");
3819 off -= m->m_len;
3820 m = m->m_next;
3821 }
3822 np = &top;
3823 top = NULL;
3824
3825 while (len > 0) {
3826 if (m == NULL) {
3827 if (len != M_COPYALL)
3828 panic("m_copym: len != M_COPYALL");
3829 break;
3830 }
3831
3832 n = _M_RETRY(wait, m->m_type);
3833 *np = n;
3834
3835 if (n == NULL)
3836 goto nospace;
3837
3838 if (copyhdr != 0) {
3839 M_COPY_PKTHDR(n, mhdr);
3840 if (len == M_COPYALL)
3841 n->m_pkthdr.len -= off0;
3842 else
3843 n->m_pkthdr.len = len;
3844 copyhdr = 0;
3845 }
3846 if (len == M_COPYALL) {
3847 if (MIN(len, (m->m_len - off)) == len) {
3848 printf("m->m_len %ld - off %d = %ld, %ld\n",
3849 m->m_len, off, m->m_len - off,
3850 MIN(len, (m->m_len - off)));
3851 }
3852 }
3853 n->m_len = MIN(len, (m->m_len - off));
3854 if (n->m_len == M_COPYALL) {
3855 printf("n->m_len == M_COPYALL, fixing\n");
3856 n->m_len = MHLEN;
3857 }
3858 if (m->m_flags & M_EXT) {
3859 n->m_ext = m->m_ext;
3860 m_incref(m);
3861 n->m_data = m->m_data + off;
3862 n->m_flags |= M_EXT;
3863 } else {
3864 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
3865 (unsigned)n->m_len);
3866 }
3867 if (len != M_COPYALL)
3868 len -= n->m_len;
3869 off = 0;
3870 m = m->m_next;
3871 np = &n->m_next;
3872 }
3873
3874 if (top == NULL)
3875 MCFail++;
3876
3877 return (top);
3878 nospace:
3879
3880 m_freem(top);
3881 MCFail++;
3882 return (NULL);
3883 }
3884
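/*
 * Illustrative sketch (kept out of the build): taking a cheap, read-only
 * copy of an entire packet, e.g. for retransmission.  Cluster-backed data
 * is shared by reference (m_incref() above), so the copy must not be
 * written to.
 */
#if 0
static struct mbuf *
example_copy_packet(struct mbuf *pkt)
{
	return (m_copym(pkt, 0, M_COPYALL, M_DONTWAIT));	/* NULL on failure */
}
#endif
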
3885 /*
3886 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
3887 * within this routine. Also, the last mbuf and offset accessed are passed
3888 * out and can be passed back in to avoid having to rescan the entire mbuf
3889 * list (normally hung off of the socket).
3890 */
3891 struct mbuf *
3892 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
3893 struct mbuf **m_last, int *m_off)
3894 {
3895 struct mbuf *n, **np = NULL;
3896 int off = off0, len = len0;
3897 struct mbuf *top = NULL;
3898 int mcflags = MSLEEPF(wait);
3899 int copyhdr = 0;
3900 int type = 0;
3901 mcache_obj_t *list = NULL;
3902 int needed = 0;
3903
3904 if (off == 0 && (m->m_flags & M_PKTHDR))
3905 copyhdr = 1;
3906
3907 if (*m_last != NULL) {
3908 m = *m_last;
3909 off = *m_off;
3910 } else {
3911 while (off >= m->m_len) {
3912 off -= m->m_len;
3913 m = m->m_next;
3914 }
3915 }
3916
3917 n = m;
3918 while (len > 0) {
3919 needed++;
3920 ASSERT(n != NULL);
3921 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
3922 n = n->m_next;
3923 }
3924 needed++;
3925 len = len0;
3926
3927 /*
3928 * If the caller doesn't want to be put to sleep, mark it with
3929 * MCR_TRYHARD so that we may reclaim buffers from other places
3930 * before giving up.
3931 */
3932 if (mcflags & MCR_NOSLEEP)
3933 mcflags |= MCR_TRYHARD;
3934
3935 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
3936 mcflags) != needed)
3937 goto nospace;
3938
3939 needed = 0;
3940 while (len > 0) {
3941 n = (struct mbuf *)list;
3942 list = list->obj_next;
3943 ASSERT(n != NULL && m != NULL);
3944
3945 type = (top == NULL) ? MT_HEADER : m->m_type;
3946 MBUF_INIT(n, (top == NULL), type);
3947 #if CONFIG_MACF_NET
3948 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
3949 mtype_stat_inc(MT_HEADER);
3950 mtype_stat_dec(MT_FREE);
3951 m_free(n);
3952 goto nospace;
3953 }
3954 #endif /* CONFIG_MACF_NET */
3955
3956 if (top == NULL) {
3957 top = n;
3958 np = &top->m_next;
3959 continue;
3960 } else {
3961 needed++;
3962 *np = n;
3963 }
3964
3965 if (copyhdr) {
3966 M_COPY_PKTHDR(n, m);
3967 n->m_pkthdr.len = len;
3968 copyhdr = 0;
3969 }
3970 n->m_len = MIN(len, (m->m_len - off));
3971
3972 if (m->m_flags & M_EXT) {
3973 n->m_ext = m->m_ext;
3974 m_incref(m);
3975 n->m_data = m->m_data + off;
3976 n->m_flags |= M_EXT;
3977 } else {
3978 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
3979 (unsigned)n->m_len);
3980 }
3981 len -= n->m_len;
3982
3983 if (len == 0) {
3984 if ((off + n->m_len) == m->m_len) {
3985 *m_last = m->m_next;
3986 *m_off = 0;
3987 } else {
3988 *m_last = m;
3989 *m_off = off + n->m_len;
3990 }
3991 break;
3992 }
3993 off = 0;
3994 m = m->m_next;
3995 np = &n->m_next;
3996 }
3997
3998 mtype_stat_inc(MT_HEADER);
3999 mtype_stat_add(type, needed);
4000 mtype_stat_sub(MT_FREE, needed + 1);
4001
4002 ASSERT(list == NULL);
4003 return (top);
4004
4005 nospace:
4006 if (list != NULL)
4007 mcache_free_ext(m_cache(MC_MBUF), list);
4008 if (top != NULL)
4009 m_freem(top);
4010 MCFail++;
4011 return (NULL);
4012 }
4013
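/*
 * Illustrative sketch (kept out of the build): copying a buffered chain
 * in fixed-size chunks while resuming from the saved m_last/m_off cursor,
 * so each call avoids rescanning the chain from the start.  The chain is
 * assumed to hold at least `total' bytes of data.
 */
#if 0
static void
example_copy_in_chunks(struct mbuf *chain, int total, int chunk)
{
	struct mbuf *m_last = NULL, *seg;
	int m_off = 0, off = 0, len;

	while (off < total) {
		len = MIN(chunk, total - off);
		seg = m_copym_with_hdrs(chain, off, len, M_WAIT,
		    &m_last, &m_off);
		if (seg == NULL)
			break;
		m_freem(seg);	/* a real caller would hand this off */
		off += len;
	}
}
#endif
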
4014 /*
4015 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4016 * continuing for "len" bytes, into the indicated buffer.
4017 */
4018 void
4019 m_copydata(struct mbuf *m, int off, int len, caddr_t cp)
4020 {
4021 unsigned count;
4022
4023 if (off < 0 || len < 0)
4024 panic("m_copydata: invalid offset %d or len %d", off, len);
4025
4026 while (off > 0) {
4027 if (m == NULL)
4028 panic("m_copydata: invalid mbuf chain");
4029 if (off < m->m_len)
4030 break;
4031 off -= m->m_len;
4032 m = m->m_next;
4033 }
4034 while (len > 0) {
4035 if (m == NULL)
4036 panic("m_copydata: invalid mbuf chain");
4037 count = MIN(m->m_len - off, len);
4038 bcopy(MTOD(m, caddr_t) + off, cp, count);
4039 len -= count;
4040 cp += count;
4041 off = 0;
4042 m = m->m_next;
4043 }
4044 }
4045
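/*
 * Illustrative sketch (kept out of the build): using m_copydata() above to
 * read a small field that may straddle mbuf boundaries.  `m' is assumed to
 * be a packet-header mbuf so m_pkthdr.len is valid.
 */
#if 0
static int
example_peek_word(struct mbuf *m, u_int32_t *valp)
{
	if (m->m_pkthdr.len < (int)sizeof (*valp))
		return (0);
	m_copydata(m, 0, sizeof (*valp), (caddr_t)valp);
	return (1);
}
#endif
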
4046 /*
4047 * Concatenate mbuf chain n to m. Both chains must be of the same type
4048 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4049 */
4050 void
4051 m_cat(struct mbuf *m, struct mbuf *n)
4052 {
4053 while (m->m_next)
4054 m = m->m_next;
4055 while (n) {
4056 if ((m->m_flags & M_EXT) ||
4057 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4058 /* just join the two chains */
4059 m->m_next = n;
4060 return;
4061 }
4062 /* splat the data from one into the other */
4063 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4064 (u_int)n->m_len);
4065 m->m_len += n->m_len;
4066 n = m_free(n);
4067 }
4068 }
4069
4070 void
4071 m_adj(struct mbuf *mp, int req_len)
4072 {
4073 int len = req_len;
4074 struct mbuf *m;
4075 int count;
4076
4077 if ((m = mp) == NULL)
4078 return;
4079 if (len >= 0) {
4080 /*
4081 * Trim from head.
4082 */
4083 while (m != NULL && len > 0) {
4084 if (m->m_len <= len) {
4085 len -= m->m_len;
4086 m->m_len = 0;
4087 m = m->m_next;
4088 } else {
4089 m->m_len -= len;
4090 m->m_data += len;
4091 len = 0;
4092 }
4093 }
4094 m = mp;
4095 if (m->m_flags & M_PKTHDR)
4096 m->m_pkthdr.len -= (req_len - len);
4097 } else {
4098 /*
4099 * Trim from tail. Scan the mbuf chain,
4100 * calculating its length and finding the last mbuf.
4101 * If the adjustment only affects this mbuf, then just
4102 * adjust and return. Otherwise, rescan and truncate
4103 * after the remaining size.
4104 */
4105 len = -len;
4106 count = 0;
4107 for (;;) {
4108 count += m->m_len;
4109 if (m->m_next == (struct mbuf *)0)
4110 break;
4111 m = m->m_next;
4112 }
4113 if (m->m_len >= len) {
4114 m->m_len -= len;
4115 m = mp;
4116 if (m->m_flags & M_PKTHDR)
4117 m->m_pkthdr.len -= len;
4118 return;
4119 }
4120 count -= len;
4121 if (count < 0)
4122 count = 0;
4123 /*
4124 * Correct length for chain is "count".
4125 * Find the mbuf with last data, adjust its length,
4126 * and toss data from remaining mbufs on chain.
4127 */
4128 m = mp;
4129 if (m->m_flags & M_PKTHDR)
4130 m->m_pkthdr.len = count;
4131 for (; m; m = m->m_next) {
4132 if (m->m_len >= count) {
4133 m->m_len = count;
4134 break;
4135 }
4136 count -= m->m_len;
4137 }
4138 while ((m = m->m_next))
4139 m->m_len = 0;
4140 }
4141 }
4142
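/*
 * Illustrative sketch (kept out of the build): trimming a received frame
 * with m_adj() above.  The header and trailer lengths are made-up values,
 * not constants used elsewhere in this file.
 */
#if 0
#define	EXAMPLE_LINK_HDR_LEN	14	/* hypothetical link-layer header */
#define	EXAMPLE_FCS_LEN		4	/* hypothetical trailing checksum */

static void
example_trim_frame(struct mbuf *m)
{
	m_adj(m, EXAMPLE_LINK_HDR_LEN);	/* positive: trim from the head */
	m_adj(m, -EXAMPLE_FCS_LEN);	/* negative: trim from the tail */
}
#endif
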
4143 /*
4144 * Rearrange an mbuf chain so that len bytes are contiguous
4145 * and in the data area of an mbuf (so that mtod and dtom
4146 * will work for a structure of size len). Returns the resulting
4147 * mbuf chain on success, frees it and returns null on failure.
4148 * If there is room, it will add up to max_protohdr-len extra bytes to the
4149 * contiguous region in an attempt to avoid being called next time.
4150 */
4151 int MPFail;
4152
4153 struct mbuf *
4154 m_pullup(struct mbuf *n, int len)
4155 {
4156 struct mbuf *m;
4157 int count;
4158 int space;
4159
4160 /*
4161 * If first mbuf has no cluster, and has room for len bytes
4162 * without shifting current data, pullup into it;
4163 * otherwise allocate a new mbuf to prepend to the chain.
4164 */
4165 if ((n->m_flags & M_EXT) == 0 &&
4166 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4167 if (n->m_len >= len)
4168 return (n);
4169 m = n;
4170 n = n->m_next;
4171 len -= m->m_len;
4172 } else {
4173 if (len > MHLEN)
4174 goto bad;
4175 _MGET(m, M_DONTWAIT, n->m_type);
4176 if (m == 0)
4177 goto bad;
4178 m->m_len = 0;
4179 if (n->m_flags & M_PKTHDR) {
4180 M_COPY_PKTHDR(m, n);
4181 n->m_flags &= ~M_PKTHDR;
4182 }
4183 }
4184 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4185 do {
4186 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4187 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4188 (unsigned)count);
4189 len -= count;
4190 m->m_len += count;
4191 n->m_len -= count;
4192 space -= count;
4193 if (n->m_len)
4194 n->m_data += count;
4195 else
4196 n = m_free(n);
4197 } while (len > 0 && n);
4198 if (len > 0) {
4199 (void) m_free(m);
4200 goto bad;
4201 }
4202 m->m_next = n;
4203 return (m);
4204 bad:
4205 m_freem(n);
4206 MPFail++;
4207 return (0);
4208 }
4209
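/*
 * Illustrative sketch (kept out of the build): the classic m_pullup()
 * pattern used before dereferencing a fixed-size header through MTOD().
 * EXAMPLE_FIXED_HDR_LEN is a made-up length.
 */
#if 0
#define	EXAMPLE_FIXED_HDR_LEN	20

static struct mbuf *
example_make_header_contiguous(struct mbuf *m)
{
	if (m->m_len < EXAMPLE_FIXED_HDR_LEN &&
	    (m = m_pullup(m, EXAMPLE_FIXED_HDR_LEN)) == NULL)
		return (NULL);	/* m_pullup() already freed the chain */
	return (m);
}
#endif
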
4210 /*
4211 * Partition an mbuf chain in two pieces, returning the tail --
4212 * all but the first len0 bytes. In case of failure, it returns NULL and
4213 * attempts to restore the chain to its original state.
4214 */
4215 struct mbuf *
4216 m_split(struct mbuf *m0, int len0, int wait)
4217 {
4218 struct mbuf *m, *n;
4219 unsigned len = len0, remain;
4220
4221 for (m = m0; m && len > m->m_len; m = m->m_next)
4222 len -= m->m_len;
4223 if (m == NULL)
4224 return (NULL);
4225 remain = m->m_len - len;
4226 if (m0->m_flags & M_PKTHDR) {
4227 _MGETHDR(n, wait, m0->m_type);
4228 if (n == NULL)
4229 return (NULL);
4230 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4231 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4232 m0->m_pkthdr.len = len0;
4233 if (m->m_flags & M_EXT)
4234 goto extpacket;
4235 if (remain > MHLEN) {
4236 /* m can't be the lead packet */
4237 MH_ALIGN(n, 0);
4238 n->m_next = m_split(m, len, wait);
4239 if (n->m_next == NULL) {
4240 (void) m_free(n);
4241 return (NULL);
4242 } else
4243 return (n);
4244 } else
4245 MH_ALIGN(n, remain);
4246 } else if (remain == 0) {
4247 n = m->m_next;
4248 m->m_next = NULL;
4249 return (n);
4250 } else {
4251 _MGET(n, wait, m->m_type);
4252 if (n == NULL)
4253 return (NULL);
4254 M_ALIGN(n, remain);
4255 }
4256 extpacket:
4257 if (m->m_flags & M_EXT) {
4258 n->m_flags |= M_EXT;
4259 n->m_ext = m->m_ext;
4260 m_incref(m);
4261 n->m_data = m->m_data + len;
4262 } else {
4263 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4264 }
4265 n->m_len = remain;
4266 m->m_len = len;
4267 n->m_next = m->m_next;
4268 m->m_next = NULL;
4269 return (n);
4270 }
4271
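/*
 * Illustrative sketch (kept out of the build): splitting a packet into a
 * head of `first_len' bytes and a tail holding the remainder, as described
 * in the comment above m_split().
 */
#if 0
static void
example_split_packet(struct mbuf *pkt, int first_len)
{
	struct mbuf *tail;

	tail = m_split(pkt, first_len, M_DONTWAIT);
	if (tail == NULL)
		return;		/* failure; `pkt' is left as it was */
	m_freem(tail);		/* a real caller would send both pieces */
}
#endif
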
4272 /*
4273 * Routine to copy from device local memory into mbufs.
4274 */
4275 struct mbuf *
4276 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4277 void (*copy)(const void *, void *, size_t))
4278 {
4279 struct mbuf *m;
4280 struct mbuf *top = NULL, **mp = &top;
4281 int off = off0, len;
4282 char *cp;
4283 char *epkt;
4284
4285 cp = buf;
4286 epkt = cp + totlen;
4287 if (off) {
4288 /*
4289 * If 'off' is non-zero, packet is trailer-encapsulated,
4290 * so we have to skip the type and length fields.
4291 */
4292 cp += off + 2 * sizeof (u_int16_t);
4293 totlen -= 2 * sizeof (u_int16_t);
4294 }
4295 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4296 if (m == NULL)
4297 return (NULL);
4298 m->m_pkthdr.rcvif = ifp;
4299 m->m_pkthdr.len = totlen;
4300 m->m_len = MHLEN;
4301
4302 while (totlen > 0) {
4303 if (top != NULL) {
4304 _MGET(m, M_DONTWAIT, MT_DATA);
4305 if (m == NULL) {
4306 m_freem(top);
4307 return (NULL);
4308 }
4309 m->m_len = MLEN;
4310 }
4311 len = MIN(totlen, epkt - cp);
4312 if (len >= MINCLSIZE) {
4313 MCLGET(m, M_DONTWAIT);
4314 if (m->m_flags & M_EXT) {
4315 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4316 } else {
4317 /* give up when it's out of cluster mbufs */
4318 if (top != NULL)
4319 m_freem(top);
4320 m_freem(m);
4321 return (NULL);
4322 }
4323 } else {
4324 /*
4325 * Place initial small packet/header at end of mbuf.
4326 */
4327 if (len < m->m_len) {
4328 if (top == NULL &&
4329 len + max_linkhdr <= m->m_len)
4330 m->m_data += max_linkhdr;
4331 m->m_len = len;
4332 } else {
4333 len = m->m_len;
4334 }
4335 }
4336 if (copy)
4337 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4338 else
4339 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4340 cp += len;
4341 *mp = m;
4342 mp = &m->m_next;
4343 totlen -= len;
4344 if (cp == epkt)
4345 cp = buf;
4346 }
4347 return (top);
4348 }
4349
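/*
 * Illustrative sketch (kept out of the build): pulling a frame out of
 * device-local memory with m_devget() above.  Passing a NULL copy routine
 * falls back to bcopy(); `dev_ram' and `dev_len' are hypothetical values
 * supplied by the caller.
 */
#if 0
static struct mbuf *
example_rx_from_device(struct ifnet *ifp, char *dev_ram, int dev_len)
{
	return (m_devget(dev_ram, dev_len, 0, ifp, NULL));
}
#endif
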
4350 /*
4351 * Cluster freelist allocation check.
4352 */
4353 static int
4354 m_howmany(int num, size_t bufsize)
4355 {
4356 int i = 0, j = 0;
4357 u_int32_t m_clusters, m_bigclusters, m_16kclusters;
4358 u_int32_t m_clfree, m_bigclfree, m_16kclfree;
4359
4360 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4361
4362 m_clusters = m_total(MC_CL);
4363 m_bigclusters = m_total(MC_BIGCL);
4364 m_16kclusters = m_total(MC_16KCL);
4365 m_clfree = m_infree(MC_CL);
4366 m_bigclfree = m_infree(MC_BIGCL);
4367 m_16kclfree = m_infree(MC_16KCL);
4368
4369 /* Bail if we've maxed out the mbuf memory map */
4370 if ((bufsize != m_maxsize(MC_16KCL) &&
4371 (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
4372 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
4373 (m_16kclusters << 3) >= njcl)) {
4374 #if DEBUG
4375 if (bufsize == MCLBYTES && num > m_clfree) {
4376 printf("m_howmany - out of small clusters, "
4377 "%d short\n", num - mbstat.m_clfree);
4378 }
4379 #endif /* DEBUG */
4380 return (0);
4381 }
4382
4383 if (bufsize == m_maxsize(MC_CL)) {
4384 /* Under minimum */
4385 if (m_clusters < MINCL)
4386 return (MINCL - m_clusters);
4387 /* Too few (free < 1/16 total) and not over maximum */
4388 if (m_clusters < m_maxlimit(MC_CL)) {
4389 if (m_clfree >= MCL_LOWAT)
4390 return (0);
4391 if (num >= m_clfree)
4392 i = num - m_clfree;
4393 if (((m_clusters + num) >> 4) > m_clfree)
4394 j = ((m_clusters + num) >> 4) - m_clfree;
4395 i = MAX(i, j);
4396 if (i + m_clusters >= m_maxlimit(MC_CL))
4397 i = m_maxlimit(MC_CL) - m_clusters;
4398 }
4399 VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
4400 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4401 /* Under minimum */
4402 if (m_bigclusters < MINBIGCL)
4403 return (MINBIGCL - m_bigclusters);
4404 /* Too few (free < 1/16 total) and not over maximum */
4405 if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
4406 if (m_bigclfree >= MBIGCL_LOWAT)
4407 return (0);
4408 if (num >= m_bigclfree)
4409 i = num - m_bigclfree;
4410 if (((m_bigclusters + num) >> 4) > m_bigclfree)
4411 j = ((m_bigclusters + num) >> 4) - m_bigclfree;
4412 i = MAX(i, j);
4413 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
4414 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
4415 }
4416 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
4417 } else {
4418 VERIFY(njcl > 0);
4419 /* Under minimum */
4420 if (m_16kclusters < MIN16KCL)
4421 return (MIN16KCL - m_16kclusters);
4422 /* Too few (free < 1/16 total) and not over maximum */
4423 if (m_16kclusters < m_maxlimit(MC_16KCL)) {
4424 if (m_16kclfree >= M16KCL_LOWAT)
4425 return (0);
4426 if (num >= m_16kclfree)
4427 i = num - m_16kclfree;
4428 if (((m_16kclusters + num) >> 4) > m_16kclfree)
4429 j = ((m_16kclusters + num) >> 4) - m_16kclfree;
4430 i = MAX(i, j);
4431 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
4432 i = m_maxlimit(MC_16KCL) - m_16kclusters;
4433 }
4434 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
4435 }
4436
4437 return (i);
4438 }
4439
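/*
 * Worked example of the low-water arithmetic above, with made-up numbers:
 * suppose m_clusters = 1600, num = 10 and m_clfree = 40, and assume the
 * free count is below MCL_LOWAT so the growth branch is taken.  Then
 * i = 0 (since m_clfree >= num), the desired free count is
 * (1600 + 10) >> 4 = 100, so j = 100 - 40 = 60, and MAX(i, j) = 60
 * clusters are requested, subject to the m_maxlimit(MC_CL) cap.
 */
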
4440 /*
4441 * Copy data from a buffer back into the indicated mbuf chain,
4442 * starting "off" bytes from the beginning, extending the mbuf
4443 * chain if necessary.
4444 */
4445 void
4446 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
4447 {
4448 int mlen;
4449 struct mbuf *m = m0, *n;
4450 int totlen = 0;
4451
4452 if (m0 == NULL)
4453 return;
4454 while (off > (mlen = m->m_len)) {
4455 off -= mlen;
4456 totlen += mlen;
4457 if (m->m_next == NULL) {
4458 n = m_getclr(M_DONTWAIT, m->m_type);
4459 if (n == NULL)
4460 goto out;
4461 n->m_len = MIN(MLEN, len + off);
4462 m->m_next = n;
4463 }
4464 m = m->m_next;
4465 }
4466 while (len > 0) {
4467 mlen = MIN(m->m_len - off, len);
4468 bcopy(cp, off + MTOD(m, caddr_t), (unsigned)mlen);
4469 cp += mlen;
4470 len -= mlen;
4471 mlen += off;
4472 off = 0;
4473 totlen += mlen;
4474 if (len == 0)
4475 break;
4476 if (m->m_next == NULL) {
4477 n = _M_GET(M_DONTWAIT, m->m_type);
4478 if (n == NULL)
4479 break;
4480 n->m_len = MIN(MLEN, len);
4481 m->m_next = n;
4482 }
4483 m = m->m_next;
4484 }
4485 out:
4486 if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
4487 m->m_pkthdr.len = totlen;
4488 }
4489
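/*
 * Illustrative sketch (kept out of the build): patching a 4-byte value at
 * a given offset with m_copyback() above.  Note this legacy interface
 * extends the chain with zero-filled mbufs as needed and silently gives
 * up if that allocation fails.
 */
#if 0
static void
example_patch_word(struct mbuf *m, int off, u_int32_t value)
{
	m_copyback(m, off, sizeof (value), (caddr_t)&value);
}
#endif
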
4490 char *
4491 mcl_to_paddr(char *addr)
4492 {
4493 int base_phys;
4494
4495 if (!MBUF_IN_MAP(addr))
4496 return (NULL);
4497 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
4498
4499 if (base_phys == 0)
4500 return (NULL);
4501 return ((char *)((int)base_phys | ((int)addr & PGOFSET)));
4502 }
4503
4504 /*
4505 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
4506 * And really copy the thing. That way, we don't "precompute" checksums
4507 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
4508 * small packets, don't dup into a cluster. That way received packets
4509 * don't take up too much room in the sockbuf (cf. sbspace()).
4510 */
4511 int MDFail;
4512
4513 struct mbuf *
4514 m_dup(struct mbuf *m, int how)
4515 {
4516 struct mbuf *n, **np;
4517 struct mbuf *top;
4518 int copyhdr = 0;
4519
4520 np = &top;
4521 top = NULL;
4522 if (m->m_flags & M_PKTHDR)
4523 copyhdr = 1;
4524
4525 /*
4526 * Quick check: if we have one mbuf and its data fits in an
4527 * mbuf with packet header, just copy and go.
4528 */
4529 if (m->m_next == NULL) {
4530 /* Then just move the data into an mbuf and be done... */
4531 if (copyhdr) {
4532 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
4533 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
4534 return (NULL);
4535 n->m_len = m->m_len;
4536 m_dup_pkthdr(n, m, how);
4537 bcopy(m->m_data, n->m_data, m->m_len);
4538 return (n);
4539 }
4540 } else if (m->m_len <= MLEN) {
4541 if ((n = _M_GET(how, m->m_type)) == NULL)
4542 return (NULL);
4543 bcopy(m->m_data, n->m_data, m->m_len);
4544 n->m_len = m->m_len;
4545 return (n);
4546 }
4547 }
4548 while (m != NULL) {
4549 #if BLUE_DEBUG
4550 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
4551 m->m_data);
4552 #endif
4553 if (copyhdr)
4554 n = _M_GETHDR(how, m->m_type);
4555 else
4556 n = _M_GET(how, m->m_type);
4557 if (n == NULL)
4558 goto nospace;
4559 if (m->m_flags & M_EXT) {
4560 if (m->m_len <= m_maxsize(MC_CL))
4561 MCLGET(n, how);
4562 else if (m->m_len <= m_maxsize(MC_BIGCL))
4563 n = m_mbigget(n, how);
4564 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
4565 n = m_m16kget(n, how);
4566 if (!(n->m_flags & M_EXT)) {
4567 (void) m_free(n);
4568 goto nospace;
4569 }
4570 }
4571 *np = n;
4572 if (copyhdr) {
4573 /* Don't use M_COPY_PKTHDR: preserve m_data */
4574 m_dup_pkthdr(n, m, how);
4575 copyhdr = 0;
4576 if (!(n->m_flags & M_EXT))
4577 n->m_data = n->m_pktdat;
4578 }
4579 n->m_len = m->m_len;
4580 /*
4581 * Get the dup on the same boundary as the original.
4582 * Assume that the two mbufs have the same offset to the data area
4583 * (up to word boundaries).
4584 */
4585 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
4586 m = m->m_next;
4587 np = &n->m_next;
4588 #if BLUE_DEBUG
4589 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
4590 n->m_data);
4591 #endif
4592 }
4593
4594 if (top == NULL)
4595 MDFail++;
4596 return (top);
4597
4598 nospace:
4599 m_freem(top);
4600 MDFail++;
4601 return (NULL);
4602 }
4603
4604 #define MBUF_MULTIPAGES(m) \
4605 (((m)->m_flags & M_EXT) && \
4606 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
4607 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
4608 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
4609
4610 static struct mbuf *
4611 m_expand(struct mbuf *m, struct mbuf **last)
4612 {
4613 struct mbuf *top = NULL;
4614 struct mbuf **nm = &top;
4615 uintptr_t data0, data;
4616 unsigned int len0, len;
4617
4618 VERIFY(MBUF_MULTIPAGES(m));
4619 VERIFY(m->m_next == NULL);
4620 data0 = (uintptr_t)m->m_data;
4621 len0 = m->m_len;
4622 *last = top;
4623
4624 for (;;) {
4625 struct mbuf *n;
4626
4627 data = data0;
4628 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
4629 len = NBPG;
4630 else if (!IS_P2ALIGNED(data, NBPG) &&
4631 P2ROUNDUP(data, NBPG) < (data + len0))
4632 len = P2ROUNDUP(data, NBPG) - data;
4633 else
4634 len = len0;
4635
4636 VERIFY(len > 0);
4637 VERIFY(m->m_flags & M_EXT);
4638 m->m_data = (void *)data;
4639 m->m_len = len;
4640
4641 *nm = *last = m;
4642 nm = &m->m_next;
4643 m->m_next = NULL;
4644
4645 data0 += len;
4646 len0 -= len;
4647 if (len0 == 0)
4648 break;
4649
4650 n = _M_RETRY(M_DONTWAIT, MT_DATA);
4651 if (n == NULL) {
4652 m_freem(top);
4653 top = *last = NULL;
4654 break;
4655 }
4656
4657 n->m_ext = m->m_ext;
4658 m_incref(m);
4659 n->m_flags |= M_EXT;
4660 m = n;
4661 }
4662 return (top);
4663 }
4664
4665 struct mbuf *
4666 m_normalize(struct mbuf *m)
4667 {
4668 struct mbuf *top = NULL;
4669 struct mbuf **nm = &top;
4670 boolean_t expanded = FALSE;
4671
4672 while (m != NULL) {
4673 struct mbuf *n;
4674
4675 n = m->m_next;
4676 m->m_next = NULL;
4677
4678 /* Does the data cross one or more page boundaries? */
4679 if (MBUF_MULTIPAGES(m)) {
4680 struct mbuf *last;
4681 if ((m = m_expand(m, &last)) == NULL) {
4682 m_freem(n);
4683 m_freem(top);
4684 top = NULL;
4685 break;
4686 }
4687 *nm = m;
4688 nm = &last->m_next;
4689 expanded = TRUE;
4690 } else {
4691 *nm = m;
4692 nm = &m->m_next;
4693 }
4694 m = n;
4695 }
4696 if (expanded)
4697 atomic_add_32(&mb_normalized, 1);
4698 return (top);
4699 }
4700
4701 void
4702 m_mchtype(struct mbuf *m, int t)
4703 {
4704 mtype_stat_inc(t);
4705 mtype_stat_dec(m->m_type);
4706 (m)->m_type = t;
4707 }
4708
4709 void *
4710 m_mtod(struct mbuf *m)
4711 {
4712 return (MTOD(m, void *));
4713 }
4714
4715 struct mbuf *
4716 m_dtom(void *x)
4717 {
4718 return ((struct mbuf *)((u_long)(x) & ~(MSIZE-1)));
4719 }
4720
4721 void
4722 m_mcheck(struct mbuf *m)
4723 {
4724 _MCHECK(m);
4725 }
4726
4727 /*
4728 * Inform the corresponding mcache(s) that there's a waiter below.
4729 */
4730 static void
4731 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
4732 {
4733 mcache_waiter_inc(m_cache(class));
4734 if (comp) {
4735 if (class == MC_CL) {
4736 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4737 } else if (class == MC_BIGCL) {
4738 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4739 } else if (class == MC_16KCL) {
4740 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
4741 } else {
4742 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4743 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4744 }
4745 }
4746 }
4747
4748 /*
4749 * Inform the corresponding mcache(s) that there's no more waiter below.
4750 */
4751 static void
4752 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
4753 {
4754 mcache_waiter_dec(m_cache(class));
4755 if (comp) {
4756 if (class == MC_CL) {
4757 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4758 } else if (class == MC_BIGCL) {
4759 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4760 } else if (class == MC_16KCL) {
4761 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
4762 } else {
4763 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4764 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4765 }
4766 }
4767 }
4768
4769 /*
4770 * Called during blocking allocation. Returns TRUE if one or more objects
4771 * are available at the per-CPU cache layer and that the allocation should be
4772 * retried at that level.
4773 */
4774 static boolean_t
4775 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
4776 {
4777 boolean_t mcache_retry = FALSE;
4778
4779 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4780
4781 /* Check if there's anything at the cache layer */
4782 if (mbuf_cached_above(class, wait)) {
4783 mcache_retry = TRUE;
4784 goto done;
4785 }
4786
4787 /* Nothing? Then try hard to get it from somewhere */
4788 m_reclaim(class, num, (wait & MCR_COMP));
4789
4790 /* We tried hard and got something? */
4791 if (m_infree(class) > 0) {
4792 mbstat.m_wait++;
4793 goto done;
4794 } else if (mbuf_cached_above(class, wait)) {
4795 mbstat.m_wait++;
4796 mcache_retry = TRUE;
4797 goto done;
4798 } else if (wait & MCR_TRYHARD) {
4799 mcache_retry = TRUE;
4800 goto done;
4801 }
4802
4803 /*
4804 * There's really nothing for us right now; inform the
4805 * cache(s) that there is a waiter below and go to sleep.
4806 */
4807 mbuf_waiter_inc(class, (wait & MCR_COMP));
4808
4809 VERIFY(!(wait & MCR_NOSLEEP));
4810 mb_waiters++;
4811 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
4812
4813 /* We are now up; stop getting notified until next round */
4814 mbuf_waiter_dec(class, (wait & MCR_COMP));
4815
4816 /* We waited and got something */
4817 if (m_infree(class) > 0) {
4818 mbstat.m_wait++;
4819 goto done;
4820 } else if (mbuf_cached_above(class, wait)) {
4821 mbstat.m_wait++;
4822 mcache_retry = TRUE;
4823 }
4824 done:
4825 return (mcache_retry);
4826 }
4827
4828 static void
4829 mbuf_worker_thread(void)
4830 {
4831 int mbuf_expand;
4832
4833 while (1) {
4834 lck_mtx_lock(mbuf_mlock);
4835
4836 mbuf_expand = 0;
4837 if (mbuf_expand_mcl) {
4838 int n;
4839
4840 /* Adjust to the current number of clusters in use */
4841 n = mbuf_expand_mcl -
4842 (m_total(MC_CL) - m_infree(MC_CL));
4843 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
4844 n = m_maxlimit(MC_CL) - m_total(MC_CL);
4845 mbuf_expand_mcl = 0;
4846
4847 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
4848 mbuf_expand++;
4849 }
4850 if (mbuf_expand_big) {
4851 int n;
4852
4853 /* Adjust to the current number of 4 KB clusters in use */
4854 n = mbuf_expand_big -
4855 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
4856 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
4857 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
4858 mbuf_expand_big = 0;
4859
4860 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
4861 mbuf_expand++;
4862 }
4863 if (mbuf_expand_16k) {
4864 int n;
4865
4866 /* Adjust to the current number of 16 KB clusters in use */
4867 n = mbuf_expand_16k -
4868 (m_total(MC_16KCL) - m_infree(MC_16KCL));
4869 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
4870 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
4871 mbuf_expand_16k = 0;
4872
4873 if (n > 0)
4874 (void) freelist_populate(MC_16KCL, n, M_WAIT);
4875 }
4876
4877 /*
4878 * Because we can run out of memory before filling the mbuf
4879 * map, we should not allocate more clusters than there are
4880 * mbufs -- otherwise we could have a large number of useless
4881 * clusters allocated.
4882 */
4883 if (mbuf_expand) {
4884 while (m_total(MC_MBUF) <
4885 (m_total(MC_BIGCL) + m_total(MC_CL))) {
4886 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
4887 break;
4888 }
4889 }
4890
4891 lck_mtx_unlock(mbuf_mlock);
4892
4893 assert_wait(&mbuf_worker_run, THREAD_UNINT);
4894 (void) thread_block((thread_continue_t)mbuf_worker_thread);
4895 }
4896 }
4897
4898 static void
4899 mbuf_worker_thread_init(void)
4900 {
4901 mbuf_worker_ready++;
4902 mbuf_worker_thread();
4903 }
4904
4905 static mcl_slab_t *
4906 slab_get(void *buf)
4907 {
4908 mcl_slabg_t *slg;
4909 unsigned int ix, k;
4910
4911 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4912
4913 VERIFY(MBUF_IN_MAP(buf));
4914 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
4915 VERIFY(ix < maxslabgrp);
4916
4917 if ((slg = slabstbl[ix]) == NULL) {
4918 /*
4919 * In the current implementation, we never shrink the memory
4920 * pool (hence the cluster map); if we attempt to reallocate
4921 * a cluster group when it's already allocated, panic since
4922 * this is a sign of a memory corruption (slabstbl[ix] got
4923 * nullified). This also means that there shouldn't be any
4924 * hole in the kernel sub-map for the mbuf pool.
4925 */
4926 ++slabgrp;
4927 VERIFY(ix < slabgrp);
4928 /*
4929 * Slabs expansion can only be done single threaded; when
4930 * we get here, it must be as a result of m_clalloc() which
4931 * is serialized and therefore mb_clalloc_busy must be set.
4932 */
4933 VERIFY(mb_clalloc_busy);
4934 lck_mtx_unlock(mbuf_mlock);
4935
4936 /* This is a new buffer; create the slabs group for it */
4937 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
4938 M_WAITOK | M_ZERO);
4939 VERIFY(slg != NULL);
4940
4941 lck_mtx_lock(mbuf_mlock);
4942 /*
4943 * No other thread could have gone into m_clalloc() after
4944 * we dropped the lock above, so verify that it's true.
4945 */
4946 VERIFY(mb_clalloc_busy);
4947
4948 slabstbl[ix] = slg;
4949
4950 /* Chain each slab in the group to its forward neighbor */
4951 for (k = 1; k < NSLABSPMB; k++)
4952 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
4953 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
4954
4955 /* And chain the last slab in the previous group to this */
4956 if (ix > 0) {
4957 VERIFY(slabstbl[ix - 1]->
4958 slg_slab[NSLABSPMB - 1].sl_next == NULL);
4959 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
4960 &slg->slg_slab[0];
4961 }
4962 }
4963
4964 ix = MTOCL(buf) % NSLABSPMB;
4965 VERIFY(ix < NSLABSPMB);
4966
4967 return (&slg->slg_slab[ix]);
4968 }
4969
4970 static void
4971 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
4972 void *base, void *head, unsigned int len, int refcnt, int chunks)
4973 {
4974 sp->sl_class = class;
4975 sp->sl_flags = flags;
4976 sp->sl_base = base;
4977 sp->sl_head = head;
4978 sp->sl_len = len;
4979 sp->sl_refcnt = refcnt;
4980 sp->sl_chunks = chunks;
4981 slab_detach(sp);
4982 }
4983
4984 static void
4985 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
4986 {
4987 VERIFY(slab_is_detached(sp));
4988 m_slab_cnt(class)++;
4989 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
4990 sp->sl_flags &= ~SLF_DETACHED;
4991 if (class == MC_BIGCL) {
4992 sp = sp->sl_next;
4993 /* Next slab must already be present */
4994 VERIFY(sp != NULL);
4995 VERIFY(slab_is_detached(sp));
4996 sp->sl_flags &= ~SLF_DETACHED;
4997 } else if (class == MC_16KCL) {
4998 int k;
4999 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5000 sp = sp->sl_next;
5001 /* Next slab must already be present */
5002 VERIFY(sp != NULL);
5003 VERIFY(slab_is_detached(sp));
5004 sp->sl_flags &= ~SLF_DETACHED;
5005 }
5006 }
5007 }
5008
5009 static void
5010 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
5011 {
5012 VERIFY(!slab_is_detached(sp));
5013 VERIFY(m_slab_cnt(class) > 0);
5014 m_slab_cnt(class)--;
5015 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
5016 slab_detach(sp);
5017 if (class == MC_BIGCL) {
5018 sp = sp->sl_next;
5019 /* Next slab must already be present */
5020 VERIFY(sp != NULL);
5021 VERIFY(!slab_is_detached(sp));
5022 slab_detach(sp);
5023 } else if (class == MC_16KCL) {
5024 int k;
5025 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5026 sp = sp->sl_next;
5027 /* Next slab must already be present */
5028 VERIFY(sp != NULL);
5029 VERIFY(!slab_is_detached(sp));
5030 slab_detach(sp);
5031 }
5032 }
5033 }
5034
5035 static boolean_t
5036 slab_inrange(mcl_slab_t *sp, void *buf)
5037 {
5038 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
5039 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
5040 }
5041
5042 #undef panic
5043
5044 static void
5045 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
5046 {
5047 int i;
5048 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
5049 uintptr_t buf = (uintptr_t)sp->sl_base;
5050
5051 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
5052 void *next = ((mcache_obj_t *)buf)->obj_next;
5053 if (next != addr)
5054 continue;
5055 if (mclaudit == NULL) {
5056 if (next != NULL && !MBUF_IN_MAP(next)) {
5057 mcache_t *cp = m_cache(sp->sl_class);
5058 panic("%s: %s buffer %p in slab %p modified "
5059 "after free at offset 0: %p out of range "
5060 "[%p-%p)\n", __func__, cp->mc_name,
5061 (void *)buf, sp, next, mbutl, embutl);
5062 /* NOTREACHED */
5063 }
5064 } else {
5065 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
5066 (mcache_obj_t *)buf);
5067 mcl_audit_verify_nextptr(next, mca);
5068 }
5069 }
5070 }
5071
5072 static void
5073 slab_detach(mcl_slab_t *sp)
5074 {
5075 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
5076 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
5077 sp->sl_flags |= SLF_DETACHED;
5078 }
5079
5080 static boolean_t
5081 slab_is_detached(mcl_slab_t *sp)
5082 {
5083 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
5084 (intptr_t)sp->sl_link.tqe_prev == -1 &&
5085 (sp->sl_flags & SLF_DETACHED));
5086 }
5087
5088 static void
5089 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
5090 mcache_obj_t **con_list, size_t con_size, unsigned int num)
5091 {
5092 mcache_audit_t *mca, *mca_tail;
5093 mcache_obj_t *con = NULL;
5094 boolean_t save_contents = (con_list != NULL);
5095 unsigned int i, ix;
5096
5097 ASSERT(num <= NMBPCL);
5098 ASSERT(con_list == NULL || con_size != 0);
5099
5100 ix = MTOCL(buf);
5101 /* Make sure we haven't been here before */
5102 for (i = 0; i < NMBPCL; i++)
5103 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
5104
5105 mca = mca_tail = *mca_list;
5106 if (save_contents)
5107 con = *con_list;
5108
5109 for (i = 0; i < num; i++) {
5110 mcache_audit_t *next;
5111
5112 next = mca->mca_next;
5113 bzero(mca, sizeof (*mca));
5114 mca->mca_next = next;
5115 mclaudit[ix].cl_audit[i] = mca;
5116
5117 /* Attach the contents buffer if requested */
5118 if (save_contents) {
5119 VERIFY(con != NULL);
5120 mca->mca_contents_size = con_size;
5121 mca->mca_contents = con;
5122 con = con->obj_next;
5123 bzero(mca->mca_contents, mca->mca_contents_size);
5124 }
5125
5126 mca_tail = mca;
5127 mca = mca->mca_next;
5128 }
5129
5130 if (save_contents)
5131 *con_list = con;
5132
5133 *mca_list = mca_tail->mca_next;
5134 mca_tail->mca_next = NULL;
5135 }
5136
5137 /*
5138 * Given an address of a buffer (mbuf/cluster/big cluster), return
5139 * the corresponding audit structure for that buffer.
5140 */
5141 static mcache_audit_t *
5142 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
5143 {
5144 mcache_audit_t *mca = NULL;
5145 int ix = MTOCL(o);
5146
5147 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
5148
5149 switch (class) {
5150 case MC_MBUF:
5151 /*
5152 * For the mbuf case, find the index of the cluster
5153 * used by the mbuf and use that index to locate the
5154 * base address of the cluster. Then find out the
5155 * mbuf index relative to the cluster base and use
5156 * it to locate the audit structure.
5157 */
5158 VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
5159 mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
5160 break;
5161
5162 case MC_CL:
5163 case MC_BIGCL:
5164 case MC_16KCL:
5165 /*
5166 * Same as above, but only return the first element.
5167 */
5168 mca = mclaudit[ix].cl_audit[0];
5169 break;
5170
5171 default:
5172 VERIFY(0);
5173 /* NOTREACHED */
5174 }
5175
5176 return (mca);
5177 }
5178
5179 static void
5180 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
5181 boolean_t alloc)
5182 {
5183 struct mbuf *m = addr;
5184 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
5185
5186 VERIFY(mca->mca_contents != NULL &&
5187 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
5188
5189 mcl_audit_verify_nextptr(next, mca);
5190
5191 if (!alloc) {
5192 /* Save constructed mbuf fields */
5193 mcl_audit_save_mbuf(m, mca);
5194 mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
5195 ((mcache_obj_t *)m)->obj_next = next;
5196 return;
5197 }
5198
5199 /* Check if the buffer has been corrupted while in freelist */
5200 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
5201
5202 /* Restore constructed mbuf fields */
5203 mcl_audit_restore_mbuf(m, mca, composite);
5204 }
5205
5206 static void
5207 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
5208 {
5209 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
5210
5211 if (composite) {
5212 struct mbuf *next = m->m_next;
5213 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
5214 MBUF_IS_COMPOSITE(ms));
5215 /*
5216 * We could have hand-picked the mbuf fields and restored
5217 * them individually, but that would be a maintenance
5218 * headache. Instead, restore everything that was saved;
5219 * the mbuf layer will recheck and reinitialize anyway.
5220 */
5221 bcopy(ms, m, mca->mca_contents_size);
5222 m->m_next = next;
5223 } else {
5224 /*
5225 * For a regular mbuf (no cluster attached) there's nothing
5226 * to restore other than the type field, which is expected
5227 * to be MT_FREE.
5228 */
5229 m->m_type = ms->m_type;
5230 }
5231 _MCHECK(m);
5232 }
5233
5234 static void
5235 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
5236 {
5237 _MCHECK(m);
5238 bcopy(m, mca->mca_contents, mca->mca_contents_size);
5239 }
5240
5241 static void
5242 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
5243 boolean_t save_next)
5244 {
5245 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
5246
5247 if (!alloc) {
5248 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
5249 if (save_next) {
5250 mcl_audit_verify_nextptr(next, mca);
5251 ((mcache_obj_t *)addr)->obj_next = next;
5252 }
5253 } else {
5254 /* Check if the buffer has been corrupted while in freelist */
5255 mcl_audit_verify_nextptr(next, mca);
5256 mcache_audit_free_verify_set(mca, addr, 0, size);
5257 }
5258 }
5259
5260 static void
5261 mcl_audit_mcheck_panic(struct mbuf *m)
5262 {
5263 mcache_audit_t *mca;
5264
5265 MRANGE(m);
5266 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
5267
5268 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
5269 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
5270 /* NOTREACHED */
5271 }
5272
5273 static void
5274 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
5275 {
5276 if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
5277 !MBUF_IN_MAP(next)) {
5278 panic("mcl_audit: buffer %p modified after free at offset 0: "
5279 "%p out of range [%p-%p)\n%s\n",
5280 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
5281 /* NOTREACHED */
5282 }
5283 }
5284
5285 SYSCTL_DECL(_kern_ipc);
5286 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
5287 0, 0, mbstat_sysctl, "S,mbstat", "");
5288 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
5289 0, 0, mb_stat_sysctl, "S,mb_stat", "");
5290 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
5291 &mb_normalized, 0, "");