Commit | Line | Data |
---|---|---|
1c79356b | 1 | /* |
2d21ac55 | 2 | * Copyright (c) 2000-2007 Apple Inc. All rights reserved. |
5d5c5d0d | 3 | * |
2d21ac55 A |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
8f6c56a5 A |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
2d21ac55 A |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
1c79356b A |
27 | */ |
28 | /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ | |
29 | /* | |
30 | * Copyright (c) 1982, 1986, 1988, 1991, 1993 | |
31 | * The Regents of the University of California. All rights reserved. | |
32 | * | |
33 | * Redistribution and use in source and binary forms, with or without | |
34 | * modification, are permitted provided that the following conditions | |
35 | * are met: | |
36 | * 1. Redistributions of source code must retain the above copyright | |
37 | * notice, this list of conditions and the following disclaimer. | |
38 | * 2. Redistributions in binary form must reproduce the above copyright | |
39 | * notice, this list of conditions and the following disclaimer in the | |
40 | * documentation and/or other materials provided with the distribution. | |
41 | * 3. All advertising materials mentioning features or use of this software | |
42 | * must display the following acknowledgement: | |
43 | * This product includes software developed by the University of | |
44 | * California, Berkeley and its contributors. | |
45 | * 4. Neither the name of the University nor the names of its contributors | |
46 | * may be used to endorse or promote products derived from this software | |
47 | * without specific prior written permission. | |
48 | * | |
49 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
50 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
51 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
52 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
53 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
54 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
55 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
56 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
57 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
58 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
59 | * SUCH DAMAGE. | |
60 | * | |
61 | * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 | |
62 | */ | |
2d21ac55 A |
63 | /* |
64 | * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce | |
65 | * support for mandatory and extensible security protections. This notice | |
66 | * is included in support of clause 2.2 (b) of the Apple Public License, | |
67 | * Version 2.0. | |
1c79356b A |
68 | */ |
69 | ||
70 | #include <sys/param.h> | |
71 | #include <sys/systm.h> | |
72 | #include <sys/malloc.h> | |
73 | #include <sys/mbuf.h> | |
74 | #include <sys/kernel.h> | |
91447636 | 75 | #include <sys/sysctl.h> |
1c79356b A |
76 | #include <sys/syslog.h> |
77 | #include <sys/protosw.h> | |
78 | #include <sys/domain.h> | |
2d21ac55 | 79 | #include <sys/queue.h> |
1c79356b | 80 | |
9bccf70c | 81 | #include <kern/kern_types.h> |
2d21ac55 A |
82 | #include <kern/simple_lock.h> |
83 | #include <kern/queue.h> | |
9bccf70c | 84 | #include <kern/sched_prim.h> |
2d21ac55 A |
85 | #include <kern/cpu_number.h> |
86 | ||
87 | #include <libkern/OSAtomic.h> | |
88 | #include <libkern/libkern.h> | |
9bccf70c | 89 | |
55e303ae A |
90 | #include <IOKit/IOMapper.h> |
91 | ||
2d21ac55 A |
92 | #include <machine/limits.h> |
93 | #include <machine/machine_routines.h> | |
55e303ae | 94 | |
2d21ac55 A |
95 | #if CONFIG_MACF_NET |
96 | #include <security/mac_framework.h> | |
97 | #endif /* CONFIG_MACF_NET */ | |
98 | ||
99 | #include <sys/mcache.h> | |
1c79356b | 100 | |
2d21ac55 A |
101 | /* |
102 | * MBUF IMPLEMENTATION NOTES. | |
103 | * | |
104 | * There is a total of 5 per-CPU caches: | |
105 | * | |
106 | * MC_MBUF: | |
107 | * This is a cache of rudimentary objects of MSIZE in size; each | |
108 | * object represents an mbuf structure. This cache preserves only | |
109 | * the m_type field of the mbuf during its transactions. | |
110 | * | |
111 | * MC_CL: | |
112 | * This is a cache of rudimentary objects of MCLBYTES in size; each | |
113 | * object represents an mcluster structure. This cache does not | |
114 | * preserve the contents of the objects during its transactions. | |
115 | * | |
116 | * MC_BIGCL: | |
117 | * This is a cache of rudimentary objects of NBPG in size; each | |
118 | * object represents an mbigcluster structure. This cache does not | |
119 | * preserve the contents of the objects during its transactions. | |
120 | * | |
121 | * MC_MBUF_CL: | |
122 | * This is a cache of mbufs each having a cluster attached to it. | |
123 | * It is backed by MC_MBUF and MC_CL rudimentary caches. Several | |
124 | * fields of the mbuf related to the external cluster are preserved | |
125 | * during transactions. | |
126 | * | |
127 | * MC_MBUF_BIGCL: | |
128 | * This is a cache of mbufs each having a big cluster attached to it. | |
129 | * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several | |
130 | * fields of the mbuf related to the external cluster are preserved | |
131 | * during transactions. | |
132 | * | |
133 | * OBJECT ALLOCATION: | |
134 | * | |
135 | * Allocation requests are handled first at the per-CPU (mcache) layer | |
136 | * before falling back to the slab layer. Performance is optimal when | |
137 | * the request is satisfied at the CPU layer because global data/lock | |
138 | * never gets accessed. When the slab layer is entered for allocation, | |
139 | * the slab freelist will be checked first for available objects before | |
140 | * the VM backing store is invoked. Slab layer operations are serialized | |
141 | * for all of the caches as the mbuf global lock is held most of the time. | |
142 | * Allocation paths are different depending on the class of objects: | |
143 | * | |
144 | * a. Rudimentary object: | |
145 | * | |
146 | * { m_get_common(), m_clattach(), m_mclget(), | |
147 | * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(), | |
148 | * composite object allocation } | |
149 | * | ^ | |
150 | * | | | |
151 | * | +-----------------------+ | |
152 | * v | | |
153 | * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit() | |
154 | * | ^ | |
155 | * v | | |
156 | * [CPU cache] -------> (found?) -------+ | |
157 | * | | | |
158 | * v | | |
159 | * mbuf_slab_alloc() | | |
160 | * | | | |
161 | * v | | |
162 | * +---------> [freelist] -------> (found?) -------+ | |
163 | * | | | |
164 | * | v | |
165 | * | m_clalloc() | |
166 | * | | | |
167 | * | v | |
168 | * +---<<---- kmem_mb_alloc() | |
169 | * | |
170 | * b. Composite object: | |
171 | * | |
172 | * { m_getpackets_internal(), m_allocpacket_internal() } | |
173 | * | ^ | |
174 | * | | | |
175 | * | +------ (done) ---------+ | |
176 | * v | | |
177 | * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit() | |
178 | * | ^ | |
179 | * v | | |
180 | * [CPU cache] -------> (found?) -------+ | |
181 | * | | | |
182 | * v | | |
183 | * mbuf_cslab_alloc() | | |
184 | * | | | |
185 | * v | | |
186 | * [freelist] -------> (found?) -------+ | |
187 | * | | | |
188 | * v | | |
189 | * (rudimentary object) | | |
190 | * mcache_alloc/mcache_alloc_ext() ------>>-----+ | |
191 | * | |
192 | * Auditing notes: If auditing is enabled, buffers will be subjected to | |
193 | * integrity checks by the audit routine. This is done by verifying their | |
194 | * contents against the DEADBEEF (free) pattern before returning them to the | |
195 | * caller. As part of this step, the routine also records the transaction and | |
196 | * pattern-fills the buffers with the BADDCAFE (uninitialized) pattern. It | |
197 | * also restores any constructed data structure fields if necessary. | |
198 | * | |
199 | * OBJECT DEALLOCATION: | |
200 | * | |
201 | * Freeing an object simply involves placing it into the CPU cache; this | |
202 | * pollutes the cache to benefit subsequent allocations. The slab layer | |
203 | * will only be entered if the object is to be purged out of the cache. | |
204 | * During normal operations, this happens only when the CPU layer resizes | |
205 | * its bucket while it's adjusting to the allocation load. Deallocation | |
206 | * paths are different depending on the class of objects: | |
207 | * | |
208 | * a. Rudimentary object: | |
209 | * | |
210 | * { m_free(), m_freem_list(), composite object deallocation } | |
211 | * | ^ | |
212 | * | | | |
213 | * | +------ (done) ---------+ | |
214 | * v | | |
215 | * mcache_free/mcache_free_ext() | | |
216 | * | | | |
217 | * v | | |
218 | * mbuf_slab_audit() | | |
219 | * | | | |
220 | * v | | |
221 | * [CPU cache] ---> (not purging?) -----+ | |
222 | * | | | |
223 | * v | | |
224 | * mbuf_slab_free() | | |
225 | * | | | |
226 | * v | | |
227 | * [freelist] ----------->>------------+ | |
228 | * (objects never get purged to VM) | |
229 | * | |
230 | * b. Composite object: | |
231 | * | |
232 | * { m_free(), m_freem_list() } | |
233 | * | ^ | |
234 | * | | | |
235 | * | +------ (done) ---------+ | |
236 | * v | | |
237 | * mcache_free/mcache_free_ext() | | |
238 | * | | | |
239 | * v | | |
240 | * mbuf_cslab_audit() | | |
241 | * | | | |
242 | * v | | |
243 | * [CPU cache] ---> (not purging?) -----+ | |
244 | * | | | |
245 | * v | | |
246 | * mbuf_cslab_free() | | |
247 | * | | | |
248 | * v | | |
249 | * [freelist] ---> (not purging?) -----+ | |
250 | * | | | |
251 | * v | | |
252 | * (rudimentary object) | | |
253 | * mcache_free/mcache_free_ext() ------->>------+ | |
254 | * | |
255 | * Auditing notes: If auditing is enabled, the audit routine will save | |
256 | * any constructed data structure fields (if necessary) before filling the | |
257 | * contents of the buffers with DEADBEEF (free) pattern and recording the | |
258 | * transaction. Buffers that are freed (whether at CPU or slab layer) are | |
259 | * expected to contain the free pattern. | |
260 | * | |
261 | * DEBUGGING: | |
262 | * | |
263 | * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this | |
264 | * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally, | |
265 | * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag, | |
266 | * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Note | |
267 | * that debugging consumes more CPU and memory. | |
268 | * | |
269 | * Each object is associated with exactly one mcache_audit_t structure that | |
270 | * contains the information related to its last buffer transaction. Given | |
271 | * an address of an object, the audit structure can be retrieved by finding | |
272 | * the position of the object relative to the base address of the cluster: | |
273 | * | |
274 | * +------------+ +=============+ | |
275 | * | mbuf addr | | mclaudit[i] | | |
276 | * +------------+ +=============+ | |
277 | * | | cl_audit[0] | | |
278 | * i = MTOCL(addr) +-------------+ | |
279 | * | +-----> | cl_audit[1] | -----> mcache_audit_t | |
280 | * b = CLTOM(i) | +-------------+ | |
281 | * | | | ... | | |
282 | * x = MCLIDX(b, addr) | +-------------+ | |
283 | * | | | cl_audit[7] | | |
284 | * +-----------------+ +-------------+ | |
285 | * (e.g. x == 1) | |
286 | * | |
287 | * The mclaudit[] array is allocated at initialization time, but its contents | |
288 | * get populated when the corresponding cluster is created. Because a cluster | |
289 | * can be turned into NMBPCL mbufs, we reserve enough space for the | |
290 | * mbufs so that there is a 1-to-1 mapping between them. A cluster that never | |
291 | * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the | |
292 | * remaining entries unused. For big clusters, only one entry is allocated | |
293 | * and used for the entire cluster pair. | |
294 | */ | |
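
The auditing scheme described above (DEADBEEF on free, BADDCAFE on allocation) is easy to prototype in isolation. The following is a minimal user-space sketch of the pattern fill/verify step only, not the kernel's mcache audit code; the function names `pattern_fill`/`pattern_verify` and the 2K buffer size are illustrative assumptions.

```c
/* User-space sketch of the free/alloc pattern check described above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FREE_PATTERN	0xdeadbeefU	/* "freed" fill */
#define ALLOC_PATTERN	0xbaddcafeU	/* "uninitialized" fill */

static void
pattern_fill(void *buf, size_t size, uint32_t pat)
{
	uint32_t *p = buf;
	size_t n = size / sizeof (uint32_t);

	while (n-- > 0)
		*p++ = pat;
}

/* Returns the first corrupted word, or NULL if the pattern is intact. */
static void *
pattern_verify(void *buf, size_t size, uint32_t pat)
{
	uint32_t *p = buf;
	size_t n = size / sizeof (uint32_t);

	for (; n > 0; n--, p++) {
		if (*p != pat)
			return (p);
	}
	return (NULL);
}

int
main(void)
{
	size_t size = 2048;		/* stand-in for a 2K cluster */
	void *buf = malloc(size);

	pattern_fill(buf, size, FREE_PATTERN);		/* buffer "freed" */
	assert(pattern_verify(buf, size, FREE_PATTERN) == NULL);
	pattern_fill(buf, size, ALLOC_PATTERN);		/* handed back out */
	printf("audit pattern check passed\n");
	free(buf);
	return (0);
}
```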
91447636 | 295 | |
2d21ac55 A |
296 | /* TODO: should be in header file */ |
297 | /* kernel translator */ | |
298 | extern vm_offset_t kmem_mb_alloc(vm_map_t, int); | |
299 | extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); | |
1c79356b | 300 | extern vm_map_t mb_map; /* special map */ |
2d21ac55 A |
301 | |
302 | /* Global lock */ | |
303 | static lck_mtx_t *mbuf_mlock; | |
304 | static lck_attr_t *mbuf_mlock_attr; | |
305 | static lck_grp_t *mbuf_mlock_grp; | |
306 | static lck_grp_attr_t *mbuf_mlock_grp_attr; | |
307 | ||
308 | /* Back-end (common) layer */ | |
309 | static void *mbuf_worker_run; /* wait channel for worker thread */ | |
310 | static int mbuf_worker_ready; /* worker thread is runnable */ | |
311 | static int mbuf_expand_mcl; /* number of cluster creation requests */ | |
312 | static int mbuf_expand_big; /* number of big cluster creation requests */ | |
313 | static int mbuf_expand_16k; /* number of 16K cluster creation requests */ | |
314 | static int ncpu; /* number of CPUs */ | |
315 | static int *mcl_paddr; /* Array of cluster physical addresses */ | |
55e303ae | 316 | static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */ |
2d21ac55 A |
317 | static mcache_t *ref_cache; /* Cache of cluster reference & flags */ |
318 | static mcache_t *mcl_audit_con_cache; /* Audit contents cache */ | |
319 | static unsigned int mbuf_debug; /* patchable mbuf mcache flags */ | |
320 | static unsigned int mb_normalized; /* number of packets "normalized" */ | |
321 | ||
322 | typedef enum { | |
323 | MC_MBUF = 0, /* Regular mbuf */ | |
324 | MC_CL, /* Cluster */ | |
325 | MC_BIGCL, /* Large (4K) cluster */ | |
326 | MC_16KCL, /* Jumbo (16K) cluster */ | |
327 | MC_MBUF_CL, /* mbuf + cluster */ | |
328 | MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */ | |
329 | MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */ | |
330 | } mbuf_class_t; | |
331 | ||
332 | #define MBUF_CLASS_MIN MC_MBUF | |
333 | #define MBUF_CLASS_MAX MC_MBUF_16KCL | |
334 | #define MBUF_CLASS_LAST MC_16KCL | |
335 | #define MBUF_CLASS_VALID(c) \ | |
336 | ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX) | |
337 | #define MBUF_CLASS_COMPOSITE(c) \ | |
338 | ((int)(c) > MBUF_CLASS_LAST) | |
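
The range macros above partition the class enum into rudimentary classes (MC_MBUF through MC_16KCL) and composite classes (everything after MBUF_CLASS_LAST). A tiny user-space sketch, with the enum values copied from above and the printing purely illustrative, makes the split visible:

```c
#include <stdio.h>

typedef enum {
	MC_MBUF = 0, MC_CL, MC_BIGCL, MC_16KCL,
	MC_MBUF_CL, MC_MBUF_BIGCL, MC_MBUF_16KCL
} mbuf_class_t;

#define MBUF_CLASS_MIN		MC_MBUF
#define MBUF_CLASS_MAX		MC_MBUF_16KCL
#define MBUF_CLASS_LAST		MC_16KCL
#define MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)

int
main(void)
{
	int c;

	/* Classes 0-3 are rudimentary; 4-6 report composite=1. */
	for (c = MC_MBUF; c <= MC_MBUF_16KCL; c++)
		printf("class %d: valid=%d composite=%d\n", c,
		    MBUF_CLASS_VALID(c), MBUF_CLASS_COMPOSITE(c));
	return (0);
}
```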
91447636 | 339 | |
9bccf70c | 340 | |
2d21ac55 A |
341 | /* |
342 | * mbuf specific mcache allocation request flags. | |
343 | */ | |
344 | #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */ | |
9bccf70c | 345 | |
2d21ac55 A |
346 | /* |
347 | * Per-cluster slab structure. | |
348 | * | |
349 | * A slab is a cluster control structure that contains one or more object | |
350 | * chunks; the available chunks are chained in the slab's freelist (sl_head). | |
351 | * Each time a chunk is taken out of the slab, the slab's reference count | |
352 | * gets incremented. When all chunks have been taken out, the empty slab | |
353 | * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is | |
354 | * returned to a slab causes the slab's reference count to be decremented; | |
355 | * it also causes the slab to be reinserted into the class's slab list, if | |
356 | * that has not already been done. | |
357 | * | |
358 | * Compartmentalizing the object chunks into slabs allows us to easily | |
359 | * merge one or more slabs together when the adjacent slabs are idle, as | |
360 | * well as to convert or move a slab from one class to another; e.g. the | |
361 | * mbuf cluster slab can be converted to a regular cluster slab when all | |
362 | * mbufs in the slab have been freed. | |
363 | * | |
364 | * A slab may also span across multiple clusters for chunks larger than | |
365 | * a cluster's size. In this case, only the slab of the first cluster is | |
366 | * used. The rest of the slabs are marked with SLF_PARTIAL to indicate | |
367 | * that they are part of the larger slab. | |
368 | */ | |
369 | typedef struct mcl_slab { | |
370 | struct mcl_slab *sl_next; /* neighboring slab */ | |
371 | u_int8_t sl_class; /* controlling mbuf class */ | |
372 | int8_t sl_refcnt; /* outstanding allocations */ | |
373 | int8_t sl_chunks; /* chunks (bufs) in this slab */ | |
374 | u_int16_t sl_flags; /* slab flags (see below) */ | |
375 | u_int16_t sl_len; /* slab length */ | |
376 | void *sl_base; /* base of allocated memory */ | |
377 | void *sl_head; /* first free buffer */ | |
378 | TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */ | |
379 | } mcl_slab_t; | |
380 | ||
381 | #define SLF_MAPPED 0x0001 /* backed by a mapped page */ | |
382 | #define SLF_PARTIAL 0x0002 /* part of another slab */ | |
383 | #define SLF_DETACHED 0x0004 /* not in slab freelist */ | |
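
The reference-count and freelist behaviour described in the comment above can be modelled compactly in user space. The sketch below is an illustration under simplified assumptions (a single slab, chunks chained through their first pointer-sized word, no class list or SLF flags); the `toy_slab` type and `slab_take`/`slab_return` helpers are made up for the example and are not the kernel's slab code.

```c
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define NCHUNKS	8
#define CHUNKSZ	256

struct toy_slab {
	void	*sl_head;	/* first free chunk */
	int	 sl_refcnt;	/* chunks currently taken out */
	char	 sl_base[NCHUNKS * CHUNKSZ];
};

static void
slab_setup(struct toy_slab *sp)
{
	int i;

	sp->sl_refcnt = 0;
	sp->sl_head = NULL;
	/* Chain the chunks through their first word, like sl_head above. */
	for (i = NCHUNKS - 1; i >= 0; i--) {
		void *chunk = sp->sl_base + (i * CHUNKSZ);
		*(void **)chunk = sp->sl_head;
		sp->sl_head = chunk;
	}
}

static void *
slab_take(struct toy_slab *sp)
{
	void *chunk = sp->sl_head;

	if (chunk == NULL)
		return (NULL);		/* empty slab: would be detached */
	sp->sl_head = *(void **)chunk;
	sp->sl_refcnt++;		/* one more outstanding allocation */
	return (chunk);
}

static void
slab_return(struct toy_slab *sp, void *chunk)
{
	*(void **)chunk = sp->sl_head;
	sp->sl_head = chunk;
	sp->sl_refcnt--;		/* 0 again means the slab is idle */
}

int
main(void)
{
	static struct toy_slab s;
	void *a, *b;

	slab_setup(&s);
	a = slab_take(&s);
	b = slab_take(&s);
	assert(s.sl_refcnt == 2);
	slab_return(&s, b);
	slab_return(&s, a);
	assert(s.sl_refcnt == 0 && s.sl_head == a);
	printf("toy slab ok\n");
	return (0);
}
```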
1c79356b | 384 | |
2d21ac55 A |
385 | /* |
386 | * The array of slabs is broken into groups of arrays per 1MB of kernel | |
387 | * memory to reduce the footprint. Each group is allocated on demand | |
388 | * whenever a new piece of memory mapped in from the VM crosses the 1MB | |
389 | * boundary. | |
390 | */ | |
391 | #define MBSHIFT 20 /* 1MB */ | |
392 | #define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */ | |
91447636 | 393 | |
2d21ac55 A |
394 | typedef struct mcl_slabg { |
395 | mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */ | |
396 | } mcl_slabg_t; | |
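
To make the two-level layout concrete: assuming MCLSHIFT is 11 (2KB clusters), NSLABSPMB works out to 512 slabs per group, and a cluster index splits into a group index plus an offset within that group. The sketch below is a user-space illustration of that arithmetic only (a simplified stand-in for the slab_get() lookup declared further down); the constants and loop values are assumptions for the example.

```c
#include <stdio.h>

#define MCLSHIFT	11			/* assumed: 2KB clusters */
#define MBSHIFT		20			/* 1MB */
#define NSLABSPMB	((1 << MBSHIFT) >> MCLSHIFT)	/* 512 slabs/group */

int
main(void)
{
	unsigned int clidx;

	/* A few sample cluster indices and where they land. */
	for (clidx = 0; clidx < 1300; clidx += 600) {
		unsigned int grp = clidx / NSLABSPMB;	/* which mcl_slabg_t */
		unsigned int slot = clidx % NSLABSPMB;	/* slab within group */
		printf("cluster %4u -> slabstbl[%u]->slg_slab[%u]\n",
		    clidx, grp, slot);
	}
	return (0);
}
```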
1c79356b | 397 | |
2d21ac55 A |
398 | /* |
399 | * Per-cluster audit structure. | |
400 | */ | |
401 | typedef struct { | |
402 | mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */ | |
403 | } mcl_audit_t; | |
91447636 | 404 | |
2d21ac55 A |
405 | #if CONFIG_MBUF_NOEXPAND |
406 | static unsigned int maxmbufcl; | |
407 | #endif /* CONFIG_MBUF_NOEXPAND */ | |
91447636 | 408 | |
2d21ac55 A |
409 | /* |
410 | * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr | |
411 | * and m_ext structures. If auditing is enabled, we allocate a shadow | |
412 | * mbuf structure of this size inside each audit structure, and the | |
413 | * contents of the real mbuf gets copied into it when the mbuf is freed. | |
414 | * This allows us to pattern-fill the mbuf for integrity check, and to | |
415 | * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case). | |
416 | * Note that we don't save the contents of clusters when they are freed; | |
417 | * we simply pattern-fill them. | |
418 | */ | |
419 | #if defined(__LP64__) | |
420 | #define AUDIT_CONTENTS_SIZE 160 | |
421 | #else | |
422 | #define AUDIT_CONTENTS_SIZE 80 | |
423 | #endif /* __LP64__ */ | |
fa4905b1 | 424 | |
2d21ac55 A |
425 | /* |
426 | * mbuf specific mcache audit flags | |
427 | */ | |
428 | #define MB_INUSE 0x01 /* object has not been returned to slab */ | |
429 | #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */ | |
430 | #define MB_SCVALID 0x04 /* object has valid saved contents */ | |
fa4905b1 | 431 | |
2d21ac55 A |
432 | /* |
433 | * Each of the following two arrays hold up to nmbclusters elements. | |
434 | */ | |
435 | static mcl_audit_t *mclaudit; /* array of cluster audit information */ | |
436 | static mcl_slabg_t **slabstbl; /* cluster slabs table */ | |
437 | static unsigned int maxslabgrp; /* max # of entries in slabs table */ | |
438 | static unsigned int slabgrp; /* # of entries in slabs table */ | |
439 | ||
440 | /* Globals */ | |
441 | int nclusters; /* # of clusters for non-jumbo (legacy) sizes */ | |
442 | int njcl; /* # of clusters for jumbo sizes */ | |
443 | int njclbytes; /* size of a jumbo cluster */ | |
444 | union mcluster *mbutl; /* first mapped cluster address */ | |
445 | union mcluster *embutl; /* ending virtual address of mclusters */ | |
446 | int max_linkhdr; /* largest link-level header */ | |
447 | int max_protohdr; /* largest protocol header */ | |
448 | int max_hdr; /* largest link+protocol header */ | |
449 | int max_datalen; /* MHLEN - max_hdr */ | |
450 | ||
451 | /* TODO: should be in header file */ | |
452 | int do_reclaim = 0; | |
1c79356b | 453 | |
2d21ac55 A |
454 | /* The minimum number of objects that are allocated, to start. */ |
455 | #define MINCL 32 | |
456 | #define MINBIGCL (MINCL >> 1) | |
457 | #define MIN16KCL (MINCL >> 2) | |
458 | ||
459 | /* Low watermarks (only map in pages once free counts drop below these) */ | |
460 | #define MCL_LOWAT MINCL | |
461 | #define MBIGCL_LOWAT MINBIGCL | |
462 | #define M16KCL_LOWAT MIN16KCL | |
463 | ||
464 | typedef struct { | |
465 | mbuf_class_t mtbl_class; /* class type */ | |
466 | mcache_t *mtbl_cache; /* mcache for this buffer class */ | |
467 | TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */ | |
468 | mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */ | |
469 | mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */ | |
470 | u_int32_t mtbl_maxsize; /* maximum buffer size */ | |
471 | int mtbl_minlimit; /* minimum allowed */ | |
472 | int mtbl_maxlimit; /* maximum allowed */ | |
473 | u_int32_t mtbl_wantpurge; /* purge during next reclaim */ | |
474 | } mbuf_table_t; | |
475 | ||
476 | #define m_class(c) mbuf_table[c].mtbl_class | |
477 | #define m_cache(c) mbuf_table[c].mtbl_cache | |
478 | #define m_slablist(c) mbuf_table[c].mtbl_slablist | |
479 | #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist | |
480 | #define m_maxsize(c) mbuf_table[c].mtbl_maxsize | |
481 | #define m_minlimit(c) mbuf_table[c].mtbl_minlimit | |
482 | #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit | |
483 | #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge | |
484 | #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname | |
485 | #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size | |
486 | #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total | |
487 | #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active | |
488 | #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree | |
489 | #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt | |
490 | #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt | |
491 | #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt | |
492 | #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified | |
493 | #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt | |
494 | #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt | |
495 | #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal | |
496 | ||
497 | static mbuf_table_t mbuf_table[] = { | |
498 | /* | |
499 | * The caches for mbufs, regular clusters and big clusters. | |
500 | */ | |
501 | { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)), | |
502 | NULL, NULL, 0, 0, 0, 0 }, | |
503 | { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)), | |
504 | NULL, NULL, 0, 0, 0, 0 }, | |
505 | { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)), | |
506 | NULL, NULL, 0, 0, 0, 0 }, | |
507 | { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)), | |
508 | NULL, NULL, 0, 0, 0, 0 }, | |
509 | /* | |
510 | * The following are special caches; they serve as intermediate | |
511 | * caches backed by the above rudimentary caches. Each object | |
512 | * in the cache is an mbuf with a cluster attached to it. Unlike | |
513 | * the above caches, these intermediate caches do not directly | |
514 | * deal with the slab structures; instead, the constructed | |
515 | * cached elements are simply stored in the freelists. | |
516 | */ | |
517 | { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 }, | |
518 | { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 }, | |
519 | { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 }, | |
520 | }; | |
521 | ||
522 | #define NELEM(a) (sizeof (a) / sizeof ((a)[0])) | |
523 | ||
524 | static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */ | |
525 | static int mb_waiters; /* number of sleepers */ | |
526 | ||
527 | /* The following are used to serialize m_clalloc() */ | |
528 | static boolean_t mb_clalloc_busy; | |
529 | static void *mb_clalloc_waitchan = &mb_clalloc_busy; | |
530 | static int mb_clalloc_waiters; | |
531 | ||
532 | static int mbstat_sysctl SYSCTL_HANDLER_ARGS; | |
533 | static int mb_stat_sysctl SYSCTL_HANDLER_ARGS; | |
534 | static void mbuf_table_init(void); | |
535 | static inline void m_incref(struct mbuf *); | |
536 | static inline u_int32_t m_decref(struct mbuf *); | |
537 | static int m_clalloc(const u_int32_t, const int, const u_int32_t); | |
538 | static void mbuf_worker_thread_init(void); | |
539 | static mcache_obj_t *slab_alloc(mbuf_class_t, int); | |
540 | static void slab_free(mbuf_class_t, mcache_obj_t *); | |
541 | static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***, | |
542 | unsigned int, int); | |
543 | static void mbuf_slab_free(void *, mcache_obj_t *, int); | |
544 | static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t); | |
545 | static void mbuf_slab_notify(void *, u_int32_t); | |
546 | static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***, | |
547 | unsigned int); | |
548 | static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int); | |
549 | static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***, | |
550 | unsigned int, int); | |
551 | static void mbuf_cslab_free(void *, mcache_obj_t *, int); | |
552 | static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t); | |
553 | static int freelist_populate(mbuf_class_t, unsigned int, int); | |
554 | static boolean_t mbuf_cached_above(mbuf_class_t, int); | |
555 | static boolean_t mbuf_steal(mbuf_class_t, unsigned int); | |
556 | static void m_reclaim(mbuf_class_t, unsigned int, boolean_t); | |
557 | static int m_howmany(int, size_t); | |
558 | static void mbuf_worker_thread(void); | |
559 | static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int); | |
560 | ||
561 | static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **, | |
562 | size_t, unsigned int); | |
563 | static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *); | |
564 | static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t); | |
565 | static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t, | |
566 | boolean_t); | |
567 | static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t); | |
568 | static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *); | |
569 | static void mcl_audit_mcheck_panic(struct mbuf *); | |
570 | static void mcl_audit_verify_nextptr(void *, mcache_audit_t *); | |
571 | ||
572 | static mcl_slab_t *slab_get(void *); | |
573 | static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t, | |
574 | void *, void *, unsigned int, int, int); | |
575 | static void slab_insert(mcl_slab_t *, mbuf_class_t); | |
576 | static void slab_remove(mcl_slab_t *, mbuf_class_t); | |
577 | static boolean_t slab_inrange(mcl_slab_t *, void *); | |
578 | static void slab_nextptr_panic(mcl_slab_t *, void *); | |
579 | static void slab_detach(mcl_slab_t *); | |
580 | static boolean_t slab_is_detached(mcl_slab_t *); | |
581 | ||
582 | /* | |
583 | * This flag is set for all mbufs that come out of and into the composite | |
584 | * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that | |
585 | * are marked with such a flag have clusters attached to them, and will be | |
586 | * treated differently when they are freed; instead of being placed back | |
587 | * into the mbuf and cluster freelists, the composite mbuf + cluster objects | |
588 | * are placed back into the appropriate composite cache's freelist, and the | |
589 | * actual freeing is deferred until the composite objects are purged. At | |
590 | * such a time, this flag will be cleared from the mbufs and the objects | |
591 | * will be freed into their own separate freelists. | |
592 | */ | |
593 | #define EXTF_COMPOSITE 0x1 | |
1c79356b | 594 | |
2d21ac55 A |
595 | #define MEXT_RFA(m) ((m)->m_ext.ext_refflags) |
596 | #define MEXT_REF(m) (MEXT_RFA(m)->refcnt) | |
597 | #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags) | |
598 | #define MBUF_IS_COMPOSITE(m) \ | |
599 | (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE)) | |
1c79356b | 600 | |
2d21ac55 A |
601 | /* |
602 | * Macros used to verify the integrity of the mbuf. | |
603 | */ | |
604 | #define _MCHECK(m) { \ | |
605 | if ((m)->m_type != MT_FREE) { \ | |
606 | if (mclaudit == NULL) \ | |
607 | panic("MCHECK: m_type=%d m=%p", \ | |
608 | (u_int16_t)(m)->m_type, m); \ | |
609 | else \ | |
610 | mcl_audit_mcheck_panic(m); \ | |
611 | } \ | |
612 | } | |
55e303ae | 613 | |
2d21ac55 A |
614 | #define MBUF_IN_MAP(addr) \ |
615 | ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl) | |
55e303ae | 616 | |
2d21ac55 A |
617 | #define MRANGE(addr) { \ |
618 | if (!MBUF_IN_MAP(addr)) \ | |
619 | panic("MRANGE: address out of range 0x%p", addr); \ | |
1c79356b A |
620 | } |
621 | ||
622 | /* | |
2d21ac55 | 623 | * Macro version of mtod. |
1c79356b | 624 | */ |
2d21ac55 | 625 | #define MTOD(m, t) ((t)((m)->m_data)) |
1c79356b | 626 | |
2d21ac55 A |
627 | /* |
628 | * Macros to obtain cluster index and base cluster address. | |
629 | */ | |
630 | #define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT) | |
631 | #define CLTOM(x) ((union mcluster *)(mbutl + (x))) | |
1c79356b | 632 | |
2d21ac55 A |
633 | /* |
634 | * Macro to find the mbuf index relative to the cluster base. | |
635 | */ | |
636 | #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8) | |
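
A small user-space example may help connect these macros to the mclaudit[]/cl_audit[] diagram near the top of the file. It assumes 2KB clusters (MCLSHIFT == 11) and 256-byte mbufs, and uses a local array as a stand-in for the mapped cluster range; it illustrates the index arithmetic only and is not kernel code.

```c
#include <stdio.h>

#define MCLSHIFT	11		/* assumed: 2KB clusters */
#define MCLBYTES	(1 << MCLSHIFT)
#define MSIZE		256		/* assumed mbuf size (1 << 8) */

static char map[4 * MCLBYTES];		/* stand-in for mbutl */

#define MTOCL(x)	(((char *)(x) - (char *)map) >> MCLSHIFT)
#define CLTOM(x)	((void *)(map + ((x) << MCLSHIFT)))
#define MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> 8)

int
main(void)
{
	/* Pretend this address is the 2nd mbuf carved out of cluster 1. */
	char *addr = map + (1 * MCLBYTES) + (1 * MSIZE);
	long i = MTOCL(addr);		/* cluster index:  1 */
	void *b = CLTOM(i);		/* cluster base address */
	long x = MCLIDX(b, addr);	/* mbuf slot index: 1 */

	printf("i=%ld x=%ld (audit entry mclaudit[%ld].cl_audit[%ld])\n",
	    i, x, i, x);
	return (0);
}
```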
91447636 | 637 | |
2d21ac55 A |
638 | /* |
639 | * Macros used during mbuf and cluster initialization. | |
640 | */ | |
641 | #define MBUF_INIT(m, pkthdr, type) { \ | |
642 | _MCHECK(m); \ | |
643 | (m)->m_next = (m)->m_nextpkt = NULL; \ | |
644 | (m)->m_len = 0; \ | |
645 | (m)->m_type = type; \ | |
646 | if ((pkthdr) == 0) { \ | |
647 | (m)->m_data = (m)->m_dat; \ | |
648 | (m)->m_flags = 0; \ | |
649 | } else { \ | |
650 | (m)->m_data = (m)->m_pktdat; \ | |
651 | (m)->m_flags = M_PKTHDR; \ | |
652 | (m)->m_pkthdr.rcvif = NULL; \ | |
653 | (m)->m_pkthdr.len = 0; \ | |
654 | (m)->m_pkthdr.header = NULL; \ | |
655 | (m)->m_pkthdr.csum_flags = 0; \ | |
656 | (m)->m_pkthdr.csum_data = 0; \ | |
657 | (m)->m_pkthdr.reserved0 = NULL; \ | |
658 | (m)->m_pkthdr.vlan_tag = 0; \ | |
659 | (m)->m_pkthdr.socket_id = 0; \ | |
660 | m_tag_init(m); \ | |
661 | } \ | |
662 | } | |
91447636 | 663 | |
2d21ac55 A |
664 | #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \ |
665 | (m)->m_data = (m)->m_ext.ext_buf = (buf); \ | |
666 | (m)->m_flags |= M_EXT; \ | |
667 | (m)->m_ext.ext_size = (size); \ | |
668 | (m)->m_ext.ext_free = (free); \ | |
669 | (m)->m_ext.ext_arg = (arg); \ | |
670 | (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \ | |
671 | &(m)->m_ext.ext_refs; \ | |
672 | MEXT_RFA(m) = (rfa); \ | |
673 | MEXT_REF(m) = (ref); \ | |
674 | MEXT_FLAGS(m) = (flag); \ | |
1c79356b A |
675 | } |
676 | ||
2d21ac55 A |
677 | #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \ |
678 | MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag) | |
679 | ||
680 | #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \ | |
681 | MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag) | |
682 | ||
683 | #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \ | |
684 | MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag) | |
685 | ||
1c79356b | 686 | /* |
2d21ac55 | 687 | * Macro to convert the BSD malloc sleep flag to its mcache equivalent |
1c79356b | 688 | */ |
2d21ac55 | 689 | #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP) |
1c79356b | 690 | |
2d21ac55 A |
691 | /* |
692 | * The structure that holds all mbuf class statistics exportable via sysctl. | |
693 | * Similar to mbstat structure, the mb_stat structure is protected by the | |
694 | * global mbuf lock. It contains additional information about the classes | |
695 | * that allows for a more accurate view of the state of the allocator. | |
696 | */ | |
697 | struct mb_stat *mb_stat; | |
1c79356b | 698 | |
2d21ac55 A |
699 | #define MB_STAT_SIZE(n) \ |
700 | ((size_t)(&((mb_stat_t *)0)->mbs_class[n])) | |
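
MB_STAT_SIZE() above is the classic null-pointer offset trick: it yields the number of bytes needed for an mb_stat_t whose trailing mbs_class[] array holds n entries. The sketch below reproduces the same idiom in user space with made-up stand-in types (`toy_stat_t`, `toy_class_stat_t`) and shows it agrees with the portable offsetof() spelling.

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in shapes; the real mb_stat_t/mb_class_stat_t live in a header. */
typedef struct { char name[16]; uint64_t total; } toy_class_stat_t;
typedef struct {
	unsigned int	  mbs_cnt;
	toy_class_stat_t  mbs_class[1];	/* variable-length in practice */
} toy_stat_t;

/* Same idiom as MB_STAT_SIZE(); offsetof() is the portable spelling. */
#define TOY_STAT_SIZE(n) \
	((size_t)(&((toy_stat_t *)0)->mbs_class[n]))

int
main(void)
{
	int n = 7;

	printf("pointer trick: %zu bytes\n", TOY_STAT_SIZE(n));
	printf("offsetof:      %zu bytes\n",
	    offsetof(toy_stat_t, mbs_class) + n * sizeof (toy_class_stat_t));
	return (0);
}
```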
1c79356b A |
701 | |
702 | /* | |
2d21ac55 A |
703 | * The legacy structure holding all of the mbuf allocation statistics. |
704 | * The actual statistics used by the kernel are stored in the mbuf_table | |
705 | * instead, and are updated atomically while the global mbuf lock is held. | |
706 | * They are mirrored in mbstat to support legacy applications (e.g. netstat). | |
707 | * Unlike before, the kernel no longer relies on the contents of mbstat for | |
708 | * its operations (e.g. cluster expansion) because the structure is exposed | |
709 | * to the outside and could possibly be modified, therefore making it unsafe. | |
710 | * With the exception of the mbstat.m_mtypes array (see below), all of the | |
711 | * statistics are updated as they change. | |
1c79356b | 712 | */ |
2d21ac55 | 713 | struct mbstat mbstat; |
1c79356b | 714 | |
2d21ac55 A |
715 | #define MBSTAT_MTYPES_MAX \ |
716 | (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0])) | |
1c79356b A |
717 | |
718 | /* | |
2d21ac55 A |
719 | * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated |
720 | * atomically and stored in a per-CPU structure which is lock-free; this is | |
721 | * done in order to avoid writing to the global mbstat data structure which | |
722 | * would cause false sharing. During sysctl request for kern.ipc.mbstat, | |
723 | * the statistics across all CPUs will be converged into the mbstat.m_mtypes | |
724 | * array and returned to the application. Any updates for types greater or | |
725 | * equal than MT_MAX would be done atomically to the mbstat; this slows down | |
726 | * performance but is okay since the kernel uses only up to MT_MAX-1 while | |
727 | * anything beyond that (up to type 255) is considered a corner case. | |
1c79356b | 728 | */ |
2d21ac55 A |
729 | typedef struct { |
730 | unsigned int cpu_mtypes[MT_MAX]; | |
731 | } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t; | |
1c79356b | 732 | |
2d21ac55 A |
733 | typedef struct { |
734 | mtypes_cpu_t mbs_cpu[1]; | |
735 | } mbuf_mtypes_t; | |
1c79356b | 736 | |
2d21ac55 A |
737 | static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ |
738 | ||
739 | #define MBUF_MTYPES_SIZE(n) \ | |
740 | ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n])) | |
741 | ||
742 | #define MTYPES_CPU(p) \ | |
743 | ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) | |
744 | ||
745 | /* This should be in a header file */ | |
746 | #define atomic_add_32(a, n) ((void) OSAddAtomic(n, (volatile SInt32 *)a)) | |
747 | ||
748 | #define mtype_stat_add(type, n) { \ | |
749 | if ((unsigned)(type) < MT_MAX) { \ | |
750 | mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \ | |
751 | atomic_add_32(&mbs->cpu_mtypes[type], n); \ | |
752 | } else if ((unsigned)(type) < MBSTAT_MTYPES_MAX) { \ | |
753 | atomic_add_32(&mbstat.m_mtypes[type], n); \ | |
754 | } \ | |
1c79356b A |
755 | } |
756 | ||
2d21ac55 A |
757 | #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n)) |
758 | #define mtype_stat_inc(t) mtype_stat_add(t, 1) | |
759 | #define mtype_stat_dec(t) mtype_stat_sub(t, 1) | |
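
The per-CPU statistics pattern above (each CPU bumps its own cache-line-aligned counter block; a reader sums the blocks on demand, which is exactly what mbstat_sysctl() does next) can be sketched in portable user-space C with C11 atomics. The CPU count, cache-line size, counter names and `bump`/`converge` helpers here are illustrative assumptions, not the kernel's types.

```c
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS		4
#define CACHE_LINE	64
#define NTYPES		16

/*
 * One counter block per CPU, padded out to a cache line to avoid false
 * sharing -- the same idea as the aligned mtypes_cpu_t above.
 */
typedef struct {
	atomic_uint	cnt[NTYPES];
} __attribute__((aligned(CACHE_LINE))) percpu_counters_t;

static percpu_counters_t stats[NCPUS];

static void
bump(int cpu, int type)
{
	atomic_fetch_add_explicit(&stats[cpu].cnt[type], 1,
	    memory_order_relaxed);
}

/* Converge the per-CPU counts into a single total, sysctl-style. */
static unsigned int
converge(int type)
{
	unsigned int sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		sum += atomic_load_explicit(&stats[cpu].cnt[type],
		    memory_order_relaxed);
	return (sum);
}

int
main(void)
{
	bump(0, 1); bump(1, 1); bump(3, 1);	/* e.g. three MT_DATA updates */
	printf("type 1 total = %u\n", converge(1));
	return (0);
}
```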
91447636 | 760 | |
2d21ac55 A |
761 | static int |
762 | mbstat_sysctl SYSCTL_HANDLER_ARGS | |
763 | { | |
764 | #pragma unused(oidp, arg1, arg2) | |
765 | int m, n; | |
766 | mtypes_cpu_t mtc; | |
1c79356b | 767 | |
2d21ac55 A |
768 | bzero(&mtc, sizeof (mtc)); |
769 | for (m = 0; m < ncpu; m++) { | |
770 | mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m]; | |
771 | mtypes_cpu_t temp; | |
9bccf70c | 772 | |
2d21ac55 A |
773 | bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes, |
774 | sizeof (temp.cpu_mtypes)); | |
91447636 | 775 | |
2d21ac55 A |
776 | for (n = 0; n < MT_MAX; n++) |
777 | mtc.cpu_mtypes[n] += temp.cpu_mtypes[n]; | |
778 | } | |
779 | lck_mtx_lock(mbuf_mlock); | |
780 | for (n = 0; n < MT_MAX; n++) | |
781 | mbstat.m_mtypes[n] = mtc.cpu_mtypes[n]; | |
782 | lck_mtx_unlock(mbuf_mlock); | |
91447636 | 783 | |
2d21ac55 | 784 | return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat))); |
1c79356b A |
785 | } |
786 | ||
2d21ac55 A |
787 | static int |
788 | mb_stat_sysctl SYSCTL_HANDLER_ARGS | |
1c79356b | 789 | { |
2d21ac55 A |
790 | #pragma unused(oidp, arg1, arg2) |
791 | mcache_t *cp; | |
792 | mcache_cpu_t *ccp; | |
793 | mb_class_stat_t *sp; | |
794 | int k, m, bktsize; | |
795 | ||
796 | lck_mtx_lock(mbuf_mlock); | |
797 | for (k = 0; k < NELEM(mbuf_table); k++) { | |
798 | cp = m_cache(k); | |
799 | ccp = &cp->mc_cpu[0]; | |
800 | bktsize = ccp->cc_bktsize; | |
801 | sp = mbuf_table[k].mtbl_stats; | |
802 | ||
803 | if (cp->mc_flags & MCF_NOCPUCACHE) | |
804 | sp->mbcl_mc_state = MCS_DISABLED; | |
805 | else if (cp->mc_purge_cnt > 0) | |
806 | sp->mbcl_mc_state = MCS_PURGING; | |
807 | else if (bktsize == 0) | |
808 | sp->mbcl_mc_state = MCS_OFFLINE; | |
809 | else | |
810 | sp->mbcl_mc_state = MCS_ONLINE; | |
811 | ||
812 | sp->mbcl_mc_cached = 0; | |
813 | for (m = 0; m < ncpu; m++) { | |
814 | ccp = &cp->mc_cpu[m]; | |
815 | if (ccp->cc_objs > 0) | |
816 | sp->mbcl_mc_cached += ccp->cc_objs; | |
817 | if (ccp->cc_pobjs > 0) | |
818 | sp->mbcl_mc_cached += ccp->cc_pobjs; | |
819 | } | |
820 | sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize); | |
821 | sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached - | |
822 | sp->mbcl_infree; | |
823 | ||
824 | sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt; | |
825 | sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt; | |
826 | sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt; | |
827 | ||
828 | /* Calculate total count specific to each class */ | |
829 | sp->mbcl_ctotal = sp->mbcl_total; | |
830 | switch (m_class(k)) { | |
831 | case MC_MBUF: | |
832 | /* Deduct mbufs used in composite caches */ | |
833 | sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + | |
834 | m_total(MC_MBUF_BIGCL)); | |
835 | break; | |
91447636 | 836 | |
2d21ac55 A |
837 | case MC_CL: |
838 | /* Deduct clusters used in composite cache and mbufs */ | |
839 | sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + | |
840 | (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL)); | |
841 | break; | |
91447636 | 842 | |
2d21ac55 A |
843 | case MC_BIGCL: |
844 | /* Deduct clusters used in composite cache */ | |
845 | sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL); | |
846 | break; | |
1c79356b | 847 | |
2d21ac55 A |
848 | case MC_16KCL: |
849 | /* Deduct clusters used in composite cache */ | |
850 | sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL); | |
851 | break; | |
852 | ||
853 | default: | |
854 | break; | |
855 | } | |
856 | } | |
857 | lck_mtx_unlock(mbuf_mlock); | |
9bccf70c | 858 | |
2d21ac55 A |
859 | return (SYSCTL_OUT(req, mb_stat, MB_STAT_SIZE(NELEM(mbuf_table)))); |
860 | } | |
91447636 | 861 | |
2d21ac55 A |
862 | static inline void |
863 | m_incref(struct mbuf *m) | |
864 | { | |
865 | UInt32 old, new; | |
866 | volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m); | |
91447636 | 867 | |
2d21ac55 A |
868 | do { |
869 | old = *addr; | |
870 | new = old + 1; | |
871 | ASSERT(new != 0); | |
872 | } while (!OSCompareAndSwap(old, new, addr)); | |
1c79356b A |
873 | } |
874 | ||
2d21ac55 A |
875 | static inline u_int32_t |
876 | m_decref(struct mbuf *m) | |
1c79356b | 877 | { |
2d21ac55 A |
878 | UInt32 old, new; |
879 | volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m); | |
1c79356b | 880 | |
2d21ac55 A |
881 | do { |
882 | old = *addr; | |
883 | new = old - 1; | |
884 | ASSERT(old != 0); | |
885 | } while (!OSCompareAndSwap(old, new, addr)); | |
886 | ||
887 | return (new); | |
1c79356b A |
888 | } |
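
m_incref() and m_decref() above are lock-free compare-and-swap loops around the external reference count. The same technique in portable user-space C, with C11 atomics standing in for OSCompareAndSwap() and `ref_incref`/`ref_decref` as illustrative names, looks like this; it is a sketch of the pattern, not the kernel code.

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
ref_incref(_Atomic uint32_t *addr)
{
	uint32_t old, new;

	do {
		old = atomic_load(addr);
		new = old + 1;
		assert(new != 0);	/* overflow check, as above */
	} while (!atomic_compare_exchange_weak(addr, &old, new));
	return (new);
}

static uint32_t
ref_decref(_Atomic uint32_t *addr)
{
	uint32_t old, new;

	do {
		old = atomic_load(addr);
		assert(old != 0);	/* underflow check, as above */
		new = old - 1;
	} while (!atomic_compare_exchange_weak(addr, &old, new));
	return (new);			/* 0 means the last reference is gone */
}

int
main(void)
{
	_Atomic uint32_t refcnt = 0;

	ref_incref(&refcnt);
	ref_incref(&refcnt);
	printf("after decref: %u\n", ref_decref(&refcnt));	/* prints 1 */
	return (0);
}
```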
889 | ||
2d21ac55 A |
890 | static void |
891 | mbuf_table_init(void) | |
1c79356b | 892 | { |
2d21ac55 | 893 | int m; |
91447636 | 894 | |
2d21ac55 A |
895 | MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)), |
896 | M_TEMP, M_WAITOK | M_ZERO); | |
897 | VERIFY(mb_stat != NULL); | |
1c79356b | 898 | |
2d21ac55 A |
899 | mb_stat->mbs_cnt = NELEM(mbuf_table); |
900 | for (m = 0; m < NELEM(mbuf_table); m++) | |
901 | mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m]; | |
1c79356b | 902 | |
2d21ac55 A |
903 | #if CONFIG_MBUF_JUMBO |
904 | /* | |
905 | * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do | |
906 | * this only on platforms where jumbo cluster pool is enabled. | |
907 | */ | |
908 | njcl = nmbclusters / 3; | |
909 | njclbytes = M16KCLBYTES; | |
910 | #endif /* CONFIG_MBUF_JUMBO */ | |
9bccf70c | 911 | |
2d21ac55 A |
912 | /* |
913 | * nclusters is going to be split in 2 to hold both the 2K | |
914 | * and the 4K pools, so make sure each half is even. | |
915 | */ | |
916 | nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4); | |
917 | if (njcl > 0) { | |
918 | /* | |
919 | * Each jumbo cluster takes 8 2K clusters, so make | |
920 | * sure that the pool size is evenly divisible by 8. | |
921 | */ | |
922 | njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8); | |
1c79356b | 923 | } |
1c79356b | 924 | |
2d21ac55 A |
925 | #if CONFIG_MBUF_NOEXPAND |
926 | /* Only use 4k clusters if we're setting aside more than 256k */ | |
927 | if (nmbclusters <= 128) { | |
928 | maxmbufcl = nmbclusters / 4; | |
929 | } else { | |
930 | /* Half to big clusters, half to small */ | |
931 | maxmbufcl = (nmbclusters / 4) * 3; | |
9bccf70c | 932 | } |
2d21ac55 A |
933 | #endif /* CONFIG_MBUF_NOEXPAND */ |
934 | ||
935 | /* | |
936 | * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th | |
937 | * of the total number of 2K clusters allocated is reserved and cannot | |
938 | * be turned into mbufs. It can only be used for pure cluster objects. | |
939 | */ | |
940 | m_minlimit(MC_CL) = (nclusters >> 5); | |
941 | m_maxlimit(MC_CL) = (nclusters >> 1); | |
942 | m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES; | |
943 | (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl"); | |
944 | ||
945 | /* | |
946 | * The remaining (15/16th) can be turned into mbufs. | |
947 | */ | |
948 | m_minlimit(MC_MBUF) = 0; | |
949 | m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL; | |
950 | m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE; | |
951 | (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf"); | |
952 | ||
953 | /* | |
954 | * The other 1/2 of the map is reserved for 4K clusters. | |
955 | */ | |
956 | m_minlimit(MC_BIGCL) = 0; | |
957 | m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1; | |
958 | m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG; | |
959 | (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl"); | |
960 | ||
961 | /* | |
962 | * Set limits for the composite classes. | |
963 | */ | |
964 | m_minlimit(MC_MBUF_CL) = 0; | |
965 | m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL); | |
966 | m_maxsize(MC_MBUF_CL) = MCLBYTES; | |
967 | m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL); | |
968 | (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl"); | |
969 | ||
970 | m_minlimit(MC_MBUF_BIGCL) = 0; | |
971 | m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL); | |
972 | m_maxsize(MC_MBUF_BIGCL) = NBPG; | |
973 | m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL); | |
974 | (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl"); | |
975 | ||
976 | /* | |
977 | * And for jumbo classes. | |
978 | */ | |
979 | m_minlimit(MC_16KCL) = 0; | |
980 | m_maxlimit(MC_16KCL) = (njcl >> 3); | |
981 | m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES; | |
982 | (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl"); | |
983 | ||
984 | m_minlimit(MC_MBUF_16KCL) = 0; | |
985 | m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL); | |
986 | m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES; | |
987 | m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL); | |
988 | (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl"); | |
989 | ||
990 | /* | |
991 | * Initialize the legacy mbstat structure. | |
992 | */ | |
993 | bzero(&mbstat, sizeof (mbstat)); | |
994 | mbstat.m_msize = m_maxsize(MC_MBUF); | |
995 | mbstat.m_mclbytes = m_maxsize(MC_CL); | |
996 | mbstat.m_minclsize = MINCLSIZE; | |
997 | mbstat.m_mlen = MLEN; | |
998 | mbstat.m_mhlen = MHLEN; | |
999 | mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL); | |
1000 | } | |
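
The carving logic in mbuf_table_init() is easier to follow with concrete numbers. The sketch below reproduces only the arithmetic in user space, assuming the jumbo pool is enabled and using an illustrative nmbclusters value; P2ROUNDDOWN is the usual power-of-two rounding helper and the printed limits mirror the assignments above.

```c
#include <stdio.h>

#define P2ROUNDDOWN(x, align)	((x) & ~((align) - 1))

int
main(void)
{
	int nmbclusters = 32768;	/* illustrative value only */
	int njcl, nclusters;

	njcl = nmbclusters / 3;			/* 1/3 set aside for jumbo */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);	/* keep halves even */
	njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);	/* 8 x 2K per 16K */

	printf("nclusters = %d, njcl = %d\n", nclusters, njcl);
	printf("m_minlimit(MC_CL)    = %d\n", nclusters >> 5);
	printf("m_maxlimit(MC_CL)    = %d\n", nclusters >> 1);
	printf("m_maxlimit(MC_BIGCL) = %d\n", (nclusters >> 1) >> 1);
	printf("m_maxlimit(MC_16KCL) = %d\n", njcl >> 3);
	return (0);
}
```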
1001 | ||
1002 | __private_extern__ void | |
1003 | mbinit(void) | |
1004 | { | |
1005 | unsigned int m; | |
1006 | int initmcl = MINCL; | |
1007 | int mcl_pages; | |
1008 | void *buf; | |
1009 | ||
1010 | if (nmbclusters == 0) | |
1011 | nmbclusters = NMBCLUSTERS; | |
1012 | ||
1013 | /* Setup the mbuf table */ | |
1014 | mbuf_table_init(); | |
1015 | ||
1016 | /* Global lock for common layer */ | |
1017 | mbuf_mlock_grp_attr = lck_grp_attr_alloc_init(); | |
1018 | mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr); | |
1019 | mbuf_mlock_attr = lck_attr_alloc_init(); | |
1020 | mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr); | |
1021 | ||
1022 | /* Allocate cluster slabs table */ | |
1023 | maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB; | |
1024 | MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *), | |
1025 | M_TEMP, M_WAITOK | M_ZERO); | |
1026 | VERIFY(slabstbl != NULL); | |
1027 | ||
1028 | /* Allocate audit structures if needed */ | |
1029 | PE_parse_boot_arg("mbuf_debug", &mbuf_debug); | |
1030 | mbuf_debug |= mcache_getflags(); | |
1031 | if (mbuf_debug & MCF_AUDIT) { | |
1032 | MALLOC(mclaudit, mcl_audit_t *, | |
1033 | nmbclusters * sizeof (*mclaudit), M_TEMP, | |
1034 | M_WAITOK | M_ZERO); | |
1035 | VERIFY(mclaudit != NULL); | |
1036 | ||
1037 | mcl_audit_con_cache = mcache_create("mcl_audit_contents", | |
1038 | AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP); | |
1039 | VERIFY(mcl_audit_con_cache != NULL); | |
1040 | } | |
1041 | ||
1042 | /* Calculate the number of pages assigned to the cluster pool */ | |
1043 | mcl_pages = nmbclusters/(NBPG/CLBYTES); | |
1044 | MALLOC(mcl_paddr, int *, mcl_pages * sizeof (int), M_TEMP, M_WAITOK); | |
1045 | VERIFY(mcl_paddr != NULL); | |
1046 | ||
1047 | /* Register with the I/O Bus mapper */ | |
1048 | mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); | |
1049 | bzero((char *)mcl_paddr, mcl_pages * sizeof (int)); | |
1050 | ||
1051 | embutl = (union mcluster *) | |
1052 | ((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); | |
1053 | ||
1054 | PE_parse_boot_arg("initmcl", &initmcl); | |
1055 | ||
1056 | lck_mtx_lock(mbuf_mlock); | |
1057 | ||
1058 | if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0) | |
1059 | panic("mbinit: m_clalloc failed\n"); | |
1060 | ||
1061 | lck_mtx_unlock(mbuf_mlock); | |
1062 | ||
1063 | (void) kernel_thread(kernel_task, mbuf_worker_thread_init); | |
1064 | ||
1065 | ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref), | |
1066 | 0, 0, MCR_SLEEP); | |
1067 | ||
1068 | /* Create the cache for each class */ | |
1069 | for (m = 0; m < NELEM(mbuf_table); m++) { | |
1070 | void *allocfunc, *freefunc, *auditfunc; | |
1071 | u_int32_t flags; | |
1072 | ||
1073 | flags = mbuf_debug; | |
1074 | if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL || | |
1075 | m_class(m) == MC_MBUF_16KCL) { | |
1076 | allocfunc = mbuf_cslab_alloc; | |
1077 | freefunc = mbuf_cslab_free; | |
1078 | auditfunc = mbuf_cslab_audit; | |
1079 | } else { | |
1080 | allocfunc = mbuf_slab_alloc; | |
1081 | freefunc = mbuf_slab_free; | |
1082 | auditfunc = mbuf_slab_audit; | |
1083 | } | |
1084 | ||
1085 | /* | |
1086 | * Disable per-CPU caches for jumbo classes if there | |
1087 | * is no jumbo cluster pool available in the system. | |
1088 | * The cache itself is still created (but will never | |
1089 | * be populated) since it simplifies the code. | |
1090 | */ | |
1091 | if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) && | |
1092 | njcl == 0) | |
1093 | flags |= MCF_NOCPUCACHE; | |
1094 | ||
1095 | m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m), | |
1096 | allocfunc, freefunc, auditfunc, mbuf_slab_notify, | |
1097 | (void *)m, flags, MCR_SLEEP); | |
1098 | } | |
1099 | ||
1100 | /* | |
1101 | * Allocate structure for per-CPU statistics that's aligned | |
1102 | * on the CPU cache boundary; this code assumes that we never | |
1103 | * uninitialize this framework, since the original address | |
1104 | * before alignment is not saved. | |
1105 | */ | |
1106 | ncpu = ml_get_max_cpus(); | |
1107 | MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE, | |
1108 | M_TEMP, M_WAITOK); | |
1109 | VERIFY(buf != NULL); | |
1110 | ||
1111 | mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE); | |
1112 | bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu)); | |
1113 | ||
1114 | printf("mbinit: done\n"); | |
1115 | } | |
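
The mbuf_mtypes allocation at the end of mbinit() uses a common idiom: over-allocate by one cache line, round the pointer up to the next cache-line boundary, and accept that the original pointer is never handed back (the comment above notes the framework is never uninitialized). A user-space sketch of the same idiom follows; the 64-byte cache line and the local P2ROUNDUP definition are assumptions for the example.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CPU_CACHE_SIZE		64	/* assumed cache-line size */
#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	size_t want = 1000;
	void *buf, *aligned;
	intptr_t raw;

	/* Over-allocate so there is room to slide forward to alignment. */
	buf = malloc(want + CPU_CACHE_SIZE);
	if (buf == NULL)
		return (1);
	raw = (intptr_t)buf;
	aligned = (void *)P2ROUNDUP(raw, CPU_CACHE_SIZE);
	memset(aligned, 0, want);

	printf("raw %p -> aligned %p (mod 64 = %ld)\n",
	    buf, aligned, (long)((intptr_t)aligned % CPU_CACHE_SIZE));
	/*
	 * Like the kernel code, only the aligned pointer is kept around;
	 * freeing requires remembering the original `buf`.
	 */
	free(buf);
	return (0);
}
```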
1116 | ||
1117 | /* | |
1118 | * Obtain a slab of object(s) from the class's freelist. | |
1119 | */ | |
1120 | static mcache_obj_t * | |
1121 | slab_alloc(mbuf_class_t class, int wait) | |
1122 | { | |
1123 | mcl_slab_t *sp; | |
1124 | mcache_obj_t *buf; | |
1125 | ||
1126 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
1127 | ||
1128 | VERIFY(class != MC_16KCL || njcl > 0); | |
1129 | ||
1130 | /* This should always be NULL for us */ | |
1131 | VERIFY(m_cobjlist(class) == NULL); | |
1132 | ||
1133 | /* | |
1134 | * Treat composite objects as having a longer lifespan by using | |
1135 | * a slab from the reverse direction, in the hope that this will | |
1136 | * reduce the probability of fragmentation for slabs that hold | |
1137 | * more than one buffer chunk (e.g. mbuf slabs). For other | |
1138 | * slabs, this probably doesn't make much of a difference. | |
1139 | */ | |
1140 | if (class == MC_MBUF && (wait & MCR_COMP)) | |
1141 | sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead); | |
1142 | else | |
1143 | sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class)); | |
1144 | ||
1145 | if (sp == NULL) { | |
1146 | VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0); | |
1147 | /* The slab list for this class is empty */ | |
1148 | return (NULL); | |
1149 | } | |
1150 | ||
1151 | VERIFY(m_infree(class) > 0); | |
1152 | VERIFY(!slab_is_detached(sp)); | |
1153 | VERIFY(sp->sl_class == class && | |
1154 | (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); | |
1155 | buf = sp->sl_head; | |
1156 | VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf)); | |
1157 | ||
1158 | if (class == MC_MBUF) { | |
1159 | sp->sl_head = buf->obj_next; | |
1160 | VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1)); | |
1161 | } else { | |
1162 | sp->sl_head = NULL; | |
1163 | } | |
1164 | if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) { | |
1165 | slab_nextptr_panic(sp, sp->sl_head); | |
1166 | /* In case sl_head is in the map but not in the slab */ | |
1167 | VERIFY(slab_inrange(sp, sp->sl_head)); | |
1168 | /* NOTREACHED */ | |
1169 | } | |
1170 | ||
1171 | /* Increment slab reference */ | |
1172 | sp->sl_refcnt++; | |
1173 | ||
1174 | if (mclaudit != NULL) { | |
1175 | mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); | |
1176 | mca->mca_uflags = 0; | |
1177 | /* Save contents on mbuf objects only */ | |
1178 | if (class == MC_MBUF) | |
1179 | mca->mca_uflags |= MB_SCVALID; | |
1180 | } | |
1181 | ||
1182 | if (class == MC_CL) { | |
1183 | mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL); | |
1184 | /* | |
1185 | * A 2K cluster slab can have at most 1 reference. | |
1186 | */ | |
1187 | VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && | |
1188 | sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL); | |
1189 | } else if (class == MC_BIGCL) { | |
1190 | mcl_slab_t *nsp = sp->sl_next; | |
1191 | mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) + | |
1192 | m_infree(MC_MBUF_BIGCL); | |
1193 | /* | |
1194 | * Increment 2nd slab. A 4K big cluster takes | |
1195 | * 2 slabs, each having at most 1 reference. | |
1196 | */ | |
1197 | VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && | |
1198 | sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL); | |
1199 | /* Next slab must already be present */ | |
1200 | VERIFY(nsp != NULL); | |
1201 | nsp->sl_refcnt++; | |
1202 | VERIFY(!slab_is_detached(nsp)); | |
1203 | VERIFY(nsp->sl_class == MC_BIGCL && | |
1204 | nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && | |
1205 | nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && | |
1206 | nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && | |
1207 | nsp->sl_head == NULL); | |
1208 | } else if (class == MC_16KCL) { | |
1209 | mcl_slab_t *nsp; | |
1210 | int k; | |
1211 | ||
1212 | --m_infree(MC_16KCL); | |
1213 | VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && | |
1214 | sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL); | |
1215 | /* | |
1216 | * Increment 2nd-8th slab. A 16K big cluster takes | |
1217 | * 8 cluster slabs, each having at most 1 reference. | |
1218 | */ | |
1219 | for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { | |
1220 | nsp = nsp->sl_next; | |
1221 | /* Next slab must already be present */ | |
1222 | VERIFY(nsp != NULL); | |
1223 | nsp->sl_refcnt++; | |
1224 | VERIFY(!slab_is_detached(nsp)); | |
1225 | VERIFY(nsp->sl_class == MC_16KCL && | |
1226 | nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && | |
1227 | nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && | |
1228 | nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && | |
1229 | nsp->sl_head == NULL); | |
1230 | } | |
1231 | } else { | |
1232 | ASSERT(class == MC_MBUF); | |
1233 | --m_infree(MC_MBUF); | |
1234 | /* | |
1235 | * If auditing is turned on, this check is | |
1236 | * deferred until later in mbuf_slab_audit(). | |
1237 | */ | |
1238 | if (mclaudit == NULL) | |
1239 | _MCHECK((struct mbuf *)buf); | |
1240 | /* | |
1241 | * Since we have incremented the reference count above, | |
1242 | * an mbuf slab (formerly a 2K cluster slab that was cut | |
1243 | * up into mbufs) must have a reference count between 1 | |
1244 | * and NMBPCL at this point. | |
1245 | */ | |
1246 | VERIFY(sp->sl_refcnt >= 1 && | |
1247 | (unsigned short)sp->sl_refcnt <= NMBPCL && | |
1248 | sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL)); | |
1249 | VERIFY((unsigned short)sp->sl_refcnt < NMBPCL || | |
1250 | sp->sl_head == NULL); | |
1251 | } | |
1252 | ||
1253 | /* If empty, remove this slab from the class's freelist */ | |
1254 | if (sp->sl_head == NULL) { | |
1255 | VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL); | |
1256 | slab_remove(sp, class); | |
1257 | } | |
1258 | ||
1259 | return (buf); | |
1260 | } | |
1261 | ||
1262 | /* | |
1263 | * Place a slab of object(s) back into a class's slab list. | |
1264 | */ | |
1265 | static void | |
1266 | slab_free(mbuf_class_t class, mcache_obj_t *buf) | |
1267 | { | |
1268 | mcl_slab_t *sp; | |
1269 | ||
1270 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
1271 | ||
1272 | VERIFY(class != MC_16KCL || njcl > 0); | |
1273 | VERIFY(buf->obj_next == NULL); | |
1274 | sp = slab_get(buf); | |
1275 | VERIFY(sp->sl_class == class && slab_inrange(sp, buf) && | |
1276 | (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); | |
1277 | ||
1278 | /* Decrement slab reference */ | |
1279 | sp->sl_refcnt--; | |
1280 | ||
1281 | if (class == MC_CL || class == MC_BIGCL) { | |
1282 | VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); | |
1283 | /* | |
1284 | * A 2K cluster slab can have at most 1 reference | |
1285 | * which must be 0 at this point. | |
1286 | */ | |
1287 | VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && | |
1288 | sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); | |
1289 | VERIFY(slab_is_detached(sp)); | |
1290 | if (class == MC_BIGCL) { | |
1291 | mcl_slab_t *nsp = sp->sl_next; | |
1292 | VERIFY(IS_P2ALIGNED(buf, NBPG)); | |
1293 | /* Next slab must already be present */ | |
1294 | VERIFY(nsp != NULL); | |
1295 | /* Decrement 2nd slab reference */ | |
1296 | nsp->sl_refcnt--; | |
1297 | /* | |
1298 | * A 4K big cluster takes 2 slabs, both | |
1299 | * must now have 0 reference. | |
1300 | */ | |
1301 | VERIFY(slab_is_detached(nsp)); | |
1302 | VERIFY(nsp->sl_class == MC_BIGCL && | |
1303 | (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && | |
1304 | nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && | |
1305 | nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && | |
1306 | nsp->sl_head == NULL); | |
1307 | } | |
1308 | } else if (class == MC_16KCL) { | |
1309 | mcl_slab_t *nsp; | |
1310 | int k; | |
1311 | /* | |
1312 | * A 16K cluster takes 8 cluster slabs, all of which | |
1313 | * must now have 0 references. | |
1314 | */ | |
1315 | VERIFY(IS_P2ALIGNED(buf, NBPG)); | |
1316 | VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && | |
1317 | sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL); | |
1318 | VERIFY(slab_is_detached(sp)); | |
1319 | for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { | |
1320 | nsp = nsp->sl_next; | |
1321 | /* Next slab must already be present */ | |
1322 | VERIFY(nsp != NULL); | |
1323 | nsp->sl_refcnt--; | |
1324 | VERIFY(slab_is_detached(nsp)); | |
1325 | VERIFY(nsp->sl_class == MC_16KCL && | |
1326 | (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && | |
1327 | nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && | |
1328 | nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && | |
1329 | nsp->sl_head == NULL); | |
1330 | } | |
1331 | } else { | |
1332 | /* | |
1333 | * An mbuf slab has a total of NMBPCL reference counts. | |
1334 | * Since we have decremented the reference count above, it | |
1335 | * must now be between 0 and NMBPCL-1. | |
1336 | */ | |
1337 | VERIFY(sp->sl_refcnt >= 0 && | |
1338 | (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) && | |
1339 | sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL)); | |
1340 | VERIFY(sp->sl_refcnt < (NMBPCL - 1) || | |
1341 | (slab_is_detached(sp) && sp->sl_head == NULL)); | |
1342 | } | |
1343 | ||
1344 | /* | |
1345 | * When auditing is enabled, ensure that the buffer still | |
1346 | * contains the free pattern. Otherwise it got corrupted | |
1347 | * while at the CPU cache layer. | |
1348 | */ | |
1349 | if (mclaudit != NULL) { | |
1350 | mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); | |
1351 | mcache_audit_free_verify(mca, buf, 0, m_maxsize(class)); | |
1352 | mca->mca_uflags &= ~MB_SCVALID; | |
1353 | } | |
1354 | ||
1355 | if (class == MC_CL) { | |
1356 | mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); | |
1357 | } else if (class == MC_BIGCL) { | |
1358 | mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + | |
1359 | m_infree(MC_MBUF_BIGCL); | |
1360 | } else if (class == MC_16KCL) { | |
1361 | ++m_infree(MC_16KCL); | |
1362 | } else { | |
1363 | ++m_infree(MC_MBUF); | |
1364 | buf->obj_next = sp->sl_head; | |
1365 | } | |
1366 | sp->sl_head = buf; | |
1367 | ||
1368 | /* All mbufs are freed; return the cluster that we stole earlier */ | |
1369 | if (sp->sl_refcnt == 0 && class == MC_MBUF) { | |
1370 | int i = NMBPCL; | |
1371 | ||
1372 | m_total(MC_MBUF) -= NMBPCL; | |
1373 | mbstat.m_mbufs = m_total(MC_MBUF); | |
1374 | m_infree(MC_MBUF) -= NMBPCL; | |
1375 | mtype_stat_add(MT_FREE, -NMBPCL); | |
1376 | ||
1377 | while (i--) { | |
1378 | struct mbuf *m = sp->sl_head; | |
1379 | VERIFY(m != NULL); | |
1380 | sp->sl_head = m->m_next; | |
1381 | m->m_next = NULL; | |
1382 | } | |
1383 | VERIFY(sp->sl_head == NULL); | |
1384 | ||
1385 | /* Remove the slab from the mbuf class's slab list */ | |
1386 | slab_remove(sp, class); | |
1387 | ||
1388 | /* Reinitialize it as a 2K cluster slab */ | |
1389 | slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base, | |
1390 | sp->sl_len, 0, 1); | |
1391 | ||
1392 | if (mclaudit != NULL) | |
1393 | mcache_set_pattern(MCACHE_FREE_PATTERN, | |
1394 | (caddr_t)sp->sl_head, m_maxsize(MC_CL)); | |
1395 | ||
1396 | mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); | |
1397 | ||
1398 | VERIFY(slab_is_detached(sp)); | |
1399 | /* And finally switch class */ | |
1400 | class = MC_CL; | |
1401 | } | |
1402 | ||
1403 | /* Reinsert the slab to the class's slab list */ | |
1404 | if (slab_is_detached(sp)) | |
1405 | slab_insert(sp, class); | |
1406 | } | |
1407 | ||
1408 | /* | |
1409 | * Common allocator for rudimentary objects called by the CPU cache layer | |
1410 | * during an allocation request whenever there is no available element in the | |
1411 | * bucket layer. It returns one or more elements from the appropriate global | |
1412 | * freelist. If the freelist is empty, it will attempt to populate it and | |
1413 | * retry the allocation. | |
1414 | */ | |
1415 | static unsigned int | |
1416 | mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) | |
1417 | { | |
1418 | mbuf_class_t class = (mbuf_class_t)arg; | |
1419 | unsigned int need = num; | |
1420 | mcache_obj_t **list = *plist; | |
1421 | ||
1422 | ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); | |
1423 | ASSERT(need > 0); | |
1424 | ||
1425 | lck_mtx_lock(mbuf_mlock); | |
1426 | ||
1427 | for (;;) { | |
1428 | if ((*list = slab_alloc(class, wait)) != NULL) { | |
1429 | (*list)->obj_next = NULL; | |
1430 | list = *plist = &(*list)->obj_next; | |
1431 | ||
1432 | if (--need == 0) { | |
1433 | /* | |
1434 | * If the number of elements in the freelist has | |
1435 | * dropped below the low watermark, asynchronously | |
1436 | * populate the freelist now rather than doing | |
1437 | * it later when we run out of elements. | |
1438 | */ | |
1439 | if (!mbuf_cached_above(class, wait) && | |
1440 | m_infree(class) < m_total(class) >> 5) { | |
1441 | (void) freelist_populate(class, 1, | |
1442 | M_DONTWAIT); | |
1443 | } | |
1444 | break; | |
1445 | } | |
1446 | } else { | |
1447 | VERIFY(m_infree(class) == 0 || class == MC_CL); | |
1448 | ||
1449 | (void) freelist_populate(class, 1, | |
1450 | (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT); | |
1451 | ||
1452 | if (m_infree(class) > 0) | |
1453 | continue; | |
1454 | ||
1455 | /* Check if there's anything at the cache layer */ | |
1456 | if (mbuf_cached_above(class, wait)) | |
1457 | break; | |
1458 | ||
1459 | /* We have nothing and cannot block; give up */ | |
1460 | if (wait & MCR_NOSLEEP) { | |
1461 | if (!(wait & MCR_TRYHARD)) { | |
1462 | m_fail_cnt(class)++; | |
1463 | mbstat.m_drops++; | |
1464 | break; | |
1465 | } | |
1466 | } | |
1467 | ||
1468 | /* | |
1469 | * If the freelist is still empty and the caller is | |
1470 | * willing to be blocked, sleep on the wait channel | |
1471 | * until an element is available. Otherwise, if | |
1472 | * MCR_TRYHARD is set, do our best to satisfy the | |
1473 | * request without having to go to sleep. | |
1474 | */ | |
1475 | if (mbuf_worker_ready && | |
1476 | mbuf_sleep(class, need, wait)) | |
1477 | break; | |
1478 | ||
1479 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
1480 | } | |
1481 | } | |
1482 | ||
1483 | m_alloc_cnt(class) += num - need; | |
1484 | lck_mtx_unlock(mbuf_mlock); | |
1485 | ||
1486 | return (num - need); | |
1487 | } | |
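/*
 * Illustrative sketch (not part of the original source): how the
 * obj_next-linked list that mbuf_slab_alloc() builds through *plist is
 * walked.  The mcache layer is the real caller; "arg" stands for the
 * class argument the cache was created with, and the other names are
 * hypothetical.
 */
#if 0
static void
my_walk_raw_objects(void *arg)
{
	mcache_obj_t *head = NULL, **plist = &head;
	unsigned int got;

	/* land up to 16 raw objects on the NULL-terminated list at "head" */
	got = mbuf_slab_alloc(arg, &plist, 16, MCR_NOSLEEP);

	while (head != NULL) {
		mcache_obj_t *next = head->obj_next;
		/* ... construct or hand out one of the "got" objects ... */
		head = next;
	}
}
#endif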
1488 | ||
1489 | /* | |
1490 | * Common de-allocator for rudimentary objects called by the CPU cache | |
1491 | * layer when one or more elements need to be returned to the appropriate | |
1492 | * global freelist. | |
1493 | */ | |
1494 | static void | |
1495 | mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged) | |
1496 | { | |
1497 | mbuf_class_t class = (mbuf_class_t)arg; | |
1498 | mcache_obj_t *nlist; | |
1499 | unsigned int num = 0; | |
1500 | int w; | |
1501 | ||
1502 | ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); | |
1503 | ||
1504 | lck_mtx_lock(mbuf_mlock); | |
1505 | ||
1506 | for (;;) { | |
1507 | nlist = list->obj_next; | |
1508 | list->obj_next = NULL; | |
1509 | slab_free(class, list); | |
1510 | ++num; | |
1511 | if ((list = nlist) == NULL) | |
1512 | break; | |
1513 | } | |
1514 | m_free_cnt(class) += num; | |
1515 | ||
1516 | if ((w = mb_waiters) > 0) | |
1517 | mb_waiters = 0; | |
1518 | ||
1519 | lck_mtx_unlock(mbuf_mlock); | |
1520 | ||
1521 | if (w != 0) | |
1522 | wakeup(mb_waitchan); | |
1523 | } | |
1524 | ||
1525 | /* | |
1526 | * Common auditor for rudimentary objects called by the CPU cache layer | |
1527 | * during an allocation or free request. For the former, this is called | |
1528 | * after the objects are obtained from either the bucket or slab layer | |
1529 | * and before they are returned to the caller. For the latter, this is | |
1530 | * called immediately during free and before placing the objects into | |
1531 | * the bucket or slab layer. | |
1532 | */ | |
1533 | static void | |
1534 | mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) | |
1535 | { | |
1536 | mbuf_class_t class = (mbuf_class_t)arg; | |
1537 | mcache_audit_t *mca; | |
1538 | ||
1539 | ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); | |
1540 | ||
1541 | while (list != NULL) { | |
1542 | lck_mtx_lock(mbuf_mlock); | |
1543 | mca = mcl_audit_buf2mca(class, list); | |
1544 | ||
1545 | /* Do the sanity checks */ | |
1546 | if (class == MC_MBUF) { | |
1547 | mcl_audit_mbuf(mca, list, FALSE, alloc); | |
1548 | ASSERT(mca->mca_uflags & MB_SCVALID); | |
1549 | } else { | |
1550 | mcl_audit_cluster(mca, list, m_maxsize(class), | |
1551 | alloc, TRUE); | |
1552 | ASSERT(!(mca->mca_uflags & MB_SCVALID)); | |
1553 | } | |
1554 | /* Record this transaction */ | |
1555 | mcache_buffer_log(mca, list, m_cache(class)); | |
1556 | if (alloc) | |
1557 | mca->mca_uflags |= MB_INUSE; | |
1558 | else | |
1559 | mca->mca_uflags &= ~MB_INUSE; | |
1560 | /* Unpair the object (unconditionally) */ | |
1561 | mca->mca_uptr = NULL; | |
1562 | lck_mtx_unlock(mbuf_mlock); | |
1563 | ||
1564 | list = list->obj_next; | |
1565 | } | |
1566 | } | |
1567 | ||
1568 | /* | |
1569 | * Common notify routine for all caches. It is called by mcache when | |
1570 | * one or more objects get freed. We use this indication to trigger | |
1571 | * the wakeup of any sleeping threads so that they can retry their | |
1572 | * allocation requests. | |
1573 | */ | |
1574 | static void | |
1575 | mbuf_slab_notify(void *arg, u_int32_t reason) | |
1576 | { | |
1577 | mbuf_class_t class = (mbuf_class_t)arg; | |
1578 | int w; | |
1579 | ||
1580 | ASSERT(MBUF_CLASS_VALID(class)); | |
1581 | ||
1582 | if (reason != MCN_RETRYALLOC) | |
1583 | return; | |
1584 | ||
1585 | lck_mtx_lock(mbuf_mlock); | |
1586 | if ((w = mb_waiters) > 0) { | |
1587 | m_notified(class)++; | |
1588 | mb_waiters = 0; | |
1589 | } | |
1590 | lck_mtx_unlock(mbuf_mlock); | |
1591 | ||
1592 | if (w != 0) | |
1593 | wakeup(mb_waitchan); | |
1594 | } | |
1595 | ||
1596 | /* | |
1597 | * Obtain object(s) from the composite class's freelist. | |
1598 | */ | |
1599 | static unsigned int | |
1600 | cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) | |
1601 | { | |
1602 | unsigned int need = num; | |
1603 | mcl_slab_t *sp, *clsp, *nsp; | |
1604 | struct mbuf *m; | |
1605 | mcache_obj_t **list = *plist; | |
1606 | void *cl; | |
1607 | ||
1608 | VERIFY(need > 0); | |
1609 | VERIFY(class != MC_MBUF_16KCL || njcl > 0); | |
1610 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
1611 | ||
1612 | /* Get what we can from the freelist */ | |
1613 | while ((*list = m_cobjlist(class)) != NULL) { | |
1614 | MRANGE(*list); | |
1615 | ||
1616 | m = (struct mbuf *)*list; | |
1617 | sp = slab_get(m); | |
1618 | cl = m->m_ext.ext_buf; | |
1619 | clsp = slab_get(cl); | |
1620 | VERIFY(m->m_flags == M_EXT && cl != NULL); | |
1621 | VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m)); | |
1622 | VERIFY(clsp->sl_refcnt == 1); | |
1623 | if (class == MC_MBUF_BIGCL) { | |
1624 | nsp = clsp->sl_next; | |
1625 | /* Next slab must already be present */ | |
1626 | VERIFY(nsp != NULL); | |
1627 | VERIFY(nsp->sl_refcnt == 1); | |
1628 | } else if (class == MC_MBUF_16KCL) { | |
1629 | int k; | |
1630 | for (nsp = clsp, k = 1; | |
1631 | k < (M16KCLBYTES / MCLBYTES); k++) { | |
1632 | nsp = nsp->sl_next; | |
1633 | /* Next slab must already be present */ | |
1634 | VERIFY(nsp != NULL); | |
1635 | VERIFY(nsp->sl_refcnt == 1); | |
1636 | } | |
1637 | } | |
1638 | ||
1639 | if ((m_cobjlist(class) = (*list)->obj_next) != NULL && | |
1640 | !MBUF_IN_MAP(m_cobjlist(class))) { | |
1641 | slab_nextptr_panic(sp, m_cobjlist(class)); | |
1642 | /* NOTREACHED */ | |
1643 | } | |
1644 | (*list)->obj_next = NULL; | |
1645 | list = *plist = &(*list)->obj_next; | |
1646 | ||
1647 | if (--need == 0) | |
1648 | break; | |
1649 | } | |
1650 | m_infree(class) -= (num - need); | |
1651 | ||
1652 | return (num - need); | |
1653 | } | |
1654 | ||
1655 | /* | |
1656 | * Place object(s) back into a composite class's freelist. | |
1657 | */ | |
1658 | static unsigned int | |
1659 | cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) | |
1660 | { | |
1661 | mcache_obj_t *o, *tail; | |
1662 | unsigned int num = 0; | |
1663 | struct mbuf *m, *ms; | |
1664 | mcache_audit_t *mca = NULL; | |
1665 | mcache_obj_t *ref_list = NULL; | |
1666 | mcl_slab_t *clsp, *nsp; | |
1667 | void *cl; | |
1668 | ||
1669 | ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); | |
1670 | VERIFY(class != MC_MBUF_16KCL || njcl > 0); | |
1671 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
1672 | ||
1673 | o = tail = list; | |
1674 | ||
1675 | while ((m = ms = (struct mbuf *)o) != NULL) { | |
1676 | mcache_obj_t *rfa, *nexto = o->obj_next; | |
1677 | ||
1678 | /* Do the mbuf sanity checks */ | |
1679 | if (mclaudit != NULL) { | |
1680 | mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); | |
1681 | mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF)); | |
1682 | ms = (struct mbuf *)mca->mca_contents; | |
1683 | } | |
1684 | ||
1685 | /* Do the cluster sanity checks */ | |
1686 | cl = ms->m_ext.ext_buf; | |
1687 | clsp = slab_get(cl); | |
1688 | if (mclaudit != NULL) { | |
1689 | size_t size; | |
1690 | if (class == MC_MBUF_CL) | |
1691 | size = m_maxsize(MC_CL); | |
1692 | else if (class == MC_MBUF_BIGCL) | |
1693 | size = m_maxsize(MC_BIGCL); | |
1694 | else | |
1695 | size = m_maxsize(MC_16KCL); | |
1696 | mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL, | |
1697 | (mcache_obj_t *)cl), cl, 0, size); | |
1698 | } | |
1699 | VERIFY(ms->m_type == MT_FREE); | |
1700 | VERIFY(ms->m_flags == M_EXT); | |
1701 | VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); | |
1702 | VERIFY(clsp->sl_refcnt == 1); | |
1703 | if (class == MC_MBUF_BIGCL) { | |
1704 | nsp = clsp->sl_next; | |
1705 | /* Next slab must already be present */ | |
1706 | VERIFY(nsp != NULL); | |
1707 | VERIFY(nsp->sl_refcnt == 1); | |
1708 | } else if (class == MC_MBUF_16KCL) { | |
1709 | int k; | |
1710 | for (nsp = clsp, k = 1; | |
1711 | k < (M16KCLBYTES / MCLBYTES); k++) { | |
1712 | nsp = nsp->sl_next; | |
1713 | /* Next slab must already be present */ | |
1714 | VERIFY(nsp != NULL); | |
1715 | VERIFY(nsp->sl_refcnt == 1); | |
1716 | } | |
1717 | } | |
1718 | ||
1719 | /* | |
1720 | * If we're asked to purge, restore the actual mbuf using | |
1721 | * contents of the shadow structure (if auditing is enabled) | |
1722 | * and clear EXTF_COMPOSITE flag from the mbuf, as we are | |
1723 | * about to free it and the attached cluster into their caches. | |
1724 | */ | |
1725 | if (purged) { | |
1726 | /* Restore constructed mbuf fields */ | |
1727 | if (mclaudit != NULL) | |
1728 | mcl_audit_restore_mbuf(m, mca, TRUE); | |
1729 | ||
1730 | MEXT_REF(m) = 0; | |
1731 | MEXT_FLAGS(m) = 0; | |
1732 | ||
1733 | rfa = (mcache_obj_t *)MEXT_RFA(m); | |
1734 | rfa->obj_next = ref_list; | |
1735 | ref_list = rfa; | |
1736 | MEXT_RFA(m) = NULL; | |
1737 | ||
1738 | m->m_type = MT_FREE; | |
1739 | m->m_flags = m->m_len = 0; | |
1740 | m->m_next = m->m_nextpkt = NULL; | |
1741 | ||
1742 | /* Save mbuf fields and make auditing happy */ | |
1743 | if (mclaudit != NULL) | |
1744 | mcl_audit_mbuf(mca, o, FALSE, FALSE); | |
1745 | ||
1746 | VERIFY(m_total(class) > 0); | |
1747 | m_total(class)--; | |
1748 | ||
1749 | /* Free the mbuf */ | |
1750 | o->obj_next = NULL; | |
1751 | slab_free(MC_MBUF, o); | |
1752 | ||
1753 | /* And free the cluster */ | |
1754 | ((mcache_obj_t *)cl)->obj_next = NULL; | |
1755 | if (class == MC_MBUF_CL) | |
1756 | slab_free(MC_CL, cl); | |
1757 | else if (class == MC_MBUF_BIGCL) | |
1758 | slab_free(MC_BIGCL, cl); | |
1759 | else | |
1760 | slab_free(MC_16KCL, cl); | |
1761 | } | |
1762 | ||
1763 | ++num; | |
1764 | tail = o; | |
1765 | o = nexto; | |
1766 | } | |
1767 | ||
1768 | if (!purged) { | |
1769 | tail->obj_next = m_cobjlist(class); | |
1770 | m_cobjlist(class) = list; | |
1771 | m_infree(class) += num; | |
1772 | } else if (ref_list != NULL) { | |
1773 | mcache_free_ext(ref_cache, ref_list); | |
1774 | } | |
1775 | ||
1776 | return (num); | |
1777 | } | |
1778 | ||
1779 | /* | |
1780 | * Common allocator for composite objects called by the CPU cache layer | |
1781 | * during an allocation request whenever there is no available element in | |
1782 | * the bucket layer. It returns one or more composite elements from the | |
1783 | * appropriate global freelist. If the freelist is empty, it will attempt | |
1784 | * to obtain the rudimentary objects from their caches and construct them | |
1785 | * into composite mbuf + cluster objects. | |
1786 | */ | |
1787 | static unsigned int | |
1788 | mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, | |
1789 | int wait) | |
1790 | { | |
1791 | mbuf_class_t class = (mbuf_class_t)arg; | |
1792 | mcache_t *cp = NULL; | |
1793 | unsigned int num = 0, cnum = 0, want = needed; | |
1794 | mcache_obj_t *ref_list = NULL; | |
1795 | mcache_obj_t *mp_list = NULL; | |
1796 | mcache_obj_t *clp_list = NULL; | |
1797 | mcache_obj_t **list; | |
1798 | struct ext_ref *rfa; | |
1799 | struct mbuf *m; | |
1800 | void *cl; | |
1801 | ||
1802 | ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); | |
1803 | ASSERT(needed > 0); | |
1804 | ||
1805 | VERIFY(class != MC_MBUF_16KCL || njcl > 0); | |
1806 | ||
1807 | /* There should not be any slab for this class */ | |
1808 | VERIFY(m_slab_cnt(class) == 0 && | |
1809 | m_slablist(class).tqh_first == NULL && | |
1810 | m_slablist(class).tqh_last == NULL); | |
1811 | ||
1812 | lck_mtx_lock(mbuf_mlock); | |
1813 | ||
1814 | /* Try using the freelist first */ | |
1815 | num = cslab_alloc(class, plist, needed); | |
1816 | list = *plist; | |
1817 | if (num == needed) { | |
1818 | m_alloc_cnt(class) += num; | |
1819 | lck_mtx_unlock(mbuf_mlock); | |
1820 | return (needed); | |
1821 | } | |
1822 | ||
1823 | lck_mtx_unlock(mbuf_mlock); | |
1824 | ||
1825 | /* | |
1826 | * We could not satisfy the request using the freelist alone; | |
1827 | * allocate from the appropriate rudimentary caches and use | |
1828 | * whatever we can get to construct the composite objects. | |
1829 | */ | |
1830 | needed -= num; | |
1831 | ||
1832 | /* | |
1833 | * Mark these allocation requests as coming from a composite cache. | |
1834 | * Also, if the caller is willing to be blocked, mark the request | |
1835 | * with MCR_FAILOK such that we don't end up sleeping at the mbuf | |
1836 | * slab layer waiting for the individual object when one or more | |
1837 | * of the already-constructed composite objects are available. | |
1838 | */ | |
1839 | wait |= MCR_COMP; | |
1840 | if (!(wait & MCR_NOSLEEP)) | |
1841 | wait |= MCR_FAILOK; | |
1842 | ||
1843 | needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait); | |
1844 | if (needed == 0) { | |
1845 | ASSERT(mp_list == NULL); | |
1846 | goto fail; | |
1847 | } | |
1848 | if (class == MC_MBUF_CL) | |
1849 | cp = m_cache(MC_CL); | |
1850 | else if (class == MC_MBUF_BIGCL) | |
1851 | cp = m_cache(MC_BIGCL); | |
1852 | else | |
1853 | cp = m_cache(MC_16KCL); | |
1854 | needed = mcache_alloc_ext(cp, &clp_list, needed, wait); | |
1855 | if (needed == 0) { | |
1856 | ASSERT(clp_list == NULL); | |
1857 | goto fail; | |
1858 | } | |
1859 | needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait); | |
1860 | if (needed == 0) { | |
1861 | ASSERT(ref_list == NULL); | |
1862 | goto fail; | |
1863 | } | |
1864 | ||
1865 | /* | |
1866 | * By this time "needed" is MIN(mbuf, cluster, ref). Any | |
1867 | * leftovers will get freed accordingly before we return to the caller. | |
1868 | */ | |
1869 | for (cnum = 0; cnum < needed; cnum++) { | |
1870 | struct mbuf *ms; | |
1871 | ||
1872 | m = ms = (struct mbuf *)mp_list; | |
1873 | mp_list = mp_list->obj_next; | |
1874 | ||
1875 | cl = clp_list; | |
1876 | clp_list = clp_list->obj_next; | |
1877 | ((mcache_obj_t *)cl)->obj_next = NULL; | |
1878 | ||
1879 | rfa = (struct ext_ref *)ref_list; | |
1880 | ref_list = ref_list->obj_next; | |
1881 | ((mcache_obj_t *)rfa)->obj_next = NULL; | |
1882 | ||
1883 | /* | |
1884 | * If auditing is enabled, construct the shadow mbuf | |
1885 | * in the audit structure instead of in the actual one. | |
1886 | * mbuf_cslab_audit() will take care of restoring the | |
1887 | * contents after the integrity check. | |
1888 | */ | |
1889 | if (mclaudit != NULL) { | |
1890 | mcache_audit_t *mca, *cl_mca; | |
1891 | size_t size; | |
1892 | ||
1893 | lck_mtx_lock(mbuf_mlock); | |
1894 | mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); | |
1895 | ms = ((struct mbuf *)mca->mca_contents); | |
1896 | cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl); | |
1897 | ||
1898 | /* | |
1899 | * Pair them up. Note that this is done at the time | |
1900 | * the mbuf+cluster objects are constructed. This | |
1901 | * information should be treated as a "best effort" | |
1902 | * debugging hint, since more than one mbuf can refer | |
1903 | * to a cluster. In that case, the cluster might not | |
1904 | * be freed along with the mbuf it was paired with. | |
1905 | */ | |
1906 | mca->mca_uptr = cl_mca; | |
1907 | cl_mca->mca_uptr = mca; | |
1908 | ||
1909 | ASSERT(mca->mca_uflags & MB_SCVALID); | |
1910 | ASSERT(!(cl_mca->mca_uflags & MB_SCVALID)); | |
1911 | lck_mtx_unlock(mbuf_mlock); | |
1912 | ||
1913 | /* Technically, they are in the freelist */ | |
1914 | mcache_set_pattern(MCACHE_FREE_PATTERN, m, | |
1915 | m_maxsize(MC_MBUF)); | |
1916 | if (class == MC_MBUF_CL) | |
1917 | size = m_maxsize(MC_CL); | |
1918 | else if (class == MC_MBUF_BIGCL) | |
1919 | size = m_maxsize(MC_BIGCL); | |
1920 | else | |
1921 | size = m_maxsize(MC_16KCL); | |
1922 | mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size); | |
1923 | } | |
1924 | ||
1925 | MBUF_INIT(ms, 0, MT_FREE); | |
1926 | if (class == MC_MBUF_16KCL) { | |
1927 | MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); | |
1928 | } else if (class == MC_MBUF_BIGCL) { | |
1929 | MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); | |
1930 | } else { | |
1931 | MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); | |
1932 | } | |
1933 | VERIFY(ms->m_flags == M_EXT); | |
1934 | VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); | |
1935 | ||
1936 | *list = (mcache_obj_t *)m; | |
1937 | (*list)->obj_next = NULL; | |
1938 | list = *plist = &(*list)->obj_next; | |
1939 | } | |
1940 | ||
1941 | fail: | |
1942 | /* | |
1943 | * Free up what's left of the above. | |
1944 | */ | |
1945 | if (mp_list != NULL) | |
1946 | mcache_free_ext(m_cache(MC_MBUF), mp_list); | |
1947 | if (clp_list != NULL) | |
1948 | mcache_free_ext(cp, clp_list); | |
1949 | if (ref_list != NULL) | |
1950 | mcache_free_ext(ref_cache, ref_list); | |
1951 | ||
1952 | lck_mtx_lock(mbuf_mlock); | |
1953 | if (num > 0 || cnum > 0) { | |
1954 | m_total(class) += cnum; | |
1955 | VERIFY(m_total(class) <= m_maxlimit(class)); | |
1956 | m_alloc_cnt(class) += num + cnum; | |
1957 | } | |
1958 | if ((num + cnum) < want) | |
1959 | m_fail_cnt(class) += (want - (num + cnum)); | |
1960 | lck_mtx_unlock(mbuf_mlock); | |
1961 | ||
1962 | return (num + cnum); | |
1963 | } | |
1964 | ||
1965 | /* | |
1966 | * Common de-allocator for composite objects called by the CPU cache | |
1967 | * layer when one or more elements need to be returned to the appropriate | |
1968 | * global freelist. | |
1969 | */ | |
1970 | static void | |
1971 | mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged) | |
1972 | { | |
1973 | mbuf_class_t class = (mbuf_class_t)arg; | |
1974 | unsigned int num; | |
1975 | int w; | |
1976 | ||
1977 | ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); | |
1978 | ||
1979 | lck_mtx_lock(mbuf_mlock); | |
1980 | ||
1981 | num = cslab_free(class, list, purged); | |
1982 | m_free_cnt(class) += num; | |
1983 | ||
1984 | if ((w = mb_waiters) > 0) | |
1985 | mb_waiters = 0; | |
1986 | ||
1987 | lck_mtx_unlock(mbuf_mlock); | |
1988 | ||
1989 | if (w != 0) | |
1990 | wakeup(mb_waitchan); | |
1991 | } | |
1992 | ||
1993 | /* | |
1994 | * Common auditor for composite objects called by the CPU cache layer | |
1995 | * during an allocation or free request. For the former, this is called | |
1996 | * after the objects are obtained from either the bucket or slab layer | |
1997 | * and before they are returned to the caller. For the latter, this is | |
1998 | * called immediately during free and before placing the objects into | |
1999 | * the bucket or slab layer. | |
2000 | */ | |
2001 | static void | |
2002 | mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) | |
2003 | { | |
2004 | mbuf_class_t class = (mbuf_class_t)arg; | |
2005 | mcache_audit_t *mca; | |
2006 | struct mbuf *m, *ms; | |
2007 | mcl_slab_t *clsp, *nsp; | |
2008 | size_t size; | |
2009 | void *cl; | |
2010 | ||
2011 | ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); | |
2012 | ||
2013 | while ((m = ms = (struct mbuf *)list) != NULL) { | |
2014 | lck_mtx_lock(mbuf_mlock); | |
2015 | /* Do the mbuf sanity checks and record its transaction */ | |
2016 | mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); | |
2017 | mcl_audit_mbuf(mca, m, TRUE, alloc); | |
2018 | mcache_buffer_log(mca, m, m_cache(class)); | |
2019 | if (alloc) | |
2020 | mca->mca_uflags |= MB_COMP_INUSE; | |
2021 | else | |
2022 | mca->mca_uflags &= ~MB_COMP_INUSE; | |
2023 | ||
2024 | /* | |
2025 | * Use the shadow mbuf in the audit structure if we are | |
2026 | * freeing, since the contents of the actual mbuf have been | |
2027 | * pattern-filled by the above call to mcl_audit_mbuf(). | |
2028 | */ | |
2029 | if (!alloc) | |
2030 | ms = (struct mbuf *)mca->mca_contents; | |
2031 | ||
2032 | /* Do the cluster sanity checks and record its transaction */ | |
2033 | cl = ms->m_ext.ext_buf; | |
2034 | clsp = slab_get(cl); | |
2035 | VERIFY(ms->m_flags == M_EXT && cl != NULL); | |
2036 | VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); | |
2037 | VERIFY(clsp->sl_refcnt == 1); | |
2038 | if (class == MC_MBUF_BIGCL) { | |
2039 | nsp = clsp->sl_next; | |
2040 | /* Next slab must already be present */ | |
2041 | VERIFY(nsp != NULL); | |
2042 | VERIFY(nsp->sl_refcnt == 1); | |
2043 | } else if (class == MC_MBUF_16KCL) { | |
2044 | int k; | |
2045 | for (nsp = clsp, k = 1; | |
2046 | k < (M16KCLBYTES / MCLBYTES); k++) { | |
2047 | nsp = nsp->sl_next; | |
2048 | /* Next slab must already be present */ | |
2049 | VERIFY(nsp != NULL); | |
2050 | VERIFY(nsp->sl_refcnt == 1); | |
2051 | } | |
2052 | } | |
2053 | ||
2054 | mca = mcl_audit_buf2mca(MC_CL, cl); | |
2055 | if (class == MC_MBUF_CL) | |
2056 | size = m_maxsize(MC_CL); | |
2057 | else if (class == MC_MBUF_BIGCL) | |
2058 | size = m_maxsize(MC_BIGCL); | |
2059 | else | |
2060 | size = m_maxsize(MC_16KCL); | |
2061 | mcl_audit_cluster(mca, cl, size, alloc, FALSE); | |
2062 | mcache_buffer_log(mca, cl, m_cache(class)); | |
2063 | if (alloc) | |
2064 | mca->mca_uflags |= MB_COMP_INUSE; | |
2065 | else | |
2066 | mca->mca_uflags &= ~MB_COMP_INUSE; | |
2067 | lck_mtx_unlock(mbuf_mlock); | |
2068 | ||
2069 | list = list->obj_next; | |
2070 | } | |
2071 | } | |
2072 | ||
2073 | /* | |
2074 | * Allocate some number of mbuf clusters and place on cluster freelist. | |
2075 | */ | |
2076 | static int | |
2077 | m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) | |
2078 | { | |
2079 | int i; | |
2080 | vm_size_t size = 0; | |
2081 | int numpages = 0; | |
2082 | vm_offset_t page = 0; | |
2083 | mcache_audit_t *mca_list = NULL; | |
2084 | mcache_obj_t *con_list = NULL; | |
2085 | mcl_slab_t *sp; | |
2086 | ||
2087 | VERIFY(bufsize == m_maxsize(MC_CL) || | |
2088 | bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL)); | |
2089 | ||
2090 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
2091 | ||
2092 | /* | |
2093 | * Multiple threads may attempt to populate the cluster map one | |
2094 | * after another. Since we drop the lock below prior to acquiring | |
2095 | * the physical page(s), our view of the cluster map may no longer | |
2096 | * be accurate, and we could end up over-committing the pages beyond | |
2097 | * the maximum allowed for each class. To prevent it, this entire | |
2098 | * operation (including the page mapping) is serialized. | |
2099 | */ | |
2100 | while (mb_clalloc_busy) { | |
2101 | mb_clalloc_waiters++; | |
2102 | (void) msleep(mb_clalloc_waitchan, mbuf_mlock, | |
2103 | (PZERO-1), "m_clalloc", NULL); | |
2104 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
2105 | } | |
2106 | ||
2107 | /* We are busy now; tell everyone else to go away */ | |
2108 | mb_clalloc_busy = TRUE; | |
2109 | ||
2110 | /* | |
2111 | * Honor the caller's wish to block or not block. We have a way | |
2112 | * to grow the pool asynchronously using the mbuf worker thread. | |
2113 | */ | |
2114 | i = m_howmany(num, bufsize); | |
2115 | if (i == 0 || (wait & M_DONTWAIT)) | |
2116 | goto out; | |
2117 | ||
2118 | lck_mtx_unlock(mbuf_mlock); | |
2119 | ||
2120 | size = round_page_32(i * bufsize); | |
2121 | page = kmem_mb_alloc(mb_map, size); | |
2122 | ||
2123 | if (page == 0) { | |
2124 | if (bufsize <= m_maxsize(MC_BIGCL)) { | |
2125 | /* If that failed, retry with a single page; 2KB/4KB requests only */ | |
2126 | size = NBPG; | |
2127 | page = kmem_mb_alloc(mb_map, size); | |
2128 | } | |
2129 | ||
2130 | if (page == 0) { | |
2131 | lck_mtx_lock(mbuf_mlock); | |
2132 | goto out; | |
2133 | } | |
2134 | } | |
2135 | ||
2136 | VERIFY(IS_P2ALIGNED(page, NBPG)); | |
2137 | numpages = size / NBPG; | |
2138 | ||
2139 | /* If auditing is enabled, allocate the audit structures now */ | |
2140 | if (mclaudit != NULL) { | |
2141 | int needed; | |
2142 | ||
2143 | /* | |
2144 | * Yes, I realize this is a waste of memory for clusters | |
2145 | * that never get transformed into mbufs, as we may end | |
2146 | * up with NMBPCL-1 unused audit structures per cluster. | |
2147 | * But doing so tremendously simplifies the allocation | |
2148 | * strategy, since at this point we are not holding the | |
2149 | * mbuf lock and the caller is okay to be blocked. For | |
2150 | * the case of big clusters, we allocate one structure | |
2151 | * for each as we never turn them into mbufs. | |
2152 | */ | |
2153 | if (bufsize == m_maxsize(MC_CL)) { | |
2154 | needed = numpages * 2 * NMBPCL; | |
2155 | ||
2156 | i = mcache_alloc_ext(mcl_audit_con_cache, | |
2157 | &con_list, needed, MCR_SLEEP); | |
2158 | ||
2159 | VERIFY(con_list != NULL && i == needed); | |
2160 | } else if (bufsize == m_maxsize(MC_BIGCL)) { | |
2161 | needed = numpages; | |
2162 | } else { | |
2163 | needed = numpages / (M16KCLBYTES / NBPG); | |
2164 | } | |
2165 | ||
2166 | i = mcache_alloc_ext(mcache_audit_cache, | |
2167 | (mcache_obj_t **)&mca_list, needed, MCR_SLEEP); | |
2168 | ||
2169 | VERIFY(mca_list != NULL && i == needed); | |
2170 | } | |
2171 | ||
2172 | lck_mtx_lock(mbuf_mlock); | |
2173 | ||
2174 | for (i = 0; i < numpages; i++, page += NBPG) { | |
2175 | ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG; | |
2176 | ppnum_t new_page = pmap_find_phys(kernel_pmap, | |
2177 | (vm_address_t)page); | |
2178 | ||
2179 | /* | |
2180 | * If no mapper is available, the following call is a no-op | |
2181 | * and returns the input page; if there is a mapper, the | |
2182 | * appropriate I/O page is returned. | |
2183 | */ | |
2184 | new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page); | |
2185 | mcl_paddr[offset] = new_page << PGSHIFT; | |
2186 | ||
2187 | /* Pattern-fill this fresh page */ | |
2188 | if (mclaudit != NULL) | |
2189 | mcache_set_pattern(MCACHE_FREE_PATTERN, | |
2190 | (caddr_t)page, NBPG); | |
2191 | ||
2192 | if (bufsize == m_maxsize(MC_CL)) { | |
2193 | union mcluster *mcl = (union mcluster *)page; | |
2194 | ||
2195 | /* 1st cluster in the page */ | |
2196 | sp = slab_get(mcl); | |
2197 | if (mclaudit != NULL) | |
2198 | mcl_audit_init(mcl, &mca_list, &con_list, | |
2199 | AUDIT_CONTENTS_SIZE, NMBPCL); | |
2200 | ||
2201 | VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); | |
2202 | slab_init(sp, MC_CL, SLF_MAPPED, | |
2203 | mcl, mcl, bufsize, 0, 1); | |
2204 | ||
2205 | /* Insert this slab */ | |
2206 | slab_insert(sp, MC_CL); | |
2207 | ||
2208 | /* Update stats now since slab_get() drops the lock */ | |
2209 | mbstat.m_clfree = ++m_infree(MC_CL) + | |
2210 | m_infree(MC_MBUF_CL); | |
2211 | mbstat.m_clusters = ++m_total(MC_CL); | |
2212 | VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); | |
2213 | ||
2214 | /* 2nd cluster in the page */ | |
2215 | sp = slab_get(++mcl); | |
2216 | if (mclaudit != NULL) | |
2217 | mcl_audit_init(mcl, &mca_list, &con_list, | |
2218 | AUDIT_CONTENTS_SIZE, NMBPCL); | |
2219 | ||
2220 | VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); | |
2221 | slab_init(sp, MC_CL, SLF_MAPPED, | |
2222 | mcl, mcl, bufsize, 0, 1); | |
2223 | ||
2224 | /* Insert this slab */ | |
2225 | slab_insert(sp, MC_CL); | |
2226 | ||
2227 | /* Update stats now since slab_get() drops the lock */ | |
2228 | mbstat.m_clfree = ++m_infree(MC_CL) + | |
2229 | m_infree(MC_MBUF_CL); | |
2230 | mbstat.m_clusters = ++m_total(MC_CL); | |
2231 | VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); | |
2232 | } else if (bufsize == m_maxsize(MC_BIGCL)) { | |
2233 | union mbigcluster *mbc = (union mbigcluster *)page; | |
2234 | mcl_slab_t *nsp; | |
2235 | ||
2236 | /* One for the entire page */ | |
2237 | sp = slab_get(mbc); | |
2238 | if (mclaudit != NULL) | |
2239 | mcl_audit_init(mbc, &mca_list, NULL, 0, 1); | |
2240 | ||
2241 | VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); | |
2242 | slab_init(sp, MC_BIGCL, SLF_MAPPED, | |
2243 | mbc, mbc, bufsize, 0, 1); | |
2244 | ||
2245 | /* 2nd cluster's slab is part of the previous one */ | |
2246 | nsp = slab_get(((union mcluster *)page) + 1); | |
2247 | slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL, | |
2248 | mbc, NULL, 0, 0, 0); | |
2249 | ||
2250 | /* Insert this slab */ | |
2251 | slab_insert(sp, MC_BIGCL); | |
2252 | ||
2253 | /* Update stats now since slab_get() drops the lock */ | |
2254 | mbstat.m_bigclfree = ++m_infree(MC_BIGCL) + | |
2255 | m_infree(MC_MBUF_BIGCL); | |
2256 | mbstat.m_bigclusters = ++m_total(MC_BIGCL); | |
2257 | VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); | |
2258 | } else if ((i % (M16KCLBYTES / NBPG)) == 0) { | |
2259 | union m16kcluster *m16kcl = (union m16kcluster *)page; | |
2260 | mcl_slab_t *nsp; | |
2261 | int k; | |
2262 | ||
2263 | VERIFY(njcl > 0); | |
2264 | /* One for the entire 16KB */ | |
2265 | sp = slab_get(m16kcl); | |
2266 | if (mclaudit != NULL) | |
2267 | mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1); | |
2268 | ||
2269 | VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); | |
2270 | slab_init(sp, MC_16KCL, SLF_MAPPED, | |
2271 | m16kcl, m16kcl, bufsize, 0, 1); | |
2272 | ||
2273 | /* 2nd-8th cluster's slab is part of the first one */ | |
2274 | for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { | |
2275 | nsp = slab_get(((union mcluster *)page) + k); | |
2276 | VERIFY(nsp->sl_refcnt == 0 && | |
2277 | nsp->sl_flags == 0); | |
2278 | slab_init(nsp, MC_16KCL, | |
2279 | SLF_MAPPED | SLF_PARTIAL, | |
2280 | m16kcl, NULL, 0, 0, 0); | |
2281 | } | |
2282 | ||
2283 | /* Insert this slab */ | |
2284 | slab_insert(sp, MC_16KCL); | |
2285 | ||
2286 | /* Update stats now since slab_get() drops the lock */ | |
2287 | m_infree(MC_16KCL)++; | |
2288 | m_total(MC_16KCL)++; | |
2289 | VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); | |
2290 | } | |
2291 | } | |
2292 | VERIFY(mca_list == NULL && con_list == NULL); | |
2293 | ||
2294 | /* We're done; let others enter */ | |
2295 | mb_clalloc_busy = FALSE; | |
2296 | if (mb_clalloc_waiters > 0) { | |
2297 | mb_clalloc_waiters = 0; | |
2298 | wakeup(mb_clalloc_waitchan); | |
2299 | } | |
2300 | ||
2301 | if (bufsize == m_maxsize(MC_CL)) | |
2302 | return (numpages << 1); | |
2303 | else if (bufsize == m_maxsize(MC_BIGCL)) | |
2304 | return (numpages); | |
2305 | ||
2306 | VERIFY(bufsize == m_maxsize(MC_16KCL)); | |
2307 | return (numpages / (M16KCLBYTES / NBPG)); | |
2308 | ||
2309 | out: | |
2310 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
2311 | ||
2312 | /* We're done; let others enter */ | |
2313 | mb_clalloc_busy = FALSE; | |
2314 | if (mb_clalloc_waiters > 0) { | |
2315 | mb_clalloc_waiters = 0; | |
2316 | wakeup(mb_clalloc_waitchan); | |
2317 | } | |
2318 | ||
2319 | /* | |
2320 | * When non-blocking, we kick the mbuf worker thread if we have to grow the | |
2321 | * pool or if the number of free clusters is less than requested. | |
2322 | */ | |
2323 | if (bufsize == m_maxsize(MC_CL)) { | |
2324 | if (i > 0) { | |
2325 | /* | |
2326 | * Remember total number of clusters needed | |
2327 | * at this time. | |
2328 | */ | |
2329 | i += m_total(MC_CL); | |
2330 | if (i > mbuf_expand_mcl) { | |
2331 | mbuf_expand_mcl = i; | |
2332 | if (mbuf_worker_ready) | |
2333 | wakeup((caddr_t)&mbuf_worker_run); | |
2334 | } | |
2335 | } | |
2336 | ||
2337 | if (m_infree(MC_CL) >= num) | |
2338 | return (1); | |
2339 | } else if (bufsize == m_maxsize(MC_BIGCL)) { | |
2340 | if (i > 0) { | |
2341 | /* | |
2342 | * Remember total number of 4KB clusters needed | |
2343 | * at this time. | |
2344 | */ | |
2345 | i += m_total(MC_BIGCL); | |
2346 | if (i > mbuf_expand_big) { | |
2347 | mbuf_expand_big = i; | |
2348 | if (mbuf_worker_ready) | |
2349 | wakeup((caddr_t)&mbuf_worker_run); | |
2350 | } | |
2351 | } | |
2352 | ||
2353 | if (m_infree(MC_BIGCL) >= num) | |
2354 | return (1); | |
2355 | } else { | |
2356 | if (i > 0) { | |
2357 | /* | |
2358 | * Remember total number of 16KB clusters needed | |
2359 | * at this time. | |
2360 | */ | |
2361 | i += m_total(MC_16KCL); | |
2362 | if (i > mbuf_expand_16k) { | |
2363 | mbuf_expand_16k = i; | |
2364 | if (mbuf_worker_ready) | |
2365 | wakeup((caddr_t)&mbuf_worker_run); | |
2366 | } | |
2367 | } | |
2368 | ||
2369 | if (m_infree(MC_16KCL) >= num) | |
2370 | return (1); | |
2371 | } | |
2372 | return (0); | |
2373 | } | |
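/*
 * Sketch (not part of the original source): the mb_clalloc_busy and
 * mb_clalloc_waiters handling above is a single-owner gate built on
 * msleep()/wakeup() under mbuf_mlock; its generic shape, with
 * hypothetical names, is:
 */
#if 0
	lck_mtx_lock(lock);
	while (busy) {			/* someone else is growing the map */
		waiters++;
		(void) msleep(waitchan, lock, (PZERO-1), "example", NULL);
	}
	busy = TRUE;			/* we own the slow path now */
	lck_mtx_unlock(lock);
	/* ... allocate and map pages without holding the lock ... */
	lck_mtx_lock(lock);
	busy = FALSE;
	if (waiters > 0) {		/* let any blocked threads retry */
		waiters = 0;
		wakeup(waitchan);
	}
	lck_mtx_unlock(lock);
#endif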
2374 | ||
2375 | /* | |
2376 | * Populate the global freelist of the corresponding buffer class. | |
2377 | */ | |
2378 | static int | |
2379 | freelist_populate(mbuf_class_t class, unsigned int num, int wait) | |
2380 | { | |
2381 | mcache_obj_t *o = NULL; | |
2382 | int i; | |
2383 | ||
2384 | VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL || | |
2385 | class == MC_16KCL); | |
2386 | ||
2387 | #if CONFIG_MBUF_NOEXPAND | |
2388 | if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) { | |
2389 | #if DEBUG | |
2390 | static int printonce = 1; | |
2391 | if (printonce == 1) { | |
2392 | printonce = 0; | |
2393 | printf("m_expand failed, allocated %ld out of %d " | |
2394 | "clusters\n", mbstat.m_mbufs / NMBPCL, | |
2395 | nmbclusters); | |
2396 | } | |
2397 | #endif /* DEBUG */ | |
2398 | return (0); | |
2399 | } | |
2400 | #endif /* CONFIG_MBUF_NOEXPAND */ | |
2401 | ||
2402 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
2403 | ||
2404 | switch (class) { | |
2405 | case MC_MBUF: | |
2406 | case MC_CL: | |
2407 | i = m_clalloc(num, wait, m_maxsize(MC_CL)); | |
2408 | ||
2409 | /* Respect the 2K clusters minimum limit */ | |
2410 | if (m_total(MC_CL) == m_maxlimit(MC_CL) && | |
2411 | m_infree(MC_CL) <= m_minlimit(MC_CL)) { | |
2412 | if (class != MC_CL || (wait & MCR_COMP)) | |
2413 | return (0); | |
2414 | } | |
2415 | if (class == MC_CL) | |
2416 | return (i != 0); | |
2417 | break; | |
2418 | ||
2419 | case MC_BIGCL: | |
2420 | case MC_16KCL: | |
2421 | return (m_clalloc(num, wait, m_maxsize(class)) != 0); | |
2422 | /* NOTREACHED */ | |
2423 | ||
2424 | default: | |
2425 | VERIFY(0); | |
2426 | /* NOTREACHED */ | |
2427 | } | |
2428 | ||
2429 | /* Steal a cluster and cut it up to create NMBPCL mbufs */ | |
2430 | if ((o = slab_alloc(MC_CL, wait)) != NULL) { | |
2431 | struct mbuf *m = (struct mbuf *)o; | |
2432 | mcache_audit_t *mca = NULL; | |
2433 | mcl_slab_t *sp = slab_get(o); | |
2434 | ||
2435 | VERIFY(slab_is_detached(sp) && | |
2436 | (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); | |
2437 | ||
2438 | /* Make sure that the cluster is unmolested while in freelist */ | |
2439 | if (mclaudit != NULL) { | |
2440 | mca = mcl_audit_buf2mca(MC_CL, o); | |
2441 | mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL)); | |
2442 | } | |
2443 | ||
2444 | /* Reinitialize it as an mbuf slab */ | |
2445 | slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL, | |
2446 | sp->sl_len, 0, NMBPCL); | |
2447 | ||
2448 | VERIFY(m == (struct mbuf *)sp->sl_base); | |
2449 | VERIFY(sp->sl_head == NULL); | |
2450 | ||
2451 | m_total(MC_MBUF) += NMBPCL; | |
2452 | mbstat.m_mbufs = m_total(MC_MBUF); | |
2453 | m_infree(MC_MBUF) += NMBPCL; | |
2454 | mtype_stat_add(MT_FREE, NMBPCL); | |
2455 | ||
2456 | i = NMBPCL; | |
2457 | while (i--) { | |
2458 | /* | |
2459 | * If auditing is enabled, construct the shadow mbuf | |
2460 | * in the audit structure instead of the actual one. | |
2461 | * mbuf_slab_audit() will take care of restoring the | |
2462 | * contents after the integrity check. | |
2463 | */ | |
2464 | if (mclaudit != NULL) { | |
2465 | struct mbuf *ms; | |
2466 | mca = mcl_audit_buf2mca(MC_MBUF, | |
2467 | (mcache_obj_t *)m); | |
2468 | ms = ((struct mbuf *)mca->mca_contents); | |
2469 | ms->m_type = MT_FREE; | |
2470 | } else { | |
2471 | m->m_type = MT_FREE; | |
2472 | } | |
2473 | m->m_next = sp->sl_head; | |
2474 | sp->sl_head = (void *)m++; | |
2475 | } | |
2476 | ||
2477 | /* Insert it into the mbuf class's slab list */ | |
2478 | slab_insert(sp, MC_MBUF); | |
2479 | ||
2480 | if ((i = mb_waiters) > 0) | |
2481 | mb_waiters = 0; | |
2482 | if (i != 0) | |
2483 | wakeup(mb_waitchan); | |
2484 | ||
2485 | return (1); | |
2486 | } | |
2487 | ||
2488 | return (0); | |
2489 | } | |
2490 | ||
2491 | /* | |
2492 | * (Inaccurately) check if it might be worth a trip back to the | |
2493 | * mcache layer due to the availability of objects there. We'll | |
2494 | * end up back here if there's nothing up there. | |
2495 | */ | |
2496 | static boolean_t | |
2497 | mbuf_cached_above(mbuf_class_t class, int wait) | |
2498 | { | |
2499 | switch (class) { | |
2500 | case MC_MBUF: | |
2501 | if (wait & MCR_COMP) | |
2502 | return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) || | |
2503 | !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL))); | |
2504 | break; | |
2505 | ||
2506 | case MC_CL: | |
2507 | if (wait & MCR_COMP) | |
2508 | return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL))); | |
2509 | break; | |
2510 | ||
2511 | case MC_BIGCL: | |
2512 | if (wait & MCR_COMP) | |
2513 | return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL))); | |
2514 | break; | |
2515 | ||
2516 | case MC_16KCL: | |
2517 | if (wait & MCR_COMP) | |
2518 | return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL))); | |
2519 | break; | |
2520 | ||
2521 | case MC_MBUF_CL: | |
2522 | case MC_MBUF_BIGCL: | |
2523 | case MC_MBUF_16KCL: | |
2524 | break; | |
2525 | ||
2526 | default: | |
2527 | VERIFY(0); | |
2528 | /* NOTREACHED */ | |
2529 | } | |
2530 | ||
2531 | return (!mcache_bkt_isempty(m_cache(class))); | |
2532 | } | |
2533 | ||
2534 | /* | |
2535 | * If possible, convert constructed objects to raw ones. | |
2536 | */ | |
2537 | static boolean_t | |
2538 | mbuf_steal(mbuf_class_t class, unsigned int num) | |
2539 | { | |
2540 | mcache_obj_t *top = NULL; | |
2541 | mcache_obj_t **list = &top; | |
2542 | unsigned int tot = 0; | |
2543 | ||
2544 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
2545 | ||
2546 | switch (class) { | |
2547 | case MC_MBUF: | |
2548 | case MC_CL: | |
2549 | case MC_BIGCL: | |
2550 | case MC_16KCL: | |
2551 | return (FALSE); | |
2552 | ||
2553 | case MC_MBUF_CL: | |
2554 | case MC_MBUF_BIGCL: | |
2555 | case MC_MBUF_16KCL: | |
2556 | /* Get the required number of constructed objects if possible */ | |
2557 | if (m_infree(class) > m_minlimit(class)) { | |
2558 | tot = cslab_alloc(class, &list, | |
2559 | MIN(num, m_infree(class))); | |
2560 | } | |
2561 | ||
2562 | /* And destroy them to get back the raw objects */ | |
2563 | if (top != NULL) | |
2564 | (void) cslab_free(class, top, 1); | |
2565 | break; | |
2566 | ||
2567 | default: | |
2568 | VERIFY(0); | |
2569 | /* NOTREACHED */ | |
2570 | } | |
2571 | ||
2572 | return (tot == num); | |
2573 | } | |
2574 | ||
2575 | static void | |
2576 | m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) | |
2577 | { | |
2578 | int m, bmap = 0; | |
2579 | ||
2580 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
2581 | ||
2582 | VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); | |
2583 | VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); | |
2584 | VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); | |
2585 | ||
2586 | /* | |
2587 | * This logic can be made smarter; for now, simply mark | |
2588 | * all other related classes as potential victims. | |
2589 | */ | |
2590 | switch (class) { | |
2591 | case MC_MBUF: | |
2592 | m_wantpurge(MC_CL)++; | |
2593 | m_wantpurge(MC_MBUF_CL)++; | |
2594 | m_wantpurge(MC_MBUF_BIGCL)++; | |
2595 | break; | |
2596 | ||
2597 | case MC_CL: | |
2598 | m_wantpurge(MC_MBUF)++; | |
2599 | if (!comp) | |
2600 | m_wantpurge(MC_MBUF_CL)++; | |
2601 | break; | |
2602 | ||
2603 | case MC_BIGCL: | |
2604 | if (!comp) | |
2605 | m_wantpurge(MC_MBUF_BIGCL)++; | |
2606 | break; | |
2607 | ||
2608 | case MC_16KCL: | |
2609 | if (!comp) | |
2610 | m_wantpurge(MC_MBUF_16KCL)++; | |
2611 | break; | |
2612 | ||
2613 | default: | |
2614 | VERIFY(0); | |
2615 | /* NOTREACHED */ | |
2616 | } | |
2617 | ||
2618 | /* | |
2619 | * Run through each marked class and check if we really need to | |
2620 | * purge (and therefore temporarily disable) the per-CPU caches | |
2621 | * layer used by the class. If so, remember the classes since | |
2622 | * we are going to drop the lock below prior to purging. | |
2623 | */ | |
2624 | for (m = 0; m < NELEM(mbuf_table); m++) { | |
2625 | if (m_wantpurge(m) > 0) { | |
2626 | m_wantpurge(m) = 0; | |
2627 | /* | |
2628 | * Try hard to steal the required number of objects | |
2629 | * from the freelist of other mbuf classes. Only | |
2630 | * purge and disable the per-CPU caches layer when | |
2631 | * we don't have enough; it's the last resort. | |
2632 | */ | |
2633 | if (!mbuf_steal(m, num)) | |
2634 | bmap |= (1 << m); | |
2635 | } | |
2636 | } | |
2637 | ||
2638 | lck_mtx_unlock(mbuf_mlock); | |
2639 | ||
2640 | if (bmap != 0) { | |
2641 | /* drain is performed in pfslowtimo(), to avoid deadlocks */ | |
2642 | do_reclaim = 1; | |
2643 | ||
2644 | /* Sigh; we have no other choices but to ask mcache to purge */ | |
2645 | for (m = 0; m < NELEM(mbuf_table); m++) { | |
2646 | if ((bmap & (1 << m)) && | |
2647 | mcache_purge_cache(m_cache(m))) { | |
2648 | lck_mtx_lock(mbuf_mlock); | |
2649 | m_purge_cnt(m)++; | |
2650 | mbstat.m_drain++; | |
2651 | lck_mtx_unlock(mbuf_mlock); | |
2652 | } | |
2653 | } | |
2654 | } else { | |
2655 | /* | |
2656 | * Request mcache to reap extra elements from all of its caches; | |
2657 | * note that all reaps are serialized and happen only at a fixed | |
2658 | * interval. | |
2659 | */ | |
2660 | mcache_reap(); | |
2661 | } | |
2662 | lck_mtx_lock(mbuf_mlock); | |
2663 | } | |
2664 | ||
2665 | static inline struct mbuf * | |
2666 | m_get_common(int wait, short type, int hdr) | |
2667 | { | |
2668 | struct mbuf *m; | |
2669 | int mcflags = MSLEEPF(wait); | |
2670 | ||
2671 | /* Is this due to a non-blocking retry? If so, then try harder */ | |
2672 | if (mcflags & MCR_NOSLEEP) | |
2673 | mcflags |= MCR_TRYHARD; | |
2674 | ||
2675 | m = mcache_alloc(m_cache(MC_MBUF), mcflags); | |
2676 | if (m != NULL) { | |
2677 | MBUF_INIT(m, hdr, type); | |
2678 | mtype_stat_inc(type); | |
2679 | mtype_stat_dec(MT_FREE); | |
2680 | #if CONFIG_MACF_NET | |
2681 | if (hdr && mac_init_mbuf(m, wait) != 0) { | |
2682 | m_free(m); | |
2683 | return (NULL); | |
2684 | } | |
2685 | #endif /* CONFIG_MACF_NET */ | |
2686 | } | |
2687 | return (m); | |
2688 | } | |
2689 | ||
2690 | /* | |
2691 | * Space allocation routines; these are also available as macros | |
2692 | * for critical paths. | |
2693 | */ | |
2694 | #define _M_GET(wait, type) m_get_common(wait, type, 0) | |
2695 | #define _M_GETHDR(wait, type) m_get_common(wait, type, 1) | |
2696 | #define _M_RETRY(wait, type) _M_GET(wait, type) | |
2697 | #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type) | |
2698 | #define _MGET(m, how, type) ((m) = _M_GET(how, type)) | |
2699 | #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type)) | |
2700 | ||
2701 | struct mbuf * | |
2702 | m_get(int wait, int type) | |
2703 | { | |
2704 | return (_M_GET(wait, type)); | |
2705 | } | |
2706 | ||
2707 | struct mbuf * | |
2708 | m_gethdr(int wait, int type) | |
2709 | { | |
2710 | return (_M_GETHDR(wait, type)); | |
2711 | } | |
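/*
 * Illustrative usage (not part of the original source): allocating a
 * packet-header mbuf without blocking and releasing it.  MT_DATA and
 * the ENOBUFS policy are assumptions made for the example.
 */
#if 0
static int
my_mbuf_example(void)
{
	struct mbuf *m;

	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
		return (ENOBUFS);
	m->m_pkthdr.len = m->m_len = 0;
	/* ... fill in up to MHLEN bytes of data ... */
	(void) m_free(m);	/* m_free() returns m->m_next, NULL here */
	return (0);
}
#endif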
2712 | ||
2713 | struct mbuf * | |
2714 | m_retry(int wait, int type) | |
2715 | { | |
2716 | return (_M_RETRY(wait, type)); | |
2717 | } | |
2718 | ||
2719 | struct mbuf * | |
2720 | m_retryhdr(int wait, int type) | |
2721 | { | |
2722 | return (_M_RETRYHDR(wait, type)); | |
2723 | } | |
2724 | ||
2725 | struct mbuf * | |
2726 | m_getclr(int wait, int type) | |
2727 | { | |
2728 | struct mbuf *m; | |
2729 | ||
2730 | _MGET(m, wait, type); | |
2731 | if (m != NULL) | |
2732 | bzero(MTOD(m, caddr_t), MLEN); | |
2733 | return (m); | |
2734 | } | |
2735 | ||
2736 | struct mbuf * | |
2737 | m_free(struct mbuf *m) | |
2738 | { | |
2739 | struct mbuf *n = m->m_next; | |
2740 | ||
2741 | if (m->m_type == MT_FREE) | |
2742 | panic("m_free: freeing an already freed mbuf"); | |
2743 | ||
2744 | /* Free the aux data and tags if there is any */ | |
2745 | if (m->m_flags & M_PKTHDR) { | |
2746 | m_tag_delete_chain(m, NULL); | |
2747 | } | |
2748 | ||
2749 | if (m->m_flags & M_EXT) { | |
2750 | u_int32_t refcnt; | |
2751 | u_int32_t flags; | |
2752 | ||
2753 | refcnt = m_decref(m); | |
2754 | flags = MEXT_FLAGS(m); | |
2755 | if (refcnt == 0 && flags == 0) { | |
2756 | if (m->m_ext.ext_free == NULL) { | |
2757 | mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); | |
2758 | } else if (m->m_ext.ext_free == m_bigfree) { | |
2759 | mcache_free(m_cache(MC_BIGCL), | |
2760 | m->m_ext.ext_buf); | |
2761 | } else if (m->m_ext.ext_free == m_16kfree) { | |
2762 | mcache_free(m_cache(MC_16KCL), | |
2763 | m->m_ext.ext_buf); | |
2764 | } else { | |
2765 | (*(m->m_ext.ext_free))(m->m_ext.ext_buf, | |
2766 | m->m_ext.ext_size, m->m_ext.ext_arg); | |
2767 | } | |
2768 | mcache_free(ref_cache, MEXT_RFA(m)); | |
2769 | MEXT_RFA(m) = NULL; | |
2770 | } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { | |
2771 | VERIFY(m->m_type != MT_FREE); | |
2772 | ||
2773 | mtype_stat_dec(m->m_type); | |
2774 | mtype_stat_inc(MT_FREE); | |
2775 | ||
2776 | m->m_type = MT_FREE; | |
2777 | m->m_flags = M_EXT; | |
2778 | m->m_len = 0; | |
2779 | m->m_next = m->m_nextpkt = NULL; | |
2780 | ||
2781 | /* "Free" into the intermediate cache */ | |
2782 | if (m->m_ext.ext_free == NULL) { | |
2783 | mcache_free(m_cache(MC_MBUF_CL), m); | |
2784 | } else if (m->m_ext.ext_free == m_bigfree) { | |
2785 | mcache_free(m_cache(MC_MBUF_BIGCL), m); | |
2786 | } else { | |
2787 | VERIFY(m->m_ext.ext_free == m_16kfree); | |
2788 | mcache_free(m_cache(MC_MBUF_16KCL), m); | |
2789 | } | |
2790 | return (n); | |
2791 | } | |
2792 | } | |
2793 | ||
2794 | if (m->m_type != MT_FREE) { | |
2795 | mtype_stat_dec(m->m_type); | |
2796 | mtype_stat_inc(MT_FREE); | |
2797 | } | |
2798 | ||
2799 | m->m_type = MT_FREE; | |
2800 | m->m_flags = m->m_len = 0; | |
2801 | m->m_next = m->m_nextpkt = NULL; | |
2802 | ||
2803 | mcache_free(m_cache(MC_MBUF), m); | |
2804 | ||
2805 | return (n); | |
2806 | } | |
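/*
 * Sketch (not part of the original source): because m_free() hands back
 * the next mbuf in the chain, an entire chain can be released with a
 * simple loop; this is essentially what m_freem() does.
 */
#if 0
static void
my_free_chain(struct mbuf *m)
{
	while (m != NULL)
		m = m_free(m);
}
#endif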
2807 | ||
2808 | __private_extern__ struct mbuf * | |
2809 | m_clattach(struct mbuf *m, int type, caddr_t extbuf, | |
2810 | void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg, | |
2811 | int wait) | |
2812 | { | |
2813 | struct ext_ref *rfa = NULL; | |
2814 | ||
2815 | if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL) | |
2816 | return (NULL); | |
2817 | ||
2818 | if (m->m_flags & M_EXT) { | |
2819 | u_int32_t refcnt; | |
2820 | u_int32_t flags; | |
2821 | ||
2822 | refcnt = m_decref(m); | |
2823 | flags = MEXT_FLAGS(m); | |
2824 | if (refcnt == 0 && flags == 0) { | |
2825 | if (m->m_ext.ext_free == NULL) { | |
2826 | mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); | |
2827 | } else if (m->m_ext.ext_free == m_bigfree) { | |
2828 | mcache_free(m_cache(MC_BIGCL), | |
2829 | m->m_ext.ext_buf); | |
2830 | } else if (m->m_ext.ext_free == m_16kfree) { | |
2831 | mcache_free(m_cache(MC_16KCL), | |
2832 | m->m_ext.ext_buf); | |
2833 | } else { | |
2834 | (*(m->m_ext.ext_free))(m->m_ext.ext_buf, | |
2835 | m->m_ext.ext_size, m->m_ext.ext_arg); | |
2836 | } | |
2837 | /* Re-use the reference structure */ | |
2838 | rfa = MEXT_RFA(m); | |
2839 | } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { | |
2840 | VERIFY(m->m_type != MT_FREE); | |
2841 | ||
2842 | mtype_stat_dec(m->m_type); | |
2843 | mtype_stat_inc(MT_FREE); | |
2844 | ||
2845 | m->m_type = MT_FREE; | |
2846 | m->m_flags = M_EXT; | |
2847 | m->m_len = 0; | |
2848 | m->m_next = m->m_nextpkt = NULL; | |
2849 | /* "Free" into the intermediate cache */ | |
2850 | if (m->m_ext.ext_free == NULL) { | |
2851 | mcache_free(m_cache(MC_MBUF_CL), m); | |
2852 | } else if (m->m_ext.ext_free == m_bigfree) { | |
2853 | mcache_free(m_cache(MC_MBUF_BIGCL), m); | |
2854 | } else { | |
2855 | VERIFY(m->m_ext.ext_free == m_16kfree); | |
2856 | mcache_free(m_cache(MC_MBUF_16KCL), m); | |
2857 | } | |
2858 | /* | |
2859 | * Allocate a new mbuf, since we didn't divorce | |
2860 | * the composite mbuf + cluster pair above. | |
2861 | */ | |
2862 | if ((m = _M_GETHDR(wait, type)) == NULL) | |
2863 | return (NULL); | |
2864 | } | |
2865 | } | |
2866 | ||
2867 | if (rfa == NULL && | |
2868 | (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { | |
2869 | m_free(m); | |
2870 | return (NULL); | |
2871 | } | |
2872 | ||
2873 | MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0); | |
2874 | ||
2875 | return (m); | |
2876 | } | |
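/*
 * Sketch (not part of the original source): wrapping a caller-owned
 * buffer with m_clattach().  The buffer, its size and my_ext_free()
 * are hypothetical; the free routine must have the
 * (caddr_t, u_int, caddr_t) signature expected above.
 */
#if 0
static void
my_ext_free(caddr_t buf, u_int size, caddr_t arg)
{
	/* return "buf" to wherever it came from */
}

static struct mbuf *
my_wrap_buffer(caddr_t mybuf, u_int mybufsize, int wait)
{
	struct mbuf *m;

	m = m_clattach(NULL, MT_DATA, mybuf, my_ext_free, mybufsize,
	    NULL, wait);
	/* NULL means no mbuf could be had and "mybuf" was not attached */
	return (m);
}
#endif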
2877 | ||
2878 | /* m_mclget() adds an mbuf cluster to a normal mbuf */ | |
2879 | struct mbuf * | |
2880 | m_mclget(struct mbuf *m, int wait) | |
2881 | { | |
2882 | struct ext_ref *rfa; | |
2883 | ||
2884 | if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) | |
2885 | return (m); | |
2886 | ||
2887 | m->m_ext.ext_buf = m_mclalloc(wait); | |
2888 | if (m->m_ext.ext_buf != NULL) { | |
2889 | MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); | |
2890 | } else { | |
2891 | mcache_free(ref_cache, rfa); | |
2892 | } | |
2893 | return (m); | |
2894 | } | |
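/*
 * Sketch (not part of the original source): the usual pattern for
 * requesting a 2KB cluster.  m_mclget() always returns the mbuf, so the
 * caller checks M_EXT to see whether a cluster was actually attached.
 */
#if 0
static struct mbuf *
my_get_cluster_mbuf(int wait)
{
	struct mbuf *m;

	if ((m = m_get(wait, MT_DATA)) == NULL)
		return (NULL);
	m = m_mclget(m, wait);
	if (!(m->m_flags & M_EXT)) {
		/* no cluster was available; give the plain mbuf back */
		(void) m_free(m);
		return (NULL);
	}
	return (m);
}
#endif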
2895 | ||
2896 | /* Allocate an mbuf cluster */ | |
2897 | caddr_t | |
2898 | m_mclalloc(int wait) | |
2899 | { | |
2900 | int mcflags = MSLEEPF(wait); | |
2901 | ||
2902 | /* Is this due to a non-blocking retry? If so, then try harder */ | |
2903 | if (mcflags & MCR_NOSLEEP) | |
2904 | mcflags |= MCR_TRYHARD; | |
2905 | ||
2906 | return (mcache_alloc(m_cache(MC_CL), mcflags)); | |
2907 | } | |
2908 | ||
2909 | /* Free an mbuf cluster */ | |
2910 | void | |
2911 | m_mclfree(caddr_t p) | |
2912 | { | |
2913 | mcache_free(m_cache(MC_CL), p); | |
2914 | } | |
2915 | ||
2916 | /* | |
2917 | * m_mclhasreference() checks if a cluster of an mbuf is referenced by | |
2918 | * another mbuf | |
2919 | */ | |
2920 | int | |
2921 | m_mclhasreference(struct mbuf *m) | |
2922 | { | |
2923 | if (!(m->m_flags & M_EXT)) | |
2924 | return (0); | |
9bccf70c | 2925 | |
2d21ac55 A |
2926 | ASSERT(MEXT_RFA(m) != NULL); |
2927 | ||
2928 | return (MEXT_REF(m) > 1); | |
9bccf70c A |
2929 | } |
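/*
 * Sketch (not part of the original source): before writing into cluster
 * data in place, a caller can use m_mclhasreference() to decide whether
 * the cluster is shared and must be copied first.
 */
#if 0
	if (m_mclhasreference(m)) {
		/* cluster shared with another mbuf; copy before writing */
	}
#endif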
2930 | ||
2d21ac55 A |
2931 | __private_extern__ caddr_t |
2932 | m_bigalloc(int wait) | |
9bccf70c | 2933 | { |
2d21ac55 | 2934 | int mcflags = MSLEEPF(wait); |
91447636 | 2935 | |
2d21ac55 A |
2936 | /* Is this due to a non-blocking retry? If so, then try harder */ |
2937 | if (mcflags & MCR_NOSLEEP) | |
2938 | mcflags |= MCR_TRYHARD; | |
91447636 | 2939 | |
2d21ac55 | 2940 | return (mcache_alloc(m_cache(MC_BIGCL), mcflags)); |
9bccf70c A |
2941 | } |
2942 | ||
2d21ac55 A |
2943 | __private_extern__ void |
2944 | m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg) | |
9bccf70c | 2945 | { |
2d21ac55 | 2946 | mcache_free(m_cache(MC_BIGCL), p); |
9bccf70c A |
2947 | } |
2948 | ||
2d21ac55 A |
2949 | /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */ |
2950 | __private_extern__ struct mbuf * | |
2951 | m_mbigget(struct mbuf *m, int wait) | |
2952 | { | |
2953 | struct ext_ref *rfa; | |
2954 | ||
2955 | if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) | |
2956 | return (m); | |
2957 | ||
2958 | m->m_ext.ext_buf = m_bigalloc(wait); | |
2959 | if (m->m_ext.ext_buf != NULL) { | |
2960 | MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); | |
91447636 | 2961 | } else { |
2d21ac55 | 2962 | mcache_free(ref_cache, rfa); |
91447636 | 2963 | } |
2d21ac55 A |
2964 | return (m); |
2965 | } | |
2966 | ||
2967 | __private_extern__ caddr_t | |
2968 | m_16kalloc(int wait) | |
2969 | { | |
2970 | int mcflags = MSLEEPF(wait); | |
2971 | ||
2972 | /* Is this due to a non-blocking retry? If so, then try harder */ | |
2973 | if (mcflags & MCR_NOSLEEP) | |
2974 | mcflags |= MCR_TRYHARD; | |
2975 | ||
2976 | return (mcache_alloc(m_cache(MC_16KCL), mcflags)); | |
91447636 A |
2977 | } |
2978 | ||
2979 | __private_extern__ void | |
2d21ac55 | 2980 | m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg) |
91447636 | 2981 | { |
2d21ac55 | 2982 | mcache_free(m_cache(MC_16KCL), p); |
91447636 A |
2983 | } |
2984 | ||
2d21ac55 | 2985 | /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
91447636 | 2986 | __private_extern__ struct mbuf * |
2d21ac55 | 2987 | m_m16kget(struct mbuf *m, int wait) |
91447636 | 2988 | { |
2d21ac55 A |
2989 | struct ext_ref *rfa; |
2990 | ||
2991 | if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) | |
2992 | return (m); | |
2993 | ||
2994 | m->m_ext.ext_buf = m_16kalloc(wait); | |
2995 | if (m->m_ext.ext_buf != NULL) { | |
2996 | MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); | |
2997 | } else { | |
2998 | mcache_free(ref_cache, rfa); | |
91447636 | 2999 | } |
2d21ac55 | 3000 | return (m); |
91447636 A |
3001 | } |
3002 | ||
9bccf70c A |
3003 | /* m_copy_pkthdr() copies the pkthdr from "from" to "to", moving its tags */ | |
3004 | void | |
2d21ac55 | 3005 | m_copy_pkthdr(struct mbuf *to, struct mbuf *from) |
9bccf70c | 3006 | { |
2d21ac55 A |
3007 | #if CONFIG_MACF_NET |
3008 | /* We will be taking over the tags of 'to' */ | |
3009 | if (to->m_flags & M_PKTHDR) | |
3010 | m_tag_delete_chain(to, NULL); | |
3011 | #endif /* MAC_NET */ | |
3012 | to->m_pkthdr = from->m_pkthdr; /* especially tags */ | |
3013 | m_tag_init(from); /* purge tags from src */ | |
9bccf70c A |
3014 | to->m_flags = from->m_flags & M_COPYFLAGS; |
3015 | to->m_data = (to)->m_pktdat; | |
3016 | } | |
3017 | ||
91447636 A |
3018 | /* |
3019 | * Duplicate "from"'s mbuf pkthdr in "to". | |
3020 | * "from" must have M_PKTHDR set, and "to" must be empty. | |
3021 | * In particular, this does a deep copy of the packet tags. | |
3022 | */ | |
3a60a9f5 | 3023 | static int |
91447636 A |
3024 | m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) |
3025 | { | |
2d21ac55 A |
3026 | #if CONFIG_MACF_NET |
3027 | if (to->m_flags & M_PKTHDR) | |
3028 | m_tag_delete_chain(to, NULL); | |
3029 | #endif /* MAC_NET */ | |
3030 | to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); | |
3031 | if ((to->m_flags & M_EXT) == 0) | |
3032 | to->m_data = to->m_pktdat; | |
3033 | to->m_pkthdr = from->m_pkthdr; | |
3034 | m_tag_init(to); | |
3035 | return (m_tag_copy_chain(to, from, how)); | |
91447636 | 3036 | } |
fa4905b1 | 3037 | |
9bccf70c | 3038 | /* |
2d21ac55 A |
3039 | * Return a list of mbuf hdrs that point to clusters. Try for num_needed; |
3040 | * if wantall is not set, return whatever number was available. Set up the | |
3041 | * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these | |
3042 | * are chained on the m_nextpkt field. Any packets requested beyond this | |
3043 | * are chained onto the last packet header's m_next field. The size of | |
3044 | * the cluster is controlled by the parameter bufsize. | |
9bccf70c | 3045 | */ |
91447636 | 3046 | __private_extern__ struct mbuf * |
2d21ac55 A |
3047 | m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, |
3048 | int wait, int wantall, size_t bufsize) | |
fa4905b1 A |
3049 | { |
3050 | struct mbuf *m; | |
3051 | struct mbuf **np, *top; | |
2d21ac55 A |
3052 | unsigned int pnum, needed = *num_needed; |
3053 | mcache_obj_t *mp_list = NULL; | |
3054 | int mcflags = MSLEEPF(wait); | |
3055 | u_int32_t flag; | |
3056 | struct ext_ref *rfa; | |
3057 | mcache_t *cp; | |
3058 | void *cl; | |
3059 | ||
3060 | ASSERT(bufsize == m_maxsize(MC_CL) || | |
3061 | bufsize == m_maxsize(MC_BIGCL) || | |
3062 | bufsize == m_maxsize(MC_16KCL)); | |
3063 | ||
3064 | /* | |
3065 | * Caller must first check for njcl because this | |
3066 | * routine is internal and not exposed/used via KPI. | |
3067 | */ | |
3068 | VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0); | |
3069 | ||
fa4905b1 A |
3070 | top = NULL; |
3071 | np = ⊤ | |
2d21ac55 | 3072 | pnum = 0; |
fa4905b1 | 3073 | |
2d21ac55 A |
3074 | /* |
3075 | * The caller doesn't want all the requested buffers; only some. | |
3076 | * Try hard to get what we can, but don't block. This effectively | |
3077 | * overrides MCR_SLEEP, since this thread will not go to sleep | |
3078 | * if we can't get all the buffers. | |
3079 | */ | |
3080 | if (!wantall || (mcflags & MCR_NOSLEEP)) | |
3081 | mcflags |= MCR_TRYHARD; | |
3082 | ||
3083 | /* Allocate the composite mbuf + cluster elements from the cache */ | |
3084 | if (bufsize == m_maxsize(MC_CL)) | |
3085 | cp = m_cache(MC_MBUF_CL); | |
3086 | else if (bufsize == m_maxsize(MC_BIGCL)) | |
3087 | cp = m_cache(MC_MBUF_BIGCL); | |
3088 | else | |
3089 | cp = m_cache(MC_MBUF_16KCL); | |
3090 | needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags); | |
3091 | ||
3092 | for (pnum = 0; pnum < needed; pnum++) { | |
3093 | m = (struct mbuf *)mp_list; | |
3094 | mp_list = mp_list->obj_next; | |
3095 | ||
3096 | VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); | |
3097 | cl = m->m_ext.ext_buf; | |
3098 | rfa = MEXT_RFA(m); | |
3099 | ||
3100 | ASSERT(cl != NULL && rfa != NULL); | |
3101 | VERIFY(MBUF_IS_COMPOSITE(m)); | |
3102 | ||
3103 | flag = MEXT_FLAGS(m); | |
3104 | ||
3105 | MBUF_INIT(m, num_with_pkthdrs, MT_DATA); | |
3106 | if (bufsize == m_maxsize(MC_16KCL)) { | |
3107 | MBUF_16KCL_INIT(m, cl, rfa, 1, flag); | |
3108 | } else if (bufsize == m_maxsize(MC_BIGCL)) { | |
3109 | MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); | |
91447636 | 3110 | } else { |
2d21ac55 A |
3111 | MBUF_CL_INIT(m, cl, rfa, 1, flag); |
3112 | } | |
3113 | ||
3114 | if (num_with_pkthdrs > 0) { | |
3115 | --num_with_pkthdrs; | |
3116 | #if CONFIG_MACF_NET | |
3117 | if (mac_mbuf_label_init(m, wait) != 0) { | |
91447636 | 3118 | m_free(m); |
2d21ac55 | 3119 | break; |
91447636 | 3120 | } |
2d21ac55 | 3121 | #endif /* MAC_NET */ |
91447636 | 3122 | } |
2d21ac55 A |
3123 | |
3124 | *np = m; | |
3125 | if (num_with_pkthdrs > 0) | |
91447636 A |
3126 | np = &m->m_nextpkt; |
3127 | else | |
3128 | np = &m->m_next; | |
3129 | } | |
2d21ac55 A |
3130 | ASSERT(pnum != *num_needed || mp_list == NULL); |
3131 | if (mp_list != NULL) | |
3132 | mcache_free_ext(cp, mp_list); | |
3133 | ||
3134 | if (pnum > 0) { | |
3135 | mtype_stat_add(MT_DATA, pnum); | |
3136 | mtype_stat_sub(MT_FREE, pnum); | |
3137 | } | |
3138 | ||
3139 | if (wantall && (pnum != *num_needed)) { | |
3140 | if (top != NULL) | |
3141 | m_freem_list(top); | |
3142 | return (NULL); | |
91447636 | 3143 | } |
fa4905b1 | 3144 | |
2d21ac55 A |
3145 | *num_needed = pnum; |
3146 | return (top); | |
3147 | } | |
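/*
 * Illustrative usage sketch (not part of the original file): as noted
 * above, callers of this internal routine must check njcl themselves
 * before asking for 16KB composite elements.  The helper name and counts
 * are hypothetical.
 */
static struct mbuf *
example_get_jumbo_packets(unsigned int *cnt, int wait)
{
	/* Fall back to 4KB composites if the jumbo pool was never created */
	size_t bufsize = (njcl > 0) ?
	    m_maxsize(MC_16KCL) : m_maxsize(MC_BIGCL);

	/* Every element gets a pkthdr and is linked via m_nextpkt */
	return (m_getpackets_internal(cnt, (int)*cnt, wait, 0, bufsize));
}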
fa4905b1 | 3148 | |
91447636 | 3149 | /* |
2d21ac55 A |
3150 | * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if | |
3151 | * wantall is not set, return whatever number was available. The size of | |
3152 | * each mbuf in the list is controlled by the parameter packetlen. Each | |
3153 | * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf | |
3154 | * in the chain is called a segment. If maxsegments is not NULL and the | |
3155 | * value pointed to is non-zero, this specifies the maximum number of | |
3156 | * segments for a chain of mbufs. If maxsegments is NULL or the value | |
3157 | * pointed to is zero, the caller does not have any restriction on the | |
3158 | * number of segments. The actual number of segments of an mbuf chain is | |
3159 | * returned in the value pointed to by maxsegments. | |
91447636 | 3160 | */ |
91447636 | 3161 | __private_extern__ struct mbuf * |
2d21ac55 A |
3162 | m_allocpacket_internal(unsigned int *numlist, size_t packetlen, |
3163 | unsigned int *maxsegments, int wait, int wantall, size_t wantsize) | |
91447636 | 3164 | { |
2d21ac55 A |
3165 | struct mbuf **np, *top, *first = NULL; |
3166 | size_t bufsize, r_bufsize; | |
3167 | unsigned int num = 0; | |
3168 | unsigned int nsegs = 0; | |
3169 | unsigned int needed, resid; | |
3170 | int mcflags = MSLEEPF(wait); | |
3171 | mcache_obj_t *mp_list = NULL, *rmp_list = NULL; | |
3172 | mcache_t *cp = NULL, *rcp = NULL; | |
3173 | ||
3174 | if (*numlist == 0) | |
3175 | return (NULL); | |
fa4905b1 | 3176 | |
91447636 A |
3177 | top = NULL; |
3178 | np = ⊤ | |
2d21ac55 | 3179 | |
91447636 | 3180 | if (wantsize == 0) { |
2d21ac55 | 3181 | if (packetlen <= MINCLSIZE) { |
91447636 | 3182 | bufsize = packetlen; |
2d21ac55 A |
3183 | } else if (packetlen > m_maxsize(MC_CL)) { |
3184 | /* Use 4KB if jumbo cluster pool isn't available */ | |
3185 | if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) | |
3186 | bufsize = m_maxsize(MC_BIGCL); | |
3187 | else | |
3188 | bufsize = m_maxsize(MC_16KCL); | |
3189 | } else { | |
3190 | bufsize = m_maxsize(MC_CL); | |
3191 | } | |
3192 | } else if (wantsize == m_maxsize(MC_CL) || | |
3193 | wantsize == m_maxsize(MC_BIGCL) || | |
3194 | (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) { | |
91447636 | 3195 | bufsize = wantsize; |
2d21ac55 A |
3196 | } else { |
3197 | return (NULL); | |
3198 | } | |
91447636 A |
3199 | |
3200 | if (bufsize <= MHLEN) { | |
2d21ac55 | 3201 | nsegs = 1; |
91447636 A |
3202 | } else if (bufsize <= MINCLSIZE) { |
3203 | if (maxsegments != NULL && *maxsegments == 1) { | |
2d21ac55 A |
3204 | bufsize = m_maxsize(MC_CL); |
3205 | nsegs = 1; | |
91447636 | 3206 | } else { |
2d21ac55 | 3207 | nsegs = 2; |
fa4905b1 | 3208 | } |
2d21ac55 A |
3209 | } else if (bufsize == m_maxsize(MC_16KCL)) { |
3210 | VERIFY(njcl > 0); | |
3211 | nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1; | |
3212 | } else if (bufsize == m_maxsize(MC_BIGCL)) { | |
3213 | nsegs = ((packetlen - 1) >> PGSHIFT) + 1; | |
91447636 | 3214 | } else { |
2d21ac55 | 3215 | nsegs = ((packetlen - 1) >> MCLSHIFT) + 1; |
91447636 A |
3216 | } |
3217 | if (maxsegments != NULL) { | |
2d21ac55 A |
3218 | if (*maxsegments && nsegs > *maxsegments) { |
3219 | *maxsegments = nsegs; | |
3220 | return (NULL); | |
91447636 | 3221 | } |
2d21ac55 | 3222 | *maxsegments = nsegs; |
91447636 | 3223 | } |
91447636 | 3224 | |
2d21ac55 A |
3225 | /* |
3226 | * The caller doesn't want all the requested buffers; only some. | |
3227 | * Try hard to get what we can, but don't block. This effectively | |
3228 | * overrides MCR_SLEEP, since this thread will not go to sleep | |
3229 | * if we can't get all the buffers. | |
3230 | */ | |
3231 | if (!wantall || (mcflags & MCR_NOSLEEP)) | |
3232 | mcflags |= MCR_TRYHARD; | |
3233 | ||
3234 | /* | |
3235 | * Simple case where all elements in the lists/chains are mbufs. | |
3236 | * Unless bufsize is greater than MHLEN, each segment chain is made | |
3237 | * up of exactly 1 mbuf. Otherwise, each segment chain is made up | |
3238 | * of 2 mbufs; the second one is used for the residual data, i.e. | |
3239 | * the remaining data that cannot fit into the first mbuf. | |
3240 | */ | |
3241 | if (bufsize <= MINCLSIZE) { | |
3242 | /* Allocate the elements in one shot from the mbuf cache */ | |
3243 | ASSERT(bufsize <= MHLEN || nsegs == 2); | |
3244 | cp = m_cache(MC_MBUF); | |
3245 | needed = mcache_alloc_ext(cp, &mp_list, | |
3246 | (*numlist) * nsegs, mcflags); | |
3247 | ||
3248 | /* | |
3249 | * The number of elements must be even if we are to use an | |
3250 | * mbuf (instead of a cluster) to store the residual data. | |
3251 | * If we couldn't allocate the requested number of mbufs, | |
3252 | * trim the number down (if it's odd) in order to avoid | |
3253 | * creating a partial segment chain. | |
3254 | */ | |
3255 | if (bufsize > MHLEN && (needed & 0x1)) | |
3256 | needed--; | |
91447636 | 3257 | |
2d21ac55 A |
3258 | while (num < needed) { |
3259 | struct mbuf *m; | |
91447636 | 3260 | |
2d21ac55 A |
3261 | m = (struct mbuf *)mp_list; |
3262 | mp_list = mp_list->obj_next; | |
3263 | ASSERT(m != NULL); | |
91447636 | 3264 | |
2d21ac55 A |
3265 | MBUF_INIT(m, 1, MT_DATA); |
3266 | #if CONFIG_MACF_NET | |
3267 | if (mac_init_mbuf(m, wait) != 0) { | |
3268 | m_free(m); | |
3269 | break; | |
91447636 | 3270 | } |
2d21ac55 A |
3271 | #endif /* MAC_NET */ |
3272 | num++; | |
3273 | if (bufsize > MHLEN) { | |
3274 | /* A second mbuf for this segment chain */ | |
3275 | m->m_next = (struct mbuf *)mp_list; | |
3276 | mp_list = mp_list->obj_next; | |
3277 | ASSERT(m->m_next != NULL); | |
3278 | ||
3279 | MBUF_INIT(m->m_next, 0, MT_DATA); | |
3280 | num++; | |
91447636 | 3281 | } |
2d21ac55 A |
3282 | *np = m; |
3283 | np = &m->m_nextpkt; | |
3284 | } | |
3285 | ASSERT(num != *numlist || mp_list == NULL); | |
3286 | ||
3287 | if (num > 0) { | |
3288 | mtype_stat_add(MT_DATA, num); | |
3289 | mtype_stat_sub(MT_FREE, num); | |
3290 | } | |
3291 | num /= nsegs; | |
3292 | ||
3293 | /* We've got them all; return to caller */ | |
3294 | if (num == *numlist) | |
3295 | return (top); | |
3296 | ||
3297 | goto fail; | |
3298 | } | |
3299 | ||
3300 | /* | |
3301 | * Complex cases where elements are made up of one or more composite | |
3302 | * mbufs + cluster, depending on packetlen. Each N-segment chain can | |
3303 | * be illustrated as follows: | |
3304 | * | |
3305 | * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N] | |
3306 | * | |
3307 | * Every composite mbuf + cluster element comes from the intermediate | |
3308 | * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency, | |
3309 | * the last composite element will come from the MC_MBUF_CL cache, | |
3310 | * unless the residual data is larger than 2KB where we use the | |
3311 | * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual | |
3312 | * data is defined as extra data beyond the first element that cannot | |
3313 | * fit into the previous element, i.e. there is no residual data if | |
3314 | * the chain only has 1 segment. | |
3315 | */ | |
3316 | r_bufsize = bufsize; | |
3317 | resid = packetlen > bufsize ? packetlen % bufsize : 0; | |
3318 | if (resid > 0) { | |
3319 | /* There is residual data; figure out the cluster size */ | |
3320 | if (wantsize == 0 && packetlen > MINCLSIZE) { | |
3321 | /* | |
3322 | * Caller didn't request that all of the segments | |
3323 | * in the chain use the same cluster size; use the | |
3324 | * smaller of the cluster sizes. | |
3325 | */ | |
3326 | if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) | |
3327 | r_bufsize = m_maxsize(MC_16KCL); | |
3328 | else if (resid > m_maxsize(MC_CL)) | |
3329 | r_bufsize = m_maxsize(MC_BIGCL); | |
3330 | else | |
3331 | r_bufsize = m_maxsize(MC_CL); | |
3332 | } else { | |
3333 | /* Use the same cluster size as the other segments */ | |
3334 | resid = 0; | |
3335 | } | |
3336 | } | |
3337 | ||
3338 | needed = *numlist; | |
3339 | if (resid > 0) { | |
3340 | /* | |
3341 | * Attempt to allocate composite mbuf + cluster elements for | |
3342 | * the residual data in each chain; record the number of such | |
3343 | * elements that can be allocated so that we know how many | |
3344 | * segment chains we can afford to create. | |
3345 | */ | |
3346 | if (r_bufsize <= m_maxsize(MC_CL)) | |
3347 | rcp = m_cache(MC_MBUF_CL); | |
3348 | else if (r_bufsize <= m_maxsize(MC_BIGCL)) | |
3349 | rcp = m_cache(MC_MBUF_BIGCL); | |
3350 | else | |
3351 | rcp = m_cache(MC_MBUF_16KCL); | |
3352 | needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags); | |
3353 | ||
3354 | if (needed == 0) | |
3355 | goto fail; | |
3356 | ||
3357 | /* This is temporarily reduced for calculation */ | |
3358 | ASSERT(nsegs > 1); | |
3359 | nsegs--; | |
3360 | } | |
3361 | ||
3362 | /* | |
3363 | * Attempt to allocate the rest of the composite mbuf + cluster | |
3364 | * elements for the number of segment chains that we need. | |
3365 | */ | |
3366 | if (bufsize <= m_maxsize(MC_CL)) | |
3367 | cp = m_cache(MC_MBUF_CL); | |
3368 | else if (bufsize <= m_maxsize(MC_BIGCL)) | |
3369 | cp = m_cache(MC_MBUF_BIGCL); | |
3370 | else | |
3371 | cp = m_cache(MC_MBUF_16KCL); | |
3372 | needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags); | |
3373 | ||
3374 | /* Round it down to avoid creating a partial segment chain */ | |
3375 | needed = (needed / nsegs) * nsegs; | |
3376 | if (needed == 0) | |
3377 | goto fail; | |
3378 | ||
3379 | if (resid > 0) { | |
3380 | /* | |
3381 | * We're about to construct the chain(s); take into account | |
3382 | * the number of segments we have created above to hold the | |
3383 | * residual data for each chain, as well as restore the | |
3384 | * original count of segments per chain. | |
3385 | */ | |
3386 | ASSERT(nsegs > 0); | |
3387 | needed += needed / nsegs; | |
3388 | nsegs++; | |
3389 | } | |
3390 | ||
3391 | for (;;) { | |
3392 | struct mbuf *m; | |
3393 | u_int32_t flag; | |
3394 | struct ext_ref *rfa; | |
3395 | void *cl; | |
3396 | int pkthdr; | |
3397 | ||
3398 | ++num; | |
3399 | if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) { | |
3400 | m = (struct mbuf *)mp_list; | |
3401 | mp_list = mp_list->obj_next; | |
3402 | } else { | |
3403 | m = (struct mbuf *)rmp_list; | |
3404 | rmp_list = rmp_list->obj_next; | |
3405 | } | |
3406 | ASSERT(m != NULL); | |
3407 | VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); | |
3408 | VERIFY(m->m_ext.ext_free == NULL || | |
3409 | m->m_ext.ext_free == m_bigfree || | |
3410 | m->m_ext.ext_free == m_16kfree); | |
3411 | ||
3412 | cl = m->m_ext.ext_buf; | |
3413 | rfa = MEXT_RFA(m); | |
3414 | ||
3415 | ASSERT(cl != NULL && rfa != NULL); | |
3416 | VERIFY(MBUF_IS_COMPOSITE(m)); | |
3417 | ||
3418 | flag = MEXT_FLAGS(m); | |
3419 | ||
3420 | pkthdr = (nsegs == 1 || (num % nsegs) == 1); | |
3421 | if (pkthdr) | |
3422 | first = m; | |
3423 | MBUF_INIT(m, pkthdr, MT_DATA); | |
3424 | if (m->m_ext.ext_free == m_16kfree) { | |
3425 | MBUF_16KCL_INIT(m, cl, rfa, 1, flag); | |
3426 | } else if (m->m_ext.ext_free == m_bigfree) { | |
3427 | MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); | |
3428 | } else { | |
3429 | MBUF_CL_INIT(m, cl, rfa, 1, flag); | |
3430 | } | |
3431 | #if CONFIG_MACF_NET | |
3432 | if (pkthdr && mac_init_mbuf(m, wait) != 0) { | |
3433 | --num; | |
3434 | m_free(m); | |
3435 | break; | |
91447636 | 3436 | } |
2d21ac55 A |
3437 | #endif /* MAC_NET */ |
3438 | ||
3439 | *np = m; | |
3440 | if ((num % nsegs) == 0) | |
3441 | np = &first->m_nextpkt; | |
3442 | else | |
3443 | np = &m->m_next; | |
3444 | ||
3445 | if (num == needed) | |
3446 | break; | |
3447 | } | |
3448 | ||
3449 | if (num > 0) { | |
3450 | mtype_stat_add(MT_DATA, num); | |
3451 | mtype_stat_sub(MT_FREE, num); | |
91447636 | 3452 | } |
2d21ac55 A |
3453 | |
3454 | num /= nsegs; | |
3455 | ||
3456 | /* We've got them all; return to caller */ | |
3457 | if (num == *numlist) { | |
3458 | ASSERT(mp_list == NULL && rmp_list == NULL); | |
3459 | return (top); | |
3460 | } | |
3461 | ||
91447636 | 3462 | fail: |
2d21ac55 A |
3463 | /* Free up what's left of the above */ |
3464 | if (mp_list != NULL) | |
3465 | mcache_free_ext(cp, mp_list); | |
3466 | if (rmp_list != NULL) | |
3467 | mcache_free_ext(rcp, rmp_list); | |
3468 | if (wantall && top != NULL) { | |
91447636 | 3469 | m_freem(top); |
2d21ac55 | 3470 | return (NULL); |
91447636 | 3471 | } |
2d21ac55 A |
3472 | *numlist = num; |
3473 | return (top); | |
91447636 | 3474 | } |
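/*
 * Illustrative usage sketch (not part of the original file): requesting a
 * few 6KB packets and honoring the numlist/maxsegments contract described
 * above.  The helper name and the 6KB/4-packet figures are arbitrary
 * example values.
 */
static struct mbuf *
example_alloc_6k_packets(int wait)
{
	unsigned int numlist = 4;	/* want up to 4 packet chains */
	unsigned int maxsegs = 0;	/* 0: no restriction; set on return */
	struct mbuf *list;

	list = m_allocpacket_internal(&numlist, 6 * 1024, &maxsegs,
	    wait, 0, 0);
	/*
	 * On return, numlist holds how many chains were actually built and
	 * maxsegs how many m_next segments each chain contains.
	 */
	return (list);
}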
fa4905b1 | 3475 | |
2d21ac55 A |
3476 | /* |
3477 | * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate | |
3478 | * packets on the receive ring. | |
91447636 A |
3479 | */ |
3480 | __private_extern__ struct mbuf * | |
2d21ac55 | 3481 | m_getpacket_how(int wait) |
91447636 A |
3482 | { |
3483 | unsigned int num_needed = 1; | |
2d21ac55 A |
3484 | |
3485 | return (m_getpackets_internal(&num_needed, 1, wait, 1, | |
3486 | m_maxsize(MC_CL))); | |
91447636 | 3487 | } |
fa4905b1 | 3488 | |
2d21ac55 A |
3489 | /* |
3490 | * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate | |
3491 | * packets on the receive ring. | |
91447636 A |
3492 | */ |
3493 | struct mbuf * | |
3494 | m_getpacket(void) | |
3495 | { | |
3496 | unsigned int num_needed = 1; | |
9bccf70c | 3497 | |
2d21ac55 A |
3498 | return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1, |
3499 | m_maxsize(MC_CL))); | |
91447636 | 3500 | } |
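/*
 * Illustrative usage sketch (not part of the original file): a driver-style
 * receive-ring refill loop built on m_getpacket_how().  The ring layout and
 * the example_fill_rx_ring() helper are hypothetical.
 */
static void
example_fill_rx_ring(struct mbuf **ring, int nslots)
{
	int i;

	for (i = 0; i < nslots; i++) {
		if (ring[i] != NULL)
			continue;
		/* Non-blocking; an empty slot is simply retried later */
		ring[i] = m_getpacket_how(M_DONTWAIT);
	}
}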
fa4905b1 | 3501 | |
91447636 | 3502 | /* |
2d21ac55 A |
3503 | * Return a list of mbuf hdrs that point to clusters. Try for num_needed; |
3504 | * if this can't be met, return whatever number was available. Set up the | |
3505 | * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These | |
3506 | * are chained on the m_nextpkt field. Any packets requested beyond this are | |
3507 | * chained onto the last packet header's m_next field. | |
91447636 A |
3508 | */ |
3509 | struct mbuf * | |
3510 | m_getpackets(int num_needed, int num_with_pkthdrs, int how) | |
3511 | { | |
3512 | unsigned int n = num_needed; | |
fa4905b1 | 3513 | |
2d21ac55 A |
3514 | return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0, |
3515 | m_maxsize(MC_CL))); | |
3516 | } | |
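/*
 * Illustrative usage sketch (not part of the original file): walking the
 * layout described above -- the first num_with_pkthdrs buffers are linked
 * through m_nextpkt and any extras hang off the last header's m_next.  The
 * helper is hypothetical.
 */
static unsigned int
example_count_getpackets(int num_needed, int num_with_pkthdrs, int how)
{
	struct mbuf *top, *pkt, *seg;
	unsigned int count = 0;

	top = m_getpackets(num_needed, num_with_pkthdrs, how);
	for (pkt = top; pkt != NULL; pkt = pkt->m_nextpkt) {
		for (seg = pkt; seg != NULL; seg = seg->m_next)
			count++;	/* one per cluster-backed mbuf */
	}
	if (top != NULL)
		m_freem_list(top);	/* give everything back */
	return (count);
}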
fa4905b1 | 3517 | |
9bccf70c | 3518 | /* |
2d21ac55 A |
3519 | * Return a list of mbuf hdrs set up as packet hdrs chained together |
3520 | * on the m_nextpkt field | |
9bccf70c | 3521 | */ |
fa4905b1 A |
3522 | struct mbuf * |
3523 | m_getpackethdrs(int num_needed, int how) | |
3524 | { | |
3525 | struct mbuf *m; | |
3526 | struct mbuf **np, *top; | |
3527 | ||
3528 | top = NULL; | |
3529 | np = ⊤ | |
3530 | ||
fa4905b1 | 3531 | while (num_needed--) { |
2d21ac55 A |
3532 | m = _M_RETRYHDR(how, MT_DATA); |
3533 | if (m == NULL) | |
3534 | break; | |
3535 | ||
3536 | *np = m; | |
3537 | np = &m->m_nextpkt; | |
3538 | } | |
fa4905b1 A |
3539 | |
3540 | return (top); | |
3541 | } | |
3542 | ||
2d21ac55 A |
3543 | /* |
3544 | * Free an mbuf list (m_nextpkt) while following m_next. Returns the count | |
3545 | * of mbuf packets freed. Used by the drivers. | |
1c79356b | 3546 | */ |
2d21ac55 A |
3547 | int |
3548 | m_freem_list(struct mbuf *m) | |
1c79356b A |
3549 | { |
3550 | struct mbuf *nextpkt; | |
2d21ac55 A |
3551 | mcache_obj_t *mp_list = NULL; |
3552 | mcache_obj_t *mcl_list = NULL; | |
3553 | mcache_obj_t *mbc_list = NULL; | |
3554 | mcache_obj_t *m16k_list = NULL; | |
3555 | mcache_obj_t *m_mcl_list = NULL; | |
3556 | mcache_obj_t *m_mbc_list = NULL; | |
3557 | mcache_obj_t *m_m16k_list = NULL; | |
3558 | mcache_obj_t *ref_list = NULL; | |
3559 | int pktcount = 0; | |
3560 | int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0; | |
3561 | ||
3562 | while (m != NULL) { | |
3563 | pktcount++; | |
3564 | ||
3565 | nextpkt = m->m_nextpkt; | |
3566 | m->m_nextpkt = NULL; | |
3567 | ||
3568 | while (m != NULL) { | |
3569 | struct mbuf *next = m->m_next; | |
3570 | mcache_obj_t *o, *rfa; | |
3571 | u_int32_t refcnt, flags; | |
fa4905b1 | 3572 | |
2d21ac55 A |
3573 | if (m->m_type == MT_FREE) |
3574 | panic("m_free: freeing an already freed mbuf"); | |
9bccf70c | 3575 | |
2d21ac55 A |
3576 | if (m->m_type != MT_FREE) |
3577 | mt_free++; | |
91447636 | 3578 | |
2d21ac55 | 3579 | if (m->m_flags & M_PKTHDR) { |
91447636 | 3580 | m_tag_delete_chain(m, NULL); |
91447636 | 3581 | } |
9bccf70c | 3582 | |
2d21ac55 A |
3583 | if (!(m->m_flags & M_EXT)) |
3584 | goto simple_free; | |
3585 | ||
3586 | o = (mcache_obj_t *)m->m_ext.ext_buf; | |
3587 | refcnt = m_decref(m); | |
3588 | flags = MEXT_FLAGS(m); | |
3589 | if (refcnt == 0 && flags == 0) { | |
3590 | if (m->m_ext.ext_free == NULL) { | |
3591 | o->obj_next = mcl_list; | |
3592 | mcl_list = o; | |
3593 | } else if (m->m_ext.ext_free == m_bigfree) { | |
3594 | o->obj_next = mbc_list; | |
3595 | mbc_list = o; | |
3596 | } else if (m->m_ext.ext_free == m_16kfree) { | |
3597 | o->obj_next = m16k_list; | |
3598 | m16k_list = o; | |
3599 | } else { | |
3600 | (*(m->m_ext.ext_free))((caddr_t)o, | |
3601 | m->m_ext.ext_size, | |
3602 | m->m_ext.ext_arg); | |
3603 | } | |
3604 | rfa = (mcache_obj_t *)MEXT_RFA(m); | |
3605 | rfa->obj_next = ref_list; | |
3606 | ref_list = rfa; | |
3607 | MEXT_RFA(m) = NULL; | |
3608 | } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { | |
3609 | VERIFY(m->m_type != MT_FREE); | |
3610 | /* | |
3611 | * Amortize the costs of atomic operations | |
3612 | * by doing them at the end, if possible. | |
3613 | */ | |
3614 | if (m->m_type == MT_DATA) | |
3615 | mt_data++; | |
3616 | else if (m->m_type == MT_HEADER) | |
3617 | mt_header++; | |
3618 | else if (m->m_type == MT_SONAME) | |
3619 | mt_soname++; | |
3620 | else if (m->m_type == MT_TAG) | |
3621 | mt_tag++; | |
3622 | else | |
3623 | mtype_stat_dec(m->m_type); | |
fa4905b1 | 3624 | |
2d21ac55 A |
3625 | m->m_type = MT_FREE; |
3626 | m->m_flags = M_EXT; | |
3627 | m->m_len = 0; | |
3628 | m->m_next = m->m_nextpkt = NULL; | |
3629 | ||
3630 | /* "Free" into the intermediate cache */ | |
3631 | o = (mcache_obj_t *)m; | |
3632 | if (m->m_ext.ext_free == NULL) { | |
3633 | o->obj_next = m_mcl_list; | |
3634 | m_mcl_list = o; | |
3635 | } else if (m->m_ext.ext_free == m_bigfree) { | |
3636 | o->obj_next = m_mbc_list; | |
3637 | m_mbc_list = o; | |
1c79356b | 3638 | } else { |
2d21ac55 A |
3639 | VERIFY(m->m_ext.ext_free == m_16kfree); |
3640 | o->obj_next = m_m16k_list; | |
3641 | m_m16k_list = o; | |
1c79356b | 3642 | } |
2d21ac55 A |
3643 | m = next; |
3644 | continue; | |
1c79356b | 3645 | } |
2d21ac55 A |
3646 | simple_free: |
3647 | /* | |
3648 | * Amortize the costs of atomic operations | |
3649 | * by doing them at the end, if possible. | |
3650 | */ | |
3651 | if (m->m_type == MT_DATA) | |
3652 | mt_data++; | |
3653 | else if (m->m_type == MT_HEADER) | |
3654 | mt_header++; | |
3655 | else if (m->m_type == MT_SONAME) | |
3656 | mt_soname++; | |
3657 | else if (m->m_type == MT_TAG) | |
3658 | mt_tag++; | |
3659 | else if (m->m_type != MT_FREE) | |
3660 | mtype_stat_dec(m->m_type); | |
3661 | ||
1c79356b | 3662 | m->m_type = MT_FREE; |
2d21ac55 A |
3663 | m->m_flags = m->m_len = 0; |
3664 | m->m_next = m->m_nextpkt = NULL; | |
fa4905b1 | 3665 | |
2d21ac55 A |
3666 | ((mcache_obj_t *)m)->obj_next = mp_list; |
3667 | mp_list = (mcache_obj_t *)m; | |
3668 | ||
3669 | m = next; | |
3670 | } | |
fa4905b1 | 3671 | |
2d21ac55 A |
3672 | m = nextpkt; |
3673 | } | |
fa4905b1 | 3674 | |
2d21ac55 A |
3675 | if (mt_free > 0) |
3676 | mtype_stat_add(MT_FREE, mt_free); | |
3677 | if (mt_data > 0) | |
3678 | mtype_stat_sub(MT_DATA, mt_data); | |
3679 | if (mt_header > 0) | |
3680 | mtype_stat_sub(MT_HEADER, mt_header); | |
3681 | if (mt_soname > 0) | |
3682 | mtype_stat_sub(MT_SONAME, mt_soname); | |
3683 | if (mt_tag > 0) | |
3684 | mtype_stat_sub(MT_TAG, mt_tag); | |
3685 | ||
3686 | if (mp_list != NULL) | |
3687 | mcache_free_ext(m_cache(MC_MBUF), mp_list); | |
3688 | if (mcl_list != NULL) | |
3689 | mcache_free_ext(m_cache(MC_CL), mcl_list); | |
3690 | if (mbc_list != NULL) | |
3691 | mcache_free_ext(m_cache(MC_BIGCL), mbc_list); | |
3692 | if (m16k_list != NULL) | |
3693 | mcache_free_ext(m_cache(MC_16KCL), m16k_list); | |
3694 | if (m_mcl_list != NULL) | |
3695 | mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list); | |
3696 | if (m_mbc_list != NULL) | |
3697 | mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list); | |
3698 | if (m_m16k_list != NULL) | |
3699 | mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list); | |
3700 | if (ref_list != NULL) | |
3701 | mcache_free_ext(ref_cache, ref_list); | |
3702 | ||
3703 | return (pktcount); | |
1c79356b A |
3704 | } |
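/*
 * Illustrative usage sketch (not part of the original file): batching
 * completed transmit packets and releasing them with a single
 * m_freem_list() call, which is what lets the routine above amortize the
 * type statistics and cache operations.  The done[] array and helper are
 * hypothetical.
 */
static int
example_complete_tx(struct mbuf **done, int ndone)
{
	struct mbuf *head = NULL;
	int i;

	/* Link the finished packets through m_nextpkt... */
	for (i = ndone - 1; i >= 0; i--) {
		done[i]->m_nextpkt = head;
		head = done[i];
	}
	/* ...and free them in one pass; the packet count is returned */
	return (m_freem_list(head));
}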
3705 | ||
3706 | void | |
2d21ac55 | 3707 | m_freem(struct mbuf *m) |
1c79356b | 3708 | { |
2d21ac55 | 3709 | while (m != NULL) |
1c79356b A |
3710 | m = m_free(m); |
3711 | } | |
3712 | ||
3713 | /* | |
3714 | * Mbuffer utility routines. | |
3715 | */ | |
2d21ac55 | 3716 | |
1c79356b | 3717 | /* |
2d21ac55 A |
3718 | * Compute the amount of space available before the current start |
3719 | * of data in an mbuf. | |
1c79356b | 3720 | */ |
91447636 | 3721 | int |
2d21ac55 | 3722 | m_leadingspace(struct mbuf *m) |
1c79356b A |
3723 | { |
3724 | if (m->m_flags & M_EXT) { | |
3725 | if (MCLHASREFERENCE(m)) | |
2d21ac55 | 3726 | return (0); |
1c79356b A |
3727 | return (m->m_data - m->m_ext.ext_buf); |
3728 | } | |
3729 | if (m->m_flags & M_PKTHDR) | |
3730 | return (m->m_data - m->m_pktdat); | |
3731 | return (m->m_data - m->m_dat); | |
3732 | } | |
3733 | ||
3734 | /* | |
2d21ac55 | 3735 | * Compute the amount of space available after the end of data in an mbuf. |
1c79356b | 3736 | */ |
91447636 | 3737 | int |
2d21ac55 | 3738 | m_trailingspace(struct mbuf *m) |
1c79356b A |
3739 | { |
3740 | if (m->m_flags & M_EXT) { | |
3741 | if (MCLHASREFERENCE(m)) | |
2d21ac55 | 3742 | return (0); |
1c79356b | 3743 | return (m->m_ext.ext_buf + m->m_ext.ext_size - |
2d21ac55 | 3744 | (m->m_data + m->m_len)); |
1c79356b A |
3745 | } |
3746 | return (&m->m_dat[MLEN] - (m->m_data + m->m_len)); | |
3747 | } | |
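/*
 * Illustrative usage sketch (not part of the original file): appending
 * bytes in place only when m_trailingspace() says the mbuf has room after
 * its current data (it reports 0 for shared clusters, which conveniently
 * prevents scribbling on shared storage).  The helper is hypothetical.
 */
static int
example_append_bytes(struct mbuf *m, caddr_t src, int len)
{
	if (m_trailingspace(m) < len)
		return (0);		/* caller must grow the chain instead */

	bcopy(src, MTOD(m, caddr_t) + m->m_len, (unsigned)len);
	m->m_len += len;
	if (m->m_flags & M_PKTHDR)
		m->m_pkthdr.len += len;
	return (1);
}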
3748 | ||
3749 | /* | |
2d21ac55 A |
3750 | * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain, |
3751 | * copy junk along. Does not adjust packet header length. | |
1c79356b A |
3752 | */ |
3753 | struct mbuf * | |
2d21ac55 | 3754 | m_prepend(struct mbuf *m, int len, int how) |
1c79356b A |
3755 | { |
3756 | struct mbuf *mn; | |
3757 | ||
2d21ac55 A |
3758 | _MGET(mn, how, m->m_type); |
3759 | if (mn == NULL) { | |
1c79356b | 3760 | m_freem(m); |
2d21ac55 | 3761 | return (NULL); |
1c79356b A |
3762 | } |
3763 | if (m->m_flags & M_PKTHDR) { | |
3764 | M_COPY_PKTHDR(mn, m); | |
3765 | m->m_flags &= ~M_PKTHDR; | |
3766 | } | |
3767 | mn->m_next = m; | |
3768 | m = mn; | |
3769 | if (len < MHLEN) | |
3770 | MH_ALIGN(m, len); | |
3771 | m->m_len = len; | |
3772 | return (m); | |
3773 | } | |
3774 | ||
9bccf70c | 3775 | /* |
2d21ac55 A |
3776 | * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to |
3777 | * chain, copy junk along, and adjust length. | |
9bccf70c A |
3778 | */ |
3779 | struct mbuf * | |
2d21ac55 A |
3780 | m_prepend_2(struct mbuf *m, int len, int how) |
3781 | { | |
3782 | if (M_LEADINGSPACE(m) >= len) { | |
3783 | m->m_data -= len; | |
3784 | m->m_len += len; | |
3785 | } else { | |
9bccf70c | 3786 | m = m_prepend(m, len, how); |
2d21ac55 A |
3787 | } |
3788 | if ((m) && (m->m_flags & M_PKTHDR)) | |
3789 | m->m_pkthdr.len += len; | |
3790 | return (m); | |
9bccf70c A |
3791 | } |
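/*
 * Illustrative usage sketch (not part of the original file): making room
 * for a protocol header.  m_prepend_2() uses the leading space when it is
 * sufficient, falls back to m_prepend() otherwise, and adjusts
 * m_pkthdr.len; hdrlen is an arbitrary example parameter.
 */
static struct mbuf *
example_prepend_header(struct mbuf *m, int hdrlen, int how)
{
	m = m_prepend_2(m, hdrlen, how);
	if (m == NULL)
		return (NULL);		/* original chain was freed */

	bzero(MTOD(m, caddr_t), (unsigned)hdrlen);	/* build header here */
	return (m);
}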
3792 | ||
1c79356b A |
3793 | /* |
3794 | * Make a copy of an mbuf chain starting "off0" bytes from the beginning, | |
3795 | * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. | |
3796 | * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. | |
3797 | */ | |
3798 | int MCFail; | |
3799 | ||
3800 | struct mbuf * | |
2d21ac55 | 3801 | m_copym(struct mbuf *m, int off0, int len, int wait) |
1c79356b | 3802 | { |
2d21ac55 | 3803 | struct mbuf *n, *mhdr = NULL, **np; |
91447636 | 3804 | int off = off0; |
1c79356b A |
3805 | struct mbuf *top; |
3806 | int copyhdr = 0; | |
3807 | ||
3808 | if (off < 0 || len < 0) | |
2d21ac55 A |
3809 | panic("m_copym: invalid offset %d or len %d", off, len); |
3810 | ||
3811 | if (off == 0 && (m->m_flags & M_PKTHDR)) { | |
3812 | mhdr = m; | |
1c79356b | 3813 | copyhdr = 1; |
2d21ac55 | 3814 | } |
fa4905b1 A |
3815 | |
3816 | while (off >= m->m_len) { | |
2d21ac55 A |
3817 | if (m->m_next == NULL) |
3818 | panic("m_copym: invalid mbuf chain"); | |
1c79356b A |
3819 | off -= m->m_len; |
3820 | m = m->m_next; | |
3821 | } | |
3822 | np = ⊤ | |
2d21ac55 | 3823 | top = NULL; |
fa4905b1 | 3824 | |
1c79356b | 3825 | while (len > 0) { |
2d21ac55 | 3826 | if (m == NULL) { |
1c79356b | 3827 | if (len != M_COPYALL) |
2d21ac55 | 3828 | panic("m_copym: len != M_COPYALL"); |
1c79356b A |
3829 | break; |
3830 | } | |
2d21ac55 A |
3831 | |
3832 | n = _M_RETRY(wait, m->m_type); | |
1c79356b | 3833 | *np = n; |
fa4905b1 | 3834 | |
2d21ac55 | 3835 | if (n == NULL) |
1c79356b | 3836 | goto nospace; |
2d21ac55 A |
3837 | |
3838 | if (copyhdr != 0) { | |
3839 | M_COPY_PKTHDR(n, mhdr); | |
1c79356b A |
3840 | if (len == M_COPYALL) |
3841 | n->m_pkthdr.len -= off0; | |
3842 | else | |
3843 | n->m_pkthdr.len = len; | |
3844 | copyhdr = 0; | |
3845 | } | |
3846 | if (len == M_COPYALL) { | |
2d21ac55 A |
3847 | if (MIN(len, (m->m_len - off)) == len) { |
3848 | printf("m->m_len %ld - off %d = %ld, %ld\n", | |
3849 | m->m_len, off, m->m_len - off, | |
3850 | MIN(len, (m->m_len - off))); | |
3851 | } | |
1c79356b | 3852 | } |
2d21ac55 | 3853 | n->m_len = MIN(len, (m->m_len - off)); |
1c79356b | 3854 | if (n->m_len == M_COPYALL) { |
2d21ac55 A |
3855 | printf("n->m_len == M_COPYALL, fixing\n"); |
3856 | n->m_len = MHLEN; | |
1c79356b A |
3857 | } |
3858 | if (m->m_flags & M_EXT) { | |
1c79356b | 3859 | n->m_ext = m->m_ext; |
2d21ac55 | 3860 | m_incref(m); |
1c79356b A |
3861 | n->m_data = m->m_data + off; |
3862 | n->m_flags |= M_EXT; | |
fa4905b1 | 3863 | } else { |
2d21ac55 | 3864 | bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), |
1c79356b | 3865 | (unsigned)n->m_len); |
fa4905b1 | 3866 | } |
1c79356b A |
3867 | if (len != M_COPYALL) |
3868 | len -= n->m_len; | |
3869 | off = 0; | |
3870 | m = m->m_next; | |
3871 | np = &n->m_next; | |
3872 | } | |
fa4905b1 | 3873 | |
2d21ac55 | 3874 | if (top == NULL) |
1c79356b | 3875 | MCFail++; |
fa4905b1 | 3876 | |
1c79356b A |
3877 | return (top); |
3878 | nospace: | |
fa4905b1 | 3879 | |
1c79356b A |
3880 | m_freem(top); |
3881 | MCFail++; | |
2d21ac55 | 3882 | return (NULL); |
1c79356b A |
3883 | } |
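/*
 * Illustrative usage sketch (not part of the original file): taking a
 * reference-style copy of a whole packet before handing the original to a
 * consumer that will free it.  Cluster data is shared via m_incref() above
 * rather than duplicated; m_dup() below is the deep-copy alternative.
 */
static struct mbuf *
example_snapshot_packet(struct mbuf *m)
{
	/* Copy from offset 0 to the end of the chain without sleeping */
	return (m_copym(m, 0, M_COPYALL, M_DONTWAIT));
}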
3884 | ||
9bccf70c | 3885 | /* |
2d21ac55 A |
3886 | * Equivalent to m_copym except that all necessary mbuf hdrs are allocated |
3887 | * within this routine also, the last mbuf and offset accessed are passed | |
3888 | * out and can be passed back in to avoid having to rescan the entire mbuf | |
3889 | * list (normally hung off of the socket) | |
9bccf70c | 3890 | */ |
fa4905b1 | 3891 | struct mbuf * |
2d21ac55 A |
3892 | m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, |
3893 | struct mbuf **m_last, int *m_off) | |
3894 | { | |
3895 | struct mbuf *n, **np = NULL; | |
3896 | int off = off0, len = len0; | |
3897 | struct mbuf *top = NULL; | |
3898 | int mcflags = MSLEEPF(wait); | |
fa4905b1 | 3899 | int copyhdr = 0; |
2d21ac55 A |
3900 | int type = 0; |
3901 | mcache_obj_t *list = NULL; | |
3902 | int needed = 0; | |
fa4905b1 | 3903 | |
2d21ac55 | 3904 | if (off == 0 && (m->m_flags & M_PKTHDR)) |
fa4905b1 A |
3905 | copyhdr = 1; |
3906 | ||
2d21ac55 A |
3907 | if (*m_last != NULL) { |
3908 | m = *m_last; | |
fa4905b1 A |
3909 | off = *m_off; |
3910 | } else { | |
2d21ac55 A |
3911 | while (off >= m->m_len) { |
3912 | off -= m->m_len; | |
fa4905b1 A |
3913 | m = m->m_next; |
3914 | } | |
3915 | } | |
91447636 | 3916 | |
2d21ac55 A |
3917 | n = m; |
3918 | while (len > 0) { | |
3919 | needed++; | |
3920 | ASSERT(n != NULL); | |
3921 | len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0))); | |
3922 | n = n->m_next; | |
3923 | } | |
3924 | needed++; | |
3925 | len = len0; | |
3926 | ||
3927 | /* | |
3928 | * If the caller doesn't want to be put to sleep, mark it with | |
3929 | * MCR_TRYHARD so that we may reclaim buffers from other places | |
3930 | * before giving up. | |
3931 | */ | |
3932 | if (mcflags & MCR_NOSLEEP) | |
3933 | mcflags |= MCR_TRYHARD; | |
3934 | ||
3935 | if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed, | |
3936 | mcflags) != needed) | |
3937 | goto nospace; | |
fa4905b1 | 3938 | |
2d21ac55 | 3939 | needed = 0; |
fa4905b1 | 3940 | while (len > 0) { |
2d21ac55 A |
3941 | n = (struct mbuf *)list; |
3942 | list = list->obj_next; | |
3943 | ASSERT(n != NULL && m != NULL); | |
3944 | ||
3945 | type = (top == NULL) ? MT_HEADER : m->m_type; | |
3946 | MBUF_INIT(n, (top == NULL), type); | |
3947 | #if CONFIG_MACF_NET | |
3948 | if (top == NULL && mac_mbuf_label_init(n, wait) != 0) { | |
3949 | mtype_stat_inc(MT_HEADER); | |
3950 | mtype_stat_dec(MT_FREE); | |
3951 | m_free(n); | |
fa4905b1 | 3952 | goto nospace; |
2d21ac55 A |
3953 | } |
3954 | #endif /* MAC_NET */ | |
3955 | ||
3956 | if (top == NULL) { | |
3957 | top = n; | |
fa4905b1 A |
3958 | np = &top->m_next; |
3959 | continue; | |
2d21ac55 A |
3960 | } else { |
3961 | needed++; | |
3962 | *np = n; | |
3963 | } | |
fa4905b1 A |
3964 | |
3965 | if (copyhdr) { | |
3966 | M_COPY_PKTHDR(n, m); | |
3967 | n->m_pkthdr.len = len; | |
3968 | copyhdr = 0; | |
3969 | } | |
2d21ac55 | 3970 | n->m_len = MIN(len, (m->m_len - off)); |
fa4905b1 A |
3971 | |
3972 | if (m->m_flags & M_EXT) { | |
3973 | n->m_ext = m->m_ext; | |
2d21ac55 | 3974 | m_incref(m); |
fa4905b1 A |
3975 | n->m_data = m->m_data + off; |
3976 | n->m_flags |= M_EXT; | |
3977 | } else { | |
2d21ac55 | 3978 | bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), |
fa4905b1 A |
3979 | (unsigned)n->m_len); |
3980 | } | |
3981 | len -= n->m_len; | |
2d21ac55 | 3982 | |
fa4905b1 | 3983 | if (len == 0) { |
2d21ac55 A |
3984 | if ((off + n->m_len) == m->m_len) { |
3985 | *m_last = m->m_next; | |
3986 | *m_off = 0; | |
fa4905b1 | 3987 | } else { |
2d21ac55 A |
3988 | *m_last = m; |
3989 | *m_off = off + n->m_len; | |
fa4905b1 | 3990 | } |
2d21ac55 | 3991 | break; |
fa4905b1 A |
3992 | } |
3993 | off = 0; | |
3994 | m = m->m_next; | |
3995 | np = &n->m_next; | |
3996 | } | |
fa4905b1 | 3997 | |
2d21ac55 A |
3998 | mtype_stat_inc(MT_HEADER); |
3999 | mtype_stat_add(type, needed); | |
4000 | mtype_stat_sub(MT_FREE, needed + 1); | |
4001 | ||
4002 | ASSERT(list == NULL); | |
fa4905b1 | 4003 | return (top); |
fa4905b1 | 4004 | |
2d21ac55 A |
4005 | nospace: |
4006 | if (list != NULL) | |
4007 | mcache_free_ext(m_cache(MC_MBUF), list); | |
4008 | if (top != NULL) | |
4009 | m_freem(top); | |
fa4905b1 | 4010 | MCFail++; |
2d21ac55 | 4011 | return (NULL); |
fa4905b1 A |
4012 | } |
4013 | ||
1c79356b A |
4014 | /* |
4015 | * Copy data from an mbuf chain starting "off" bytes from the beginning, | |
4016 | * continuing for "len" bytes, into the indicated buffer. | |
4017 | */ | |
2d21ac55 A |
4018 | void |
4019 | m_copydata(struct mbuf *m, int off, int len, caddr_t cp) | |
1c79356b | 4020 | { |
91447636 | 4021 | unsigned count; |
1c79356b A |
4022 | |
4023 | if (off < 0 || len < 0) | |
2d21ac55 A |
4024 | panic("m_copydata: invalid offset %d or len %d", off, len); |
4025 | ||
1c79356b | 4026 | while (off > 0) { |
2d21ac55 A |
4027 | if (m == NULL) |
4028 | panic("m_copydata: invalid mbuf chain"); | |
1c79356b A |
4029 | if (off < m->m_len) |
4030 | break; | |
4031 | off -= m->m_len; | |
4032 | m = m->m_next; | |
4033 | } | |
4034 | while (len > 0) { | |
2d21ac55 A |
4035 | if (m == NULL) |
4036 | panic("m_copydata: invalid mbuf chain"); | |
4037 | count = MIN(m->m_len - off, len); | |
4038 | bcopy(MTOD(m, caddr_t) + off, cp, count); | |
1c79356b A |
4039 | len -= count; |
4040 | cp += count; | |
4041 | off = 0; | |
4042 | m = m->m_next; | |
4043 | } | |
4044 | } | |
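/*
 * Illustrative usage sketch (not part of the original file): pulling the
 * leading bytes of a chain into a local, aligned buffer no matter how the
 * chain is fragmented.  The 20-byte length (an IPv4 header without
 * options) is only an example.
 */
static void
example_peek_header(struct mbuf *m, void *hdrbuf)
{
	/* Caller guarantees that the chain carries at least 20 bytes */
	m_copydata(m, 0, 20, (caddr_t)hdrbuf);
}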
4045 | ||
4046 | /* | |
2d21ac55 A |
4047 | * Concatenate mbuf chain n to m. Both chains must be of the same type |
4048 | * (e.g. MT_DATA). Any m_pkthdr is not updated. | |
1c79356b | 4049 | */ |
2d21ac55 A |
4050 | void |
4051 | m_cat(struct mbuf *m, struct mbuf *n) | |
1c79356b A |
4052 | { |
4053 | while (m->m_next) | |
4054 | m = m->m_next; | |
4055 | while (n) { | |
2d21ac55 | 4056 | if ((m->m_flags & M_EXT) || |
1c79356b A |
4057 | m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { |
4058 | /* just join the two chains */ | |
4059 | m->m_next = n; | |
4060 | return; | |
4061 | } | |
4062 | /* splat the data from one into the other */ | |
2d21ac55 | 4063 | bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, |
1c79356b A |
4064 | (u_int)n->m_len); |
4065 | m->m_len += n->m_len; | |
4066 | n = m_free(n); | |
4067 | } | |
4068 | } | |
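/*
 * Illustrative usage sketch (not part of the original file): because
 * m_cat() leaves m_pkthdr.len alone, a caller appending a plain data chain
 * has to account for the added bytes itself.  The helper is hypothetical.
 */
static void
example_append_chain(struct mbuf *m, struct mbuf *n, int nlen)
{
	/* n is a non-M_PKTHDR chain of the same type carrying nlen bytes */
	m_cat(m, n);
	if (m->m_flags & M_PKTHDR)
		m->m_pkthdr.len += nlen;
}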
4069 | ||
4070 | void | |
2d21ac55 | 4071 | m_adj(struct mbuf *mp, int req_len) |
1c79356b | 4072 | { |
91447636 A |
4073 | int len = req_len; |
4074 | struct mbuf *m; | |
4075 | int count; | |
1c79356b A |
4076 | |
4077 | if ((m = mp) == NULL) | |
4078 | return; | |
4079 | if (len >= 0) { | |
4080 | /* | |
4081 | * Trim from head. | |
4082 | */ | |
4083 | while (m != NULL && len > 0) { | |
4084 | if (m->m_len <= len) { | |
4085 | len -= m->m_len; | |
4086 | m->m_len = 0; | |
4087 | m = m->m_next; | |
4088 | } else { | |
4089 | m->m_len -= len; | |
4090 | m->m_data += len; | |
4091 | len = 0; | |
4092 | } | |
4093 | } | |
4094 | m = mp; | |
4095 | if (m->m_flags & M_PKTHDR) | |
4096 | m->m_pkthdr.len -= (req_len - len); | |
4097 | } else { | |
4098 | /* | |
4099 | * Trim from tail. Scan the mbuf chain, | |
4100 | * calculating its length and finding the last mbuf. | |
4101 | * If the adjustment only affects this mbuf, then just | |
4102 | * adjust and return. Otherwise, rescan and truncate | |
4103 | * after the remaining size. | |
4104 | */ | |
4105 | len = -len; | |
4106 | count = 0; | |
4107 | for (;;) { | |
4108 | count += m->m_len; | |
4109 | if (m->m_next == (struct mbuf *)0) | |
4110 | break; | |
4111 | m = m->m_next; | |
4112 | } | |
4113 | if (m->m_len >= len) { | |
4114 | m->m_len -= len; | |
4115 | m = mp; | |
4116 | if (m->m_flags & M_PKTHDR) | |
4117 | m->m_pkthdr.len -= len; | |
4118 | return; | |
4119 | } | |
4120 | count -= len; | |
4121 | if (count < 0) | |
4122 | count = 0; | |
4123 | /* | |
4124 | * Correct length for chain is "count". | |
4125 | * Find the mbuf with last data, adjust its length, | |
4126 | * and toss data from remaining mbufs on chain. | |
4127 | */ | |
4128 | m = mp; | |
4129 | if (m->m_flags & M_PKTHDR) | |
4130 | m->m_pkthdr.len = count; | |
4131 | for (; m; m = m->m_next) { | |
4132 | if (m->m_len >= count) { | |
4133 | m->m_len = count; | |
4134 | break; | |
4135 | } | |
4136 | count -= m->m_len; | |
4137 | } | |
91447636 | 4138 | while ((m = m->m_next)) |
1c79356b A |
4139 | m->m_len = 0; |
4140 | } | |
4141 | } | |
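/*
 * Illustrative usage sketch (not part of the original file): stripping a
 * 14-byte link-layer header from the front and a 4-byte trailer from the
 * back of a packet; the sizes are example values only.
 */
static void
example_strip_encap(struct mbuf *m)
{
	m_adj(m, 14);		/* positive length trims from the head */
	m_adj(m, -4);		/* negative length trims from the tail */
}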
4142 | ||
4143 | /* | |
4144 | * Rearrange an mbuf chain so that len bytes are contiguous | |
4145 | * and in the data area of an mbuf (so that mtod and dtom | |
4146 | * will work for a structure of size len). Returns the resulting | |
4147 | * mbuf chain on success, frees it and returns null on failure. | |
4148 | * If there is room, it will add up to max_protohdr-len extra bytes to the | |
4149 | * contiguous region in an attempt to avoid being called next time. | |
4150 | */ | |
4151 | int MPFail; | |
4152 | ||
4153 | struct mbuf * | |
2d21ac55 | 4154 | m_pullup(struct mbuf *n, int len) |
1c79356b | 4155 | { |
91447636 A |
4156 | struct mbuf *m; |
4157 | int count; | |
1c79356b A |
4158 | int space; |
4159 | ||
4160 | /* | |
4161 | * If first mbuf has no cluster, and has room for len bytes | |
4162 | * without shifting current data, pullup into it, | |
4163 | * otherwise allocate a new mbuf to prepend to the chain. | |
4164 | */ | |
4165 | if ((n->m_flags & M_EXT) == 0 && | |
4166 | n->m_data + len < &n->m_dat[MLEN] && n->m_next) { | |
4167 | if (n->m_len >= len) | |
4168 | return (n); | |
4169 | m = n; | |
4170 | n = n->m_next; | |
4171 | len -= m->m_len; | |
4172 | } else { | |
4173 | if (len > MHLEN) | |
4174 | goto bad; | |
2d21ac55 | 4175 | _MGET(m, M_DONTWAIT, n->m_type); |
1c79356b A |
4176 | if (m == 0) |
4177 | goto bad; | |
4178 | m->m_len = 0; | |
4179 | if (n->m_flags & M_PKTHDR) { | |
4180 | M_COPY_PKTHDR(m, n); | |
4181 | n->m_flags &= ~M_PKTHDR; | |
4182 | } | |
4183 | } | |
4184 | space = &m->m_dat[MLEN] - (m->m_data + m->m_len); | |
4185 | do { | |
2d21ac55 A |
4186 | count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len); |
4187 | bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, | |
4188 | (unsigned)count); | |
1c79356b A |
4189 | len -= count; |
4190 | m->m_len += count; | |
4191 | n->m_len -= count; | |
4192 | space -= count; | |
4193 | if (n->m_len) | |
4194 | n->m_data += count; | |
4195 | else | |
4196 | n = m_free(n); | |
4197 | } while (len > 0 && n); | |
4198 | if (len > 0) { | |
4199 | (void) m_free(m); | |
4200 | goto bad; | |
4201 | } | |
4202 | m->m_next = n; | |
4203 | return (m); | |
4204 | bad: | |
4205 | m_freem(n); | |
4206 | MPFail++; | |
4207 | return (0); | |
4208 | } | |
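/*
 * Illustrative usage sketch (not part of the original file): the classic
 * protocol-input idiom -- make the first hdrlen bytes contiguous before
 * treating m_data as a header structure.  On failure the chain has already
 * been freed by m_pullup().
 */
static struct mbuf *
example_ensure_contig(struct mbuf *m, int hdrlen)
{
	if (m->m_len < hdrlen && (m = m_pullup(m, hdrlen)) == NULL)
		return (NULL);		/* chain already freed */
	return (m);
}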
4209 | ||
4210 | /* | |
4211 | * Partition an mbuf chain in two pieces, returning the tail -- | |
4212 | * all but the first len0 bytes. In case of failure, it returns NULL and | |
4213 | * attempts to restore the chain to its original state. | |
4214 | */ | |
4215 | struct mbuf * | |
2d21ac55 | 4216 | m_split(struct mbuf *m0, int len0, int wait) |
1c79356b | 4217 | { |
91447636 | 4218 | struct mbuf *m, *n; |
1c79356b A |
4219 | unsigned len = len0, remain; |
4220 | ||
4221 | for (m = m0; m && len > m->m_len; m = m->m_next) | |
4222 | len -= m->m_len; | |
2d21ac55 A |
4223 | if (m == NULL) |
4224 | return (NULL); | |
1c79356b A |
4225 | remain = m->m_len - len; |
4226 | if (m0->m_flags & M_PKTHDR) { | |
2d21ac55 A |
4227 | _MGETHDR(n, wait, m0->m_type); |
4228 | if (n == NULL) | |
4229 | return (NULL); | |
1c79356b A |
4230 | n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; |
4231 | n->m_pkthdr.len = m0->m_pkthdr.len - len0; | |
4232 | m0->m_pkthdr.len = len0; | |
4233 | if (m->m_flags & M_EXT) | |
4234 | goto extpacket; | |
4235 | if (remain > MHLEN) { | |
4236 | /* m can't be the lead packet */ | |
4237 | MH_ALIGN(n, 0); | |
4238 | n->m_next = m_split(m, len, wait); | |
2d21ac55 | 4239 | if (n->m_next == NULL) { |
1c79356b | 4240 | (void) m_free(n); |
2d21ac55 | 4241 | return (NULL); |
1c79356b A |
4242 | } else |
4243 | return (n); | |
4244 | } else | |
4245 | MH_ALIGN(n, remain); | |
4246 | } else if (remain == 0) { | |
4247 | n = m->m_next; | |
2d21ac55 | 4248 | m->m_next = NULL; |
1c79356b A |
4249 | return (n); |
4250 | } else { | |
2d21ac55 A |
4251 | _MGET(n, wait, m->m_type); |
4252 | if (n == NULL) | |
4253 | return (NULL); | |
1c79356b A |
4254 | M_ALIGN(n, remain); |
4255 | } | |
4256 | extpacket: | |
4257 | if (m->m_flags & M_EXT) { | |
4258 | n->m_flags |= M_EXT; | |
0b4e3aa0 | 4259 | n->m_ext = m->m_ext; |
2d21ac55 | 4260 | m_incref(m); |
1c79356b A |
4261 | n->m_data = m->m_data + len; |
4262 | } else { | |
2d21ac55 | 4263 | bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain); |
1c79356b A |
4264 | } |
4265 | n->m_len = remain; | |
4266 | m->m_len = len; | |
4267 | n->m_next = m->m_next; | |
2d21ac55 | 4268 | m->m_next = NULL; |
1c79356b A |
4269 | return (n); |
4270 | } | |
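/*
 * Illustrative usage sketch (not part of the original file): splitting a
 * packet after its first mtu bytes, e.g. as a crude fragmentation step.
 * The head keeps the original pkthdr; the returned tail gets a fresh one.
 */
static struct mbuf *
example_split_at(struct mbuf *m, int mtu, int how)
{
	struct mbuf *tail;

	tail = m_split(m, mtu, how);
	if (tail == NULL)
		return (NULL);	/* m is left intact on failure */
	return (tail);		/* m now carries only the first mtu bytes */
}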
2d21ac55 | 4271 | |
1c79356b A |
4272 | /* |
4273 | * Routine to copy from device local memory into mbufs. | |
4274 | */ | |
4275 | struct mbuf * | |
2d21ac55 A |
4276 | m_devget(char *buf, int totlen, int off0, struct ifnet *ifp, |
4277 | void (*copy)(const void *, void *, size_t)) | |
1c79356b | 4278 | { |
91447636 | 4279 | struct mbuf *m; |
2d21ac55 | 4280 | struct mbuf *top = NULL, **mp = ⊤ |
91447636 A |
4281 | int off = off0, len; |
4282 | char *cp; | |
1c79356b A |
4283 | char *epkt; |
4284 | ||
4285 | cp = buf; | |
4286 | epkt = cp + totlen; | |
4287 | if (off) { | |
4288 | /* | |
4289 | * If 'off' is non-zero, packet is trailer-encapsulated, | |
4290 | * so we have to skip the type and length fields. | |
4291 | */ | |
2d21ac55 A |
4292 | cp += off + 2 * sizeof (u_int16_t); |
4293 | totlen -= 2 * sizeof (u_int16_t); | |
1c79356b | 4294 | } |
2d21ac55 A |
4295 | _MGETHDR(m, M_DONTWAIT, MT_DATA); |
4296 | if (m == NULL) | |
4297 | return (NULL); | |
1c79356b A |
4298 | m->m_pkthdr.rcvif = ifp; |
4299 | m->m_pkthdr.len = totlen; | |
4300 | m->m_len = MHLEN; | |
4301 | ||
4302 | while (totlen > 0) { | |
2d21ac55 A |
4303 | if (top != NULL) { |
4304 | _MGET(m, M_DONTWAIT, MT_DATA); | |
4305 | if (m == NULL) { | |
1c79356b | 4306 | m_freem(top); |
2d21ac55 | 4307 | return (NULL); |
1c79356b A |
4308 | } |
4309 | m->m_len = MLEN; | |
4310 | } | |
2d21ac55 | 4311 | len = MIN(totlen, epkt - cp); |
1c79356b A |
4312 | if (len >= MINCLSIZE) { |
4313 | MCLGET(m, M_DONTWAIT); | |
2d21ac55 A |
4314 | if (m->m_flags & M_EXT) { |
4315 | m->m_len = len = MIN(len, m_maxsize(MC_CL)); | |
4316 | } else { | |
4317 | /* give up when it's out of cluster mbufs */ | |
4318 | if (top != NULL) | |
4319 | m_freem(top); | |
1c79356b | 4320 | m_freem(m); |
2d21ac55 | 4321 | return (NULL); |
1c79356b A |
4322 | } |
4323 | } else { | |
4324 | /* | |
4325 | * Place initial small packet/header at end of mbuf. | |
4326 | */ | |
4327 | if (len < m->m_len) { | |
2d21ac55 A |
4328 | if (top == NULL && |
4329 | len + max_linkhdr <= m->m_len) | |
1c79356b A |
4330 | m->m_data += max_linkhdr; |
4331 | m->m_len = len; | |
2d21ac55 | 4332 | } else { |
1c79356b | 4333 | len = m->m_len; |
2d21ac55 | 4334 | } |
1c79356b A |
4335 | } |
4336 | if (copy) | |
2d21ac55 | 4337 | copy(cp, MTOD(m, caddr_t), (unsigned)len); |
1c79356b | 4338 | else |
2d21ac55 | 4339 | bcopy(cp, MTOD(m, caddr_t), (unsigned)len); |
1c79356b A |
4340 | cp += len; |
4341 | *mp = m; | |
4342 | mp = &m->m_next; | |
4343 | totlen -= len; | |
4344 | if (cp == epkt) | |
4345 | cp = buf; | |
4346 | } | |
4347 | return (top); | |
4348 | } | |
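/*
 * Illustrative usage sketch (not part of the original file): copying a
 * received frame out of a hypothetical device buffer into a fresh chain.
 * Passing a NULL copy routine makes m_devget() fall back to bcopy(), as
 * shown above.
 */
static struct mbuf *
example_rx_frame(struct ifnet *ifp, char *devbuf, int framelen)
{
	/* off0 == 0: no trailer encapsulation to skip */
	return (m_devget(devbuf, framelen, 0, ifp, NULL));
}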
4349 | ||
4350 | /* | |
2d21ac55 | 4351 | * Cluster freelist allocation check. |
1c79356b A |
4352 | */ |
4353 | static int | |
91447636 | 4354 | m_howmany(int num, size_t bufsize) |
1c79356b | 4355 | { |
2d21ac55 A |
4356 | int i = 0, j = 0; |
4357 | u_int32_t m_clusters, m_bigclusters, m_16kclusters; | |
4358 | u_int32_t m_clfree, m_bigclfree, m_16kclfree; | |
4359 | ||
4360 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
4361 | ||
4362 | m_clusters = m_total(MC_CL); | |
4363 | m_bigclusters = m_total(MC_BIGCL); | |
4364 | m_16kclusters = m_total(MC_16KCL); | |
4365 | m_clfree = m_infree(MC_CL); | |
4366 | m_bigclfree = m_infree(MC_BIGCL); | |
4367 | m_16kclfree = m_infree(MC_16KCL); | |
4368 | ||
91447636 | 4369 | /* Bail if we've maxed out the mbuf memory map */ |
2d21ac55 A |
4370 | if ((bufsize != m_maxsize(MC_16KCL) && |
4371 | (m_clusters + (m_bigclusters << 1) >= nclusters)) || | |
4372 | (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && | |
4373 | (m_16kclusters << 3) >= njcl)) { | |
4374 | #if DEBUG | |
4375 | if (bufsize == MCLBYTES && num > m_clfree) { | |
4376 | printf("m_howmany - out of small clusters, " | |
4377 | "%d short\n", num - mbstat.m_clfree); | |
4378 | } | |
4379 | #endif /* DEBUG */ | |
4380 | return (0); | |
4381 | } | |
4382 | ||
4383 | if (bufsize == m_maxsize(MC_CL)) { | |
4384 | /* Under minimum */ | |
4385 | if (m_clusters < MINCL) | |
4386 | return (MINCL - m_clusters); | |
4387 | /* Too few (free < 1/16 total) and not over maximum */ | |
4388 | if (m_clusters < m_maxlimit(MC_CL)) { | |
4389 | if (m_clfree >= MCL_LOWAT) | |
4390 | return (0); | |
4391 | if (num >= m_clfree) | |
4392 | i = num - m_clfree; | |
4393 | if (((m_clusters + num) >> 4) > m_clfree) | |
4394 | j = ((m_clusters + num) >> 4) - m_clfree; | |
4395 | i = MAX(i, j); | |
4396 | if (i + m_clusters >= m_maxlimit(MC_CL)) | |
4397 | i = m_maxlimit(MC_CL) - m_clusters; | |
91447636 | 4398 | } |
2d21ac55 A |
4399 | VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL)); |
4400 | } else if (bufsize == m_maxsize(MC_BIGCL)) { | |
4401 | /* Under minimum */ | |
4402 | if (m_bigclusters < MINBIGCL) | |
4403 | return (MINBIGCL - m_bigclusters); | |
4404 | /* Too few (free < 1/16 total) and not over maximum */ | |
4405 | if (m_bigclusters < m_maxlimit(MC_BIGCL)) { | |
4406 | if (m_bigclfree >= MBIGCL_LOWAT) | |
4407 | return (0); | |
4408 | if (num >= m_bigclfree) | |
4409 | i = num - m_bigclfree; | |
4410 | if (((m_bigclusters + num) >> 4) > m_bigclfree) | |
4411 | j = ((m_bigclusters + num) >> 4) - m_bigclfree; | |
4412 | i = MAX(i, j); | |
4413 | if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) | |
4414 | i = m_maxlimit(MC_BIGCL) - m_bigclusters; | |
4415 | } | |
4416 | VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); | |
4417 | } else { | |
4418 | VERIFY(njcl > 0); | |
4419 | /* Under minimum */ | |
4420 | if (m_16kclusters < MIN16KCL) | |
4421 | return (MIN16KCL - m_16kclusters); | |
4422 | /* Too few (free < 1/16 total) and not over maximum */ | |
4423 | if (m_16kclusters < m_maxlimit(MC_16KCL)) { | |
4424 | if (m_16kclfree >= M16KCL_LOWAT) | |
4425 | return (0); | |
4426 | if (num >= m_16kclfree) | |
4427 | i = num - m_16kclfree; | |
4428 | if (((m_16kclusters + num) >> 4) > m_16kclfree) | |
4429 | j = ((m_16kclusters + num) >> 4) - m_16kclfree; | |
4430 | i = MAX(i, j); | |
4431 | if (i + m_16kclusters >= m_maxlimit(MC_16KCL)) | |
4432 | i = m_maxlimit(MC_16KCL) - m_16kclusters; | |
4433 | } | |
4434 | VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL)); | |
91447636 | 4435 | } |
2d21ac55 A |
4436 | |
4437 | return (i); | |
1c79356b A |
4438 | } |
4439 | ||
1c79356b A |
4440 | /* |
4441 | * Copy data from a buffer back into the indicated mbuf chain, | |
4442 | * starting "off" bytes from the beginning, extending the mbuf | |
4443 | * chain if necessary. | |
4444 | */ | |
4445 | void | |
2d21ac55 | 4446 | m_copyback(struct mbuf *m0, int off, int len, caddr_t cp) |
1c79356b | 4447 | { |
91447636 A |
4448 | int mlen; |
4449 | struct mbuf *m = m0, *n; | |
1c79356b A |
4450 | int totlen = 0; |
4451 | ||
2d21ac55 | 4452 | if (m0 == NULL) |
1c79356b A |
4453 | return; |
4454 | while (off > (mlen = m->m_len)) { | |
4455 | off -= mlen; | |
4456 | totlen += mlen; | |
2d21ac55 | 4457 | if (m->m_next == NULL) { |
1c79356b | 4458 | n = m_getclr(M_DONTWAIT, m->m_type); |
2d21ac55 | 4459 | if (n == NULL) |
1c79356b | 4460 | goto out; |
2d21ac55 | 4461 | n->m_len = MIN(MLEN, len + off); |
1c79356b A |
4462 | m->m_next = n; |
4463 | } | |
4464 | m = m->m_next; | |
4465 | } | |
4466 | while (len > 0) { | |
2d21ac55 A |
4467 | mlen = MIN(m->m_len - off, len); |
4468 | bcopy(cp, off + MTOD(m, caddr_t), (unsigned)mlen); | |
1c79356b A |
4469 | cp += mlen; |
4470 | len -= mlen; | |
4471 | mlen += off; | |
4472 | off = 0; | |
4473 | totlen += mlen; | |
4474 | if (len == 0) | |
4475 | break; | |
2d21ac55 A |
4476 | if (m->m_next == NULL) { |
4477 | n = _M_GET(M_DONTWAIT, m->m_type); | |
4478 | if (n == NULL) | |
1c79356b | 4479 | break; |
2d21ac55 | 4480 | n->m_len = MIN(MLEN, len); |
1c79356b A |
4481 | m->m_next = n; |
4482 | } | |
4483 | m = m->m_next; | |
4484 | } | |
2d21ac55 A |
4485 | out: |
4486 | if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) | |
1c79356b A |
4487 | m->m_pkthdr.len = totlen; |
4488 | } | |
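/*
 * Illustrative usage sketch (not part of the original file): overwriting a
 * small field at a known offset, letting m_copyback() extend the chain if
 * the offset lies beyond the current data.  The 16-bit field is an
 * arbitrary example.
 */
static void
example_patch_field(struct mbuf *m, int off, u_int16_t val)
{
	m_copyback(m, off, sizeof (val), (caddr_t)&val);
}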
4489 | ||
2d21ac55 A |
4490 | char * |
4491 | mcl_to_paddr(char *addr) | |
4492 | { | |
4493 | int base_phys; | |
1c79356b | 4494 | |
2d21ac55 A |
4495 | if (!MBUF_IN_MAP(addr)) |
4496 | return (NULL); | |
91447636 | 4497 | base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT]; |
1c79356b A |
4498 | |
4499 | if (base_phys == 0) | |
2d21ac55 | 4500 | return (NULL); |
91447636 | 4501 | return ((char *)((int)base_phys | ((int)addr & PGOFSET))); |
1c79356b A |
4502 | } |
4503 | ||
4504 | /* | |
4505 | * Dup the mbuf chain passed in. The whole thing. No cute additional cruft. | |
4506 | * And really copy the thing. That way, we don't "precompute" checksums | |
2d21ac55 A |
4507 | * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for |
4508 | * small packets, don't dup into a cluster. That way received packets | |
4509 | * don't take up too much room in the sockbuf (cf. sbspace()). | |
1c79356b A |
4510 | */ |
4511 | int MDFail; | |
4512 | ||
4513 | struct mbuf * | |
91447636 | 4514 | m_dup(struct mbuf *m, int how) |
2d21ac55 | 4515 | { |
91447636 | 4516 | struct mbuf *n, **np; |
1c79356b A |
4517 | struct mbuf *top; |
4518 | int copyhdr = 0; | |
4519 | ||
4520 | np = ⊤ | |
2d21ac55 | 4521 | top = NULL; |
1c79356b A |
4522 | if (m->m_flags & M_PKTHDR) |
4523 | copyhdr = 1; | |
4524 | ||
4525 | /* | |
4526 | * Quick check: if we have one mbuf and its data fits in an | |
4527 | * mbuf with packet header, just copy and go. | |
4528 | */ | |
2d21ac55 A |
4529 | if (m->m_next == NULL) { |
4530 | /* Then just move the data into an mbuf and be done... */ | |
4531 | if (copyhdr) { | |
4532 | if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) { | |
4533 | if ((n = _M_GETHDR(how, m->m_type)) == NULL) | |
4534 | return (NULL); | |
1c79356b | 4535 | n->m_len = m->m_len; |
3a60a9f5 A |
4536 | m_dup_pkthdr(n, m, how); |
4537 | bcopy(m->m_data, n->m_data, m->m_len); | |
2d21ac55 | 4538 | return (n); |
1c79356b | 4539 | } |
2d21ac55 A |
4540 | } else if (m->m_len <= MLEN) { |
4541 | if ((n = _M_GET(how, m->m_type)) == NULL) | |
4542 | return (NULL); | |
1c79356b A |
4543 | bcopy(m->m_data, n->m_data, m->m_len); |
4544 | n->m_len = m->m_len; | |
2d21ac55 | 4545 | return (n); |
1c79356b A |
4546 | } |
4547 | } | |
2d21ac55 | 4548 | while (m != NULL) { |
1c79356b A |
4549 | #if BLUE_DEBUG |
4550 | kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len, | |
2d21ac55 | 4551 | m->m_data); |
1c79356b A |
4552 | #endif |
4553 | if (copyhdr) | |
2d21ac55 | 4554 | n = _M_GETHDR(how, m->m_type); |
1c79356b | 4555 | else |
2d21ac55 A |
4556 | n = _M_GET(how, m->m_type); |
4557 | if (n == NULL) | |
1c79356b | 4558 | goto nospace; |
2d21ac55 A |
4559 | if (m->m_flags & M_EXT) { |
4560 | if (m->m_len <= m_maxsize(MC_CL)) | |
4561 | MCLGET(n, how); | |
4562 | else if (m->m_len <= m_maxsize(MC_BIGCL)) | |
4563 | n = m_mbigget(n, how); | |
4564 | else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) | |
4565 | n = m_m16kget(n, how); | |
4566 | if (!(n->m_flags & M_EXT)) { | |
4567 | (void) m_free(n); | |
1c79356b | 4568 | goto nospace; |
2d21ac55 | 4569 | } |
1c79356b A |
4570 | } |
4571 | *np = n; | |
2d21ac55 A |
4572 | if (copyhdr) { |
4573 | /* Don't use M_COPY_PKTHDR: preserve m_data */ | |
3a60a9f5 | 4574 | m_dup_pkthdr(n, m, how); |
1c79356b | 4575 | copyhdr = 0; |
2d21ac55 | 4576 | if (!(n->m_flags & M_EXT)) |
1c79356b A |
4577 | n->m_data = n->m_pktdat; |
4578 | } | |
4579 | n->m_len = m->m_len; | |
4580 | /* | |
4581 | * Get the dup on the same bdry as the original | |
4582 | * Assume that the two mbufs have the same offset to data area | |
2d21ac55 | 4583 | * (up to word boundaries) |
1c79356b | 4584 | */ |
2d21ac55 | 4585 | bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len); |
1c79356b A |
4586 | m = m->m_next; |
4587 | np = &n->m_next; | |
4588 | #if BLUE_DEBUG | |
4589 | kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len, | |
2d21ac55 | 4590 | n->m_data); |
1c79356b A |
4591 | #endif |
4592 | } | |
4593 | ||
2d21ac55 | 4594 | if (top == NULL) |
1c79356b A |
4595 | MDFail++; |
4596 | return (top); | |
2d21ac55 A |
4597 | |
4598 | nospace: | |
1c79356b A |
4599 | m_freem(top); |
4600 | MDFail++; | |
2d21ac55 | 4601 | return (NULL); |
1c79356b A |
4602 | } |
4603 | ||
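m_dup() above links the copy together with the tail-pointer idiom — `np = &top`, then `*np = n; np = &n->m_next;` per node — so the first node needs no special case and a partial chain can be freed whole on failure. A minimal userland sketch of that idiom, with a hypothetical `node` type standing in for the mbuf:

/* Userland sketch of the tail-pointer duplication idiom used by m_dup(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
    struct node *next;
    char data[16];
};

static struct node *
dup_chain(const struct node *m)
{
    struct node *top = NULL, **np = &top;

    for (; m != NULL; m = m->next) {
        struct node *n = malloc(sizeof (*n));
        if (n == NULL) {                       /* undo the partial copy */
            while (top != NULL) {
                struct node *t = top->next;
                free(top);
                top = t;
            }
            return (NULL);
        }
        memcpy(n->data, m->data, sizeof (n->data));
        n->next = NULL;
        *np = n;                               /* link at the tail */
        np = &n->next;                         /* advance the tail pointer */
    }
    return (top);
}

int
main(void)
{
    struct node b = { NULL, "world" }, a = { &b, "hello" };
    struct node *copy = dup_chain(&a);

    if (copy != NULL)
        printf("%s %s\n", copy->data, copy->next->data);
    return (0);
}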
2d21ac55 A |
4604 | #define MBUF_MULTIPAGES(m) \ |
4605 | (((m)->m_flags & M_EXT) && \ | |
4606 | ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \ | |
4607 | (!IS_P2ALIGNED((m)->m_data, NBPG) && \ | |
4608 | P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len)))) | |
4609 | ||
4610 | static struct mbuf * | |
4611 | m_expand(struct mbuf *m, struct mbuf **last) | |
9bccf70c | 4612 | { |
2d21ac55 A |
4613 | struct mbuf *top = NULL; |
4614 | struct mbuf **nm = &top; |
4615 | uintptr_t data0, data; | |
4616 | unsigned int len0, len; | |
4617 | ||
4618 | VERIFY(MBUF_MULTIPAGES(m)); | |
4619 | VERIFY(m->m_next == NULL); | |
4620 | data0 = (uintptr_t)m->m_data; | |
4621 | len0 = m->m_len; | |
4622 | *last = top; | |
4623 | ||
4624 | for (;;) { | |
4625 | struct mbuf *n; | |
4626 | ||
4627 | data = data0; | |
4628 | if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG) | |
4629 | len = NBPG; | |
4630 | else if (!IS_P2ALIGNED(data, NBPG) && | |
4631 | P2ROUNDUP(data, NBPG) < (data + len0)) | |
4632 | len = P2ROUNDUP(data, NBPG) - data; | |
4633 | else | |
4634 | len = len0; | |
4635 | ||
4636 | VERIFY(len > 0); | |
4637 | VERIFY(m->m_flags & M_EXT); | |
4638 | m->m_data = (void *)data; | |
4639 | m->m_len = len; | |
4640 | ||
4641 | *nm = *last = m; | |
4642 | nm = &m->m_next; | |
4643 | m->m_next = NULL; | |
4644 | ||
4645 | data0 += len; | |
4646 | len0 -= len; | |
4647 | if (len0 == 0) | |
4648 | break; | |
4649 | ||
4650 | n = _M_RETRY(M_DONTWAIT, MT_DATA); | |
4651 | if (n == NULL) { | |
4652 | m_freem(top); | |
4653 | top = *last = NULL; | |
4654 | break; | |
4655 | } | |
4656 | ||
4657 | n->m_ext = m->m_ext; | |
4658 | m_incref(m); | |
4659 | n->m_flags |= M_EXT; | |
4660 | m = n; | |
4661 | } | |
4662 | return (top); | |
9bccf70c A |
4663 | } |
4664 | ||
2d21ac55 A |
4665 | struct mbuf * |
4666 | m_normalize(struct mbuf *m) | |
9bccf70c | 4667 | { |
2d21ac55 A |
4668 | struct mbuf *top = NULL; |
4669 | struct mbuf **nm = &top; |
4670 | boolean_t expanded = FALSE; | |
4671 | ||
4672 | while (m != NULL) { | |
4673 | struct mbuf *n; | |
4674 | ||
4675 | n = m->m_next; | |
4676 | m->m_next = NULL; | |
4677 | ||
4678 | /* Does the data cross one or more page boundaries? */ | |
4679 | if (MBUF_MULTIPAGES(m)) { | |
4680 | struct mbuf *last; | |
4681 | if ((m = m_expand(m, &last)) == NULL) { | |
4682 | m_freem(n); | |
4683 | m_freem(top); | |
4684 | top = NULL; | |
4685 | break; | |
4686 | } | |
4687 | *nm = m; | |
4688 | nm = &last->m_next; | |
4689 | expanded = TRUE; | |
4690 | } else { | |
4691 | *nm = m; | |
4692 | nm = &m->m_next; | |
4693 | } | |
4694 | m = n; | |
4695 | } | |
4696 | if (expanded) | |
4697 | atomic_add_32(&mb_normalized, 1); | |
4698 | return (top); | |
9bccf70c A |
4699 | } |
4700 | ||
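m_expand() above splits one large external buffer so that no resulting mbuf crosses a page boundary: an aligned start yields a full page, an unaligned start yields only the bytes up to the next boundary, and the final remainder is taken whole. The arithmetic can be exercised on its own; the sketch below assumes a 4 KB page purely for illustration.

/* Standalone check of the segment-length arithmetic used by m_expand(). */
#include <stdio.h>
#include <stdint.h>

#define PGSZ 4096u                             /* illustrative page size */
#define P2RNDUP(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))
#define P2ALIGNED(x, a) (((x) & ((uintptr_t)(a) - 1)) == 0)

/* Length of the next piece so that no piece crosses a page boundary. */
static unsigned int
piece_len(uintptr_t data, unsigned int len)
{
    if (P2ALIGNED(data, PGSZ) && len > PGSZ)
        return (PGSZ);
    if (!P2ALIGNED(data, PGSZ) && P2RNDUP(data, PGSZ) < data + len)
        return ((unsigned int)(P2RNDUP(data, PGSZ) - data));
    return (len);
}

int
main(void)
{
    uintptr_t data = 0x1000f00;                /* unaligned start */
    unsigned int len = 10000;

    while (len > 0) {
        unsigned int n = piece_len(data, len);
        printf("piece at %#lx, %u bytes\n", (unsigned long)data, n);
        data += n;
        len -= n;
    }
    return (0);
}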
9bccf70c A |
4701 | void |
4702 | m_mchtype(struct mbuf *m, int t) | |
4703 | { | |
2d21ac55 A |
4704 | mtype_stat_inc(t); |
4705 | mtype_stat_dec(m->m_type); | |
4706 | (m)->m_type = t; | |
9bccf70c A |
4707 | } |
4708 | ||
2d21ac55 A |
4709 | void * |
4710 | m_mtod(struct mbuf *m) | |
9bccf70c | 4711 | { |
2d21ac55 | 4712 | return (MTOD(m, void *)); |
9bccf70c A |
4713 | } |
4714 | ||
2d21ac55 A |
4715 | struct mbuf * |
4716 | m_dtom(void *x) | |
9bccf70c A |
4717 | { |
4718 | return ((struct mbuf *)((u_long)(x) & ~(MSIZE-1))); | |
4719 | } | |
4720 | ||
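m_dtom() above recovers the owning mbuf from an interior pointer by masking off the low bits of the address, which works only because mbufs are MSIZE bytes and MSIZE-aligned. The same trick in userland, with an aligned allocation and an invented OBJSIZE standing in for MSIZE:

/* The align-and-mask trick behind m_dtom(), shown with posix_memalign(). */
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define OBJSIZE 256u                           /* stand-in for MSIZE; power of two */

struct obj {
    int tag;
    char data[OBJSIZE - sizeof (int)];
};

int
main(void)
{
    void *p;

    /* Allocate an OBJSIZE-aligned object, as the mbuf map guarantees. */
    if (posix_memalign(&p, OBJSIZE, sizeof (struct obj)) != 0)
        return (1);
    struct obj *o = p;
    o->tag = 42;

    /* From a pointer into data[], mask back to the object base. */
    char *inner = &o->data[100];
    struct obj *back = (struct obj *)
        ((uintptr_t)inner & ~(uintptr_t)(OBJSIZE - 1));

    printf("tag via interior pointer: %d\n", back->tag);
    free(p);
    return (0);
}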
2d21ac55 A |
4721 | void |
4722 | m_mcheck(struct mbuf *m) | |
9bccf70c | 4723 | { |
2d21ac55 | 4724 | _MCHECK(m); |
9bccf70c A |
4725 | } |
4726 | ||
2d21ac55 A |
4727 | /* |
4728 | * Inform the corresponding mcache(s) that there's a waiter below. | |
4729 | */ | |
4730 | static void | |
4731 | mbuf_waiter_inc(mbuf_class_t class, boolean_t comp) | |
9bccf70c | 4732 | { |
2d21ac55 A |
4733 | mcache_waiter_inc(m_cache(class)); |
4734 | if (comp) { | |
4735 | if (class == MC_CL) { | |
4736 | mcache_waiter_inc(m_cache(MC_MBUF_CL)); | |
4737 | } else if (class == MC_BIGCL) { | |
4738 | mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); | |
4739 | } else if (class == MC_16KCL) { | |
4740 | mcache_waiter_inc(m_cache(MC_MBUF_16KCL)); | |
4741 | } else { | |
4742 | mcache_waiter_inc(m_cache(MC_MBUF_CL)); | |
4743 | mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); | |
4744 | } | |
4745 | } | |
9bccf70c A |
4746 | } |
4747 | ||
2d21ac55 A |
4748 | /* |
4749 | * Inform the corresponding mcache(s) that there's no more waiter below. | |
4750 | */ | |
4751 | static void | |
4752 | mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) | |
4753 | { | |
4754 | mcache_waiter_dec(m_cache(class)); | |
4755 | if (comp) { | |
4756 | if (class == MC_CL) { | |
4757 | mcache_waiter_dec(m_cache(MC_MBUF_CL)); | |
4758 | } else if (class == MC_BIGCL) { | |
4759 | mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); | |
4760 | } else if (class == MC_16KCL) { | |
4761 | mcache_waiter_dec(m_cache(MC_MBUF_16KCL)); | |
4762 | } else { | |
4763 | mcache_waiter_dec(m_cache(MC_MBUF_CL)); | |
4764 | mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); | |
4765 | } | |
4766 | } | |
4767 | } | |
9bccf70c | 4768 | |
2d21ac55 A |
4769 | /* |
4770 | * Called during blocking allocation. Returns TRUE if one or more objects | |
4771 | * are available at the per-CPU cache layer and the allocation should be | |
4772 | * retried at that level. | |
4773 | */ | |
4774 | static boolean_t | |
4775 | mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) | |
9bccf70c | 4776 | { |
2d21ac55 A |
4777 | boolean_t mcache_retry = FALSE; |
4778 | ||
4779 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
4780 | ||
4781 | /* Check if there's anything at the cache layer */ | |
4782 | if (mbuf_cached_above(class, wait)) { | |
4783 | mcache_retry = TRUE; | |
4784 | goto done; | |
4785 | } | |
4786 | ||
4787 | /* Nothing? Then try hard to get it from somewhere */ | |
4788 | m_reclaim(class, num, (wait & MCR_COMP)); | |
4789 | ||
4790 | /* We tried hard and got something? */ | |
4791 | if (m_infree(class) > 0) { | |
4792 | mbstat.m_wait++; | |
4793 | goto done; | |
4794 | } else if (mbuf_cached_above(class, wait)) { | |
4795 | mbstat.m_wait++; | |
4796 | mcache_retry = TRUE; | |
4797 | goto done; | |
4798 | } else if (wait & MCR_TRYHARD) { | |
4799 | mcache_retry = TRUE; | |
4800 | goto done; | |
4801 | } | |
4802 | ||
4803 | /* | |
4804 | * There's really nothing for us right now; inform the | |
4805 | * cache(s) that there is a waiter below and go to sleep. | |
4806 | */ | |
4807 | mbuf_waiter_inc(class, (wait & MCR_COMP)); | |
4808 | ||
4809 | VERIFY(!(wait & MCR_NOSLEEP)); | |
4810 | mb_waiters++; | |
4811 | (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL); | |
4812 | ||
4813 | /* We are now up; stop getting notified until next round */ | |
4814 | mbuf_waiter_dec(class, (wait & MCR_COMP)); | |
4815 | ||
4816 | /* We waited and got something */ | |
4817 | if (m_infree(class) > 0) { | |
4818 | mbstat.m_wait++; | |
4819 | goto done; | |
4820 | } else if (mbuf_cached_above(class, wait)) { | |
4821 | mbstat.m_wait++; | |
4822 | mcache_retry = TRUE; | |
4823 | } | |
4824 | done: | |
4825 | return (mcache_retry); | |
9bccf70c A |
4826 | } |
4827 | ||
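mbuf_sleep() above registers interest with mbuf_waiter_inc() before blocking, sleeps on the wait channel with msleep() while holding the mbuf lock, and deregisters after waking, so the freeing side can tell whether a wakeup is needed at all. A rough userland analogue of that waiter-count-then-sleep pattern using POSIX threads; the pool names and counters here are illustrative only.

/* Userland analogue of the waiter-count-then-sleep pattern in mbuf_sleep(). */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pool_cv = PTHREAD_COND_INITIALIZER;
static int pool_free;                          /* objects available */
static int pool_waiters;                       /* threads blocked in pool_get() */

static void
pool_get(void)
{
    pthread_mutex_lock(&pool_lock);
    while (pool_free == 0) {
        pool_waiters++;                        /* register before sleeping */
        pthread_cond_wait(&pool_cv, &pool_lock);
        pool_waiters--;                        /* deregister after wakeup */
    }
    pool_free--;
    pthread_mutex_unlock(&pool_lock);
}

static void
pool_put(void)
{
    pthread_mutex_lock(&pool_lock);
    pool_free++;
    if (pool_waiters > 0)                      /* wake only if someone waits */
        pthread_cond_signal(&pool_cv);
    pthread_mutex_unlock(&pool_lock);
}

static void *
consumer(void *arg)
{
    (void)arg;
    pool_get();
    printf("got an object\n");
    return (NULL);
}

int
main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, consumer, NULL);
    pool_put();                                /* releases the blocked consumer */
    pthread_join(t, NULL);
    return (0);
}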
91447636 | 4828 | static void |
2d21ac55 | 4829 | mbuf_worker_thread(void) |
1c79356b | 4830 | { |
2d21ac55 A |
4831 | int mbuf_expand; |
4832 | ||
91447636 | 4833 | while (1) { |
2d21ac55 A |
4834 | lck_mtx_lock(mbuf_mlock); |
4835 | ||
4836 | mbuf_expand = 0; | |
91447636 A |
4837 | if (mbuf_expand_mcl) { |
4838 | int n; | |
2d21ac55 A |
4839 | |
4840 | /* Adjust to current number of clusters in use */ |
4841 | n = mbuf_expand_mcl - | |
4842 | (m_total(MC_CL) - m_infree(MC_CL)); | |
4843 | if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) | |
4844 | n = m_maxlimit(MC_CL) - m_total(MC_CL); | |
91447636 | 4845 | mbuf_expand_mcl = 0; |
2d21ac55 A |
4846 | |
4847 | if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0) | |
4848 | mbuf_expand++; | |
91447636 A |
4849 | } |
4850 | if (mbuf_expand_big) { | |
4851 | int n; | |
2d21ac55 A |
4852 | |
4853 | /* Adjust to current number of 4 KB clusters in use */ |
4854 | n = mbuf_expand_big - | |
4855 | (m_total(MC_BIGCL) - m_infree(MC_BIGCL)); | |
4856 | if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) | |
4857 | n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL); | |
91447636 | 4858 | mbuf_expand_big = 0; |
2d21ac55 A |
4859 | |
4860 | if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0) | |
4861 | mbuf_expand++; | |
4862 | } | |
4863 | if (mbuf_expand_16k) { | |
4864 | int n; | |
4865 | ||
4866 | /* Adjust to current number of 16 KB clusters in use */ |
4867 | n = mbuf_expand_16k - | |
4868 | (m_total(MC_16KCL) - m_infree(MC_16KCL)); | |
4869 | if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) | |
4870 | n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL); | |
4871 | mbuf_expand_16k = 0; | |
4872 | ||
4873 | if (n > 0) | |
4874 | (void) freelist_populate(MC_16KCL, n, M_WAIT); | |
4875 | } | |
4876 | ||
4877 | /* | |
4878 | * Because we can run out of memory before filling the mbuf | |
4879 | * map, we should not allocate more clusters than there are | |
4880 | * mbufs -- otherwise we could have a large number of useless | |
4881 | * clusters allocated. | |
91447636 | 4882 | */ |
2d21ac55 A |
4883 | if (mbuf_expand) { |
4884 | while (m_total(MC_MBUF) < | |
4885 | (m_total(MC_BIGCL) + m_total(MC_CL))) { | |
4886 | if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) | |
4887 | break; | |
4888 | } | |
91447636 | 4889 | } |
2d21ac55 A |
4890 | |
4891 | lck_mtx_unlock(mbuf_mlock); | |
4892 | ||
4893 | assert_wait(&mbuf_worker_run, THREAD_UNINT); | |
4894 | (void) thread_block((thread_continue_t)mbuf_worker_thread); | |
91447636 | 4895 | } |
1c79356b A |
4896 | } |
4897 | ||
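Each branch of the worker loop above uses the same arithmetic: take the requested backlog, subtract what is already in use, and clamp so the class never grows past its configured limit. A standalone check of that clamping, with invented numbers and names:

/* Standalone check of the "grow but clamp to the class limit" arithmetic. */
#include <stdio.h>

static int
clusters_to_add(int requested, int total, int infree, int maxlimit)
{
    int n = requested - (total - infree);      /* discount ones already in use */

    if (n + total > maxlimit)                  /* never exceed the limit */
        n = maxlimit - total;
    return (n > 0 ? n : 0);
}

int
main(void)
{
    /* 100 requested, 80 exist with 30 free, limit 120 -> add 40. */
    printf("%d\n", clusters_to_add(100, 80, 30, 120));
    /* Same request, but limit 90 -> clamp to 10. */
    printf("%d\n", clusters_to_add(100, 80, 30, 90));
    return (0);
}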
91447636 | 4898 | static void |
2d21ac55 | 4899 | mbuf_worker_thread_init(void) |
55e303ae | 4900 | { |
2d21ac55 A |
4901 | mbuf_worker_ready++; |
4902 | mbuf_worker_thread(); | |
55e303ae | 4903 | } |
1c79356b | 4904 | |
2d21ac55 A |
4905 | static mcl_slab_t * |
4906 | slab_get(void *buf) | |
4907 | { | |
4908 | mcl_slabg_t *slg; | |
4909 | unsigned int ix, k; | |
4910 | ||
4911 | lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); | |
4912 | ||
4913 | VERIFY(MBUF_IN_MAP(buf)); | |
4914 | ix = ((char *)buf - (char *)mbutl) >> MBSHIFT; | |
4915 | VERIFY(ix < maxslabgrp); | |
4916 | ||
4917 | if ((slg = slabstbl[ix]) == NULL) { | |
4918 | /* | |
4919 | * In the current implementation, we never shrink the memory | |
4920 | * pool (hence the cluster map); if we attempt to reallocate | |
4921 | * a cluster group when it's already allocated, panic since | |
4922 | * this is a sign of memory corruption (slabstbl[ix] got | |
4923 | * nullified). This also means that there shouldn't be any | |
4924 | * hole in the kernel sub-map for the mbuf pool. | |
4925 | */ | |
4926 | ++slabgrp; | |
4927 | VERIFY(ix < slabgrp); | |
4928 | /* | |
4929 | * Slab expansion can only be done single-threaded; when | |
4930 | * we get here, it must be as a result of m_clalloc() which | |
4931 | * is serialized and therefore mb_clalloc_busy must be set. | |
4932 | */ | |
4933 | VERIFY(mb_clalloc_busy); | |
4934 | lck_mtx_unlock(mbuf_mlock); | |
4935 | ||
4936 | /* This is a new buffer; create the slabs group for it */ | |
4937 | MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP, | |
4938 | M_WAITOK | M_ZERO); | |
4939 | VERIFY(slg != NULL); | |
4940 | ||
4941 | lck_mtx_lock(mbuf_mlock); | |
4942 | /* | |
4943 | * No other thread could have gone into m_clalloc() after | |
4944 | * we dropped the lock above, so verify that it's true. | |
4945 | */ | |
4946 | VERIFY(mb_clalloc_busy); | |
4947 | ||
4948 | slabstbl[ix] = slg; | |
4949 | ||
4950 | /* Chain each slab in the group to its forward neighbor */ | |
4951 | for (k = 1; k < NSLABSPMB; k++) | |
4952 | slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k]; | |
4953 | VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL); | |
4954 | ||
4955 | /* And chain the last slab in the previous group to this */ | |
4956 | if (ix > 0) { | |
4957 | VERIFY(slabstbl[ix - 1]-> | |
4958 | slg_slab[NSLABSPMB - 1].sl_next == NULL); | |
4959 | slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next = | |
4960 | &slg->slg_slab[0]; | |
4961 | } | |
4962 | } | |
4963 | ||
4964 | ix = MTOCL(buf) % NSLABSPMB; | |
4965 | VERIFY(ix < NSLABSPMB); | |
4966 | ||
4967 | return (&slg->slg_slab[ix]); | |
4968 | } | |
4969 | ||
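slab_get() above resolves a buffer in two steps: a coarse group index computed from the address (the group itself is allocated lazily on first touch, as the comment explains), then a slot index within that group. The sketch below reproduces that lookup shape in userland; the sizes and names are invented and much smaller than the kernel's.

/* Userland sketch of the two-level, lazily populated lookup in slab_get(). */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define SLOTSZ          256u                   /* bytes covered by one slot */
#define SLOTS_PER_GROUP 16u                    /* slots in one lazily built group */
#define NGROUPS         64u                    /* groups covering the whole arena */

struct slot { int refcnt; };
struct group { struct slot slots[SLOTS_PER_GROUP]; };

static struct group *groups[NGROUPS];          /* lazily filled, like slabstbl[] */
static char arena[NGROUPS * SLOTS_PER_GROUP * SLOTSZ];

static struct slot *
slot_get(void *buf)
{
    uintptr_t off = (uintptr_t)buf - (uintptr_t)arena;
    unsigned int gix = off / (SLOTS_PER_GROUP * SLOTSZ);
    unsigned int six = (off / SLOTSZ) % SLOTS_PER_GROUP;

    if (groups[gix] == NULL) {                 /* first touch: build the group */
        groups[gix] = calloc(1, sizeof (struct group));
        if (groups[gix] == NULL)
            abort();
    }
    return (&groups[gix]->slots[six]);
}

int
main(void)
{
    struct slot *sp = slot_get(arena + 5000);

    sp->refcnt++;
    printf("group %u populated, refcnt %d\n",
        5000u / (SLOTS_PER_GROUP * SLOTSZ), sp->refcnt);
    return (0);
}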
4970 | static void | |
4971 | slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags, | |
4972 | void *base, void *head, unsigned int len, int refcnt, int chunks) | |
4973 | { | |
4974 | sp->sl_class = class; | |
4975 | sp->sl_flags = flags; | |
4976 | sp->sl_base = base; | |
4977 | sp->sl_head = head; | |
4978 | sp->sl_len = len; | |
4979 | sp->sl_refcnt = refcnt; | |
4980 | sp->sl_chunks = chunks; | |
4981 | slab_detach(sp); | |
4982 | } | |
4983 | ||
4984 | static void | |
4985 | slab_insert(mcl_slab_t *sp, mbuf_class_t class) | |
4986 | { | |
4987 | VERIFY(slab_is_detached(sp)); | |
4988 | m_slab_cnt(class)++; | |
4989 | TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link); | |
4990 | sp->sl_flags &= ~SLF_DETACHED; | |
4991 | if (class == MC_BIGCL) { | |
4992 | sp = sp->sl_next; | |
4993 | /* Next slab must already be present */ | |
4994 | VERIFY(sp != NULL); | |
4995 | VERIFY(slab_is_detached(sp)); | |
4996 | sp->sl_flags &= ~SLF_DETACHED; | |
4997 | } else if (class == MC_16KCL) { | |
4998 | int k; | |
4999 | for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { | |
5000 | sp = sp->sl_next; | |
5001 | /* Next slab must already be present */ | |
5002 | VERIFY(sp != NULL); | |
5003 | VERIFY(slab_is_detached(sp)); | |
5004 | sp->sl_flags &= ~SLF_DETACHED; | |
5005 | } | |
5006 | } | |
5007 | } | |
5008 | ||
5009 | static void | |
5010 | slab_remove(mcl_slab_t *sp, mbuf_class_t class) | |
5011 | { | |
5012 | VERIFY(!slab_is_detached(sp)); | |
5013 | VERIFY(m_slab_cnt(class) > 0); | |
5014 | m_slab_cnt(class)--; | |
5015 | TAILQ_REMOVE(&m_slablist(class), sp, sl_link); | |
5016 | slab_detach(sp); | |
5017 | if (class == MC_BIGCL) { | |
5018 | sp = sp->sl_next; | |
5019 | /* Next slab must already be present */ | |
5020 | VERIFY(sp != NULL); | |
5021 | VERIFY(!slab_is_detached(sp)); | |
5022 | slab_detach(sp); | |
5023 | } else if (class == MC_16KCL) { | |
5024 | int k; | |
5025 | for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { | |
5026 | sp = sp->sl_next; | |
5027 | /* Next slab must already be present */ | |
5028 | VERIFY(sp != NULL); | |
5029 | VERIFY(!slab_is_detached(sp)); | |
5030 | slab_detach(sp); | |
5031 | } | |
5032 | } | |
5033 | } | |
5034 | ||
5035 | static boolean_t | |
5036 | slab_inrange(mcl_slab_t *sp, void *buf) | |
5037 | { | |
5038 | return ((uintptr_t)buf >= (uintptr_t)sp->sl_base && | |
5039 | (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len)); | |
5040 | } | |
5041 | ||
5042 | #undef panic
5043 | ||
5044 | static void | |
5045 | slab_nextptr_panic(mcl_slab_t *sp, void *addr) | |
5046 | { | |
5047 | int i; | |
5048 | unsigned int chunk_len = sp->sl_len / sp->sl_chunks; | |
5049 | uintptr_t buf = (uintptr_t)sp->sl_base; | |
5050 | ||
5051 | for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) { | |
5052 | void *next = ((mcache_obj_t *)buf)->obj_next; | |
5053 | if (next != addr) | |
5054 | continue; | |
5055 | if (mclaudit == NULL) { | |
5056 | if (next != NULL && !MBUF_IN_MAP(next)) { | |
5057 | mcache_t *cp = m_cache(sp->sl_class); | |
5058 | panic("%s: %s buffer %p in slab %p modified " | |
5059 | "after free at offset 0: %p out of range " | |
5060 | "[%p-%p)\n", __func__, cp->mc_name, | |
5061 | (void *)buf, sp, next, mbutl, embutl); | |
5062 | /* NOTREACHED */ | |
5063 | } | |
5064 | } else { | |
5065 | mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class, | |
5066 | (mcache_obj_t *)buf); | |
5067 | mcl_audit_verify_nextptr(next, mca); | |
5068 | } | |
5069 | } | |
5070 | } | |
5071 | ||
5072 | static void | |
5073 | slab_detach(mcl_slab_t *sp) | |
5074 | { | |
5075 | sp->sl_link.tqe_next = (mcl_slab_t *)-1; | |
5076 | sp->sl_link.tqe_prev = (mcl_slab_t **)-1; | |
5077 | sp->sl_flags |= SLF_DETACHED; | |
5078 | } | |
5079 | ||
5080 | static boolean_t | |
5081 | slab_is_detached(mcl_slab_t *sp) | |
5082 | { | |
5083 | return ((intptr_t)sp->sl_link.tqe_next == -1 && | |
5084 | (intptr_t)sp->sl_link.tqe_prev == -1 && | |
5085 | (sp->sl_flags & SLF_DETACHED)); | |
5086 | } | |
5087 | ||
5088 | static void | |
5089 | mcl_audit_init(void *buf, mcache_audit_t **mca_list, | |
5090 | mcache_obj_t **con_list, size_t con_size, unsigned int num) | |
5091 | { | |
5092 | mcache_audit_t *mca, *mca_tail; | |
5093 | mcache_obj_t *con = NULL; | |
5094 | boolean_t save_contents = (con_list != NULL); | |
5095 | unsigned int i, ix; | |
5096 | ||
5097 | ASSERT(num <= NMBPCL); | |
5098 | ASSERT(con_list == NULL || con_size != 0); | |
5099 | ||
5100 | ix = MTOCL(buf); | |
5101 | /* Make sure we haven't been here before */ | |
5102 | for (i = 0; i < NMBPCL; i++) | |
5103 | VERIFY(mclaudit[ix].cl_audit[i] == NULL); | |
5104 | ||
5105 | mca = mca_tail = *mca_list; | |
5106 | if (save_contents) | |
5107 | con = *con_list; | |
5108 | ||
5109 | for (i = 0; i < num; i++) { | |
5110 | mcache_audit_t *next; | |
5111 | ||
5112 | next = mca->mca_next; | |
5113 | bzero(mca, sizeof (*mca)); | |
5114 | mca->mca_next = next; | |
5115 | mclaudit[ix].cl_audit[i] = mca; | |
5116 | ||
5117 | /* Attach the contents buffer if requested */ | |
5118 | if (save_contents) { | |
5119 | VERIFY(con != NULL); | |
5120 | mca->mca_contents_size = con_size; | |
5121 | mca->mca_contents = con; | |
5122 | con = con->obj_next; | |
5123 | bzero(mca->mca_contents, mca->mca_contents_size); | |
5124 | } | |
5125 | ||
5126 | mca_tail = mca; | |
5127 | mca = mca->mca_next; | |
5128 | } | |
91447636 | 5129 | |
2d21ac55 A |
5130 | if (save_contents) |
5131 | *con_list = con; | |
5132 | ||
5133 | *mca_list = mca_tail->mca_next; | |
5134 | mca_tail->mca_next = NULL; | |
5135 | } | |
5136 | ||
5137 | /* | |
5138 | * Given an address of a buffer (mbuf/cluster/big cluster), return | |
5139 | * the corresponding audit structure for that buffer. | |
5140 | */ | |
5141 | static mcache_audit_t * | |
5142 | mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o) | |
5143 | { | |
5144 | mcache_audit_t *mca = NULL; | |
5145 | int ix = MTOCL(o); | |
5146 | ||
5147 | VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG))); | |
5148 | ||
5149 | switch (class) { | |
5150 | case MC_MBUF: | |
5151 | /* | |
5152 | * For the mbuf case, find the index of the cluster | |
5153 | * used by the mbuf and use that index to locate the | |
5154 | * base address of the cluster. Then find out the | |
5155 | * mbuf index relative to the cluster base and use | |
5156 | * it to locate the audit structure. | |
5157 | */ | |
5158 | VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL); | |
5159 | mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)]; | |
5160 | break; | |
5161 | ||
5162 | case MC_CL: | |
5163 | case MC_BIGCL: | |
5164 | case MC_16KCL: | |
5165 | /* | |
5166 | * Same as above, but only return the first element. | |
5167 | */ | |
5168 | mca = mclaudit[ix].cl_audit[0]; | |
5169 | break; | |
5170 | ||
5171 | default: | |
5172 | VERIFY(0); | |
5173 | /* NOTREACHED */ | |
5174 | } | |
5175 | ||
5176 | return (mca); | |
5177 | } | |
5178 | ||
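The comment in mcl_audit_buf2mca() above describes a two-step mapping: the cluster index locates the cluster, and the mbuf's position within that cluster selects the audit slot. The bare arithmetic, with made-up sizes standing in for MSIZE and the cluster size:

/* Arithmetic behind "cluster index, then mbuf index within the cluster". */
#include <stdio.h>
#include <stdint.h>

#define OBJSZ  256u                            /* stand-in for MSIZE */
#define CLSZ  2048u                            /* stand-in for the cluster size */

int
main(void)
{
    uintptr_t base = 0x100000;                 /* start of the cluster map */
    uintptr_t obj = base + 3 * CLSZ + 5 * OBJSZ;   /* some mbuf address */

    unsigned int clix = (obj - base) / CLSZ;       /* which cluster */
    uintptr_t clbase = base + (uintptr_t)clix * CLSZ;
    unsigned int mbix = (obj - clbase) / OBJSZ;    /* which mbuf within it */

    printf("cluster %u, mbuf %u\n", clix, mbix);   /* cluster 3, mbuf 5 */
    return (0);
}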
5179 | static void | |
5180 | mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite, | |
5181 | boolean_t alloc) | |
5182 | { | |
5183 | struct mbuf *m = addr; | |
5184 | mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next; | |
5185 | ||
5186 | VERIFY(mca->mca_contents != NULL && | |
5187 | mca->mca_contents_size == AUDIT_CONTENTS_SIZE); | |
5188 | ||
5189 | mcl_audit_verify_nextptr(next, mca); | |
5190 | ||
5191 | if (!alloc) { | |
5192 | /* Save constructed mbuf fields */ | |
5193 | mcl_audit_save_mbuf(m, mca); | |
5194 | mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF)); | |
5195 | ((mcache_obj_t *)m)->obj_next = next; | |
5196 | return; | |
5197 | } | |
5198 | ||
5199 | /* Check if the buffer has been corrupted while in freelist */ | |
5200 | mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); | |
5201 | ||
5202 | /* Restore constructed mbuf fields */ | |
5203 | mcl_audit_restore_mbuf(m, mca, composite); | |
5204 | } | |
5205 | ||
5206 | static void | |
5207 | mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite) | |
5208 | { | |
5209 | struct mbuf *ms = (struct mbuf *)mca->mca_contents; | |
5210 | ||
5211 | if (composite) { | |
5212 | struct mbuf *next = m->m_next; | |
5213 | VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL && | |
5214 | MBUF_IS_COMPOSITE(ms)); | |
5215 | /* | |
5216 | * We could have hand-picked the mbuf fields and restore | |
5217 | * them individually, but that will be a maintenance | |
5218 | * headache. Instead, restore everything that was saved; | |
5219 | * the mbuf layer will recheck and reinitialize anyway. | |
5220 | */ | |
5221 | bcopy(ms, m, mca->mca_contents_size); | |
5222 | m->m_next = next; | |
5223 | } else { | |
5224 | /* | |
5225 | * For a regular mbuf (no cluster attached) there's nothing | |
5226 | * to restore other than the type field, which is expected | |
5227 | * to be MT_FREE. | |
5228 | */ | |
5229 | m->m_type = ms->m_type; | |
5230 | } | |
5231 | _MCHECK(m); | |
5232 | } | |
5233 | ||
5234 | static void | |
5235 | mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca) | |
5236 | { | |
5237 | _MCHECK(m); | |
5238 | bcopy(m, mca->mca_contents, mca->mca_contents_size); | |
5239 | } | |
5240 | ||
5241 | static void | |
5242 | mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc, | |
5243 | boolean_t save_next) | |
5244 | { | |
5245 | mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next; | |
5246 | ||
5247 | if (!alloc) { | |
5248 | mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); | |
5249 | if (save_next) { | |
5250 | mcl_audit_verify_nextptr(next, mca); | |
5251 | ((mcache_obj_t *)addr)->obj_next = next; | |
5252 | } | |
5253 | } else { | |
5254 | /* Check if the buffer has been corrupted while in freelist */ | |
5255 | mcl_audit_verify_nextptr(next, mca); | |
5256 | mcache_audit_free_verify_set(mca, addr, 0, size); | |
5257 | } | |
5258 | } | |
5259 | ||
5260 | static void | |
5261 | mcl_audit_mcheck_panic(struct mbuf *m) | |
5262 | { | |
5263 | mcache_audit_t *mca; | |
5264 | ||
5265 | MRANGE(m); | |
5266 | mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); | |
5267 | ||
5268 | panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n", | |
5269 | m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca)); | |
5270 | /* NOTREACHED */ | |
5271 | } | |
5272 | ||
5273 | static void | |
5274 | mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) | |
5275 | { | |
5276 | if (next != NULL && next != (void *)MCACHE_FREE_PATTERN && | |
5277 | !MBUF_IN_MAP(next)) { | |
5278 | panic("mcl_audit: buffer %p modified after free at offset 0: " | |
5279 | "%p out of range [%p-%p)\n%s\n", | |
5280 | mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca)); | |
5281 | /* NOTREACHED */ | |
5282 | } | |
5283 | } | |
5284 | ||
5285 | SYSCTL_DECL(_kern_ipc); | |
5286 | SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED, | |
5287 | 0, 0, mbstat_sysctl, "S,mbstat", ""); | |
5288 | SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED, | |
5289 | 0, 0, mb_stat_sysctl, "S,mb_stat", ""); | |
5290 | SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED, | |
5291 | &mb_normalized, 0, ""); |