Commit | Line | Data |
---|---|---|
1c79356b | 1 | /* |
f427ee49 | 2 | * Copyright (c) 2000-2020 Apple Inc. All rights reserved. |
1c79356b | 3 | * |
2d21ac55 | 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
0a7de745 | 5 | * |
2d21ac55 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
0a7de745 | 14 | * |
2d21ac55 A |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
0a7de745 | 17 | * |
2d21ac55 A |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
8f6c56a5 A |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
2d21ac55 A |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
0a7de745 | 25 | * |
2d21ac55 | 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
1c79356b A |
27 | */ |
28 | /* | |
29 | * @OSF_COPYRIGHT@ | |
30 | */ | |
0a7de745 | 31 | /* |
1c79356b A |
32 | * Mach Operating System |
33 | * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University | |
34 | * All Rights Reserved. | |
0a7de745 | 35 | * |
1c79356b A |
36 | * Permission to use, copy, modify and distribute this software and its |
37 | * documentation is hereby granted, provided that both the copyright | |
38 | * notice and this permission notice appear in all copies of the | |
39 | * software, derivative works or modified versions, and any portions | |
40 | * thereof, and that both notices appear in supporting documentation. | |
0a7de745 | 41 | * |
1c79356b A |
42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR | |
44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. | |
0a7de745 | 45 | * |
1c79356b | 46 | * Carnegie Mellon requests users of this software to return to |
0a7de745 | 47 | * |
1c79356b A |
48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
49 | * School of Computer Science | |
50 | * Carnegie Mellon University | |
51 | * Pittsburgh PA 15213-3890 | |
0a7de745 | 52 | * |
1c79356b A |
53 | * any improvements or extensions that they make and grant Carnegie Mellon |
54 | * the rights to redistribute these changes. | |
55 | */ | |
56 | /* | |
57 | */ | |
58 | /* | |
59 | * File: kern/zalloc.c | |
60 | * Author: Avadis Tevanian, Jr. | |
61 | * | |
62 | * Zone-based memory allocator. A zone is a collection of fixed size | |
63 | * data blocks for which quick allocation/deallocation is possible. | |
64 | */ | |
91447636 | 65 | |
f427ee49 | 66 | #define ZALLOC_ALLOW_DEPRECATED 1 |
c3c9b80d | 67 | #if !ZALLOC_TEST |
91447636 A |
68 | #include <mach/mach_types.h> |
69 | #include <mach/vm_param.h> | |
70 | #include <mach/kern_return.h> | |
71 | #include <mach/mach_host_server.h> | |
6d2010ae | 72 | #include <mach/task_server.h> |
91447636 | 73 | #include <mach/machine/vm_types.h> |
316670eb | 74 | #include <mach/vm_map.h> |
a39ff7e2 | 75 | #include <mach/sdt.h> |
91447636 | 76 | |
5ba3f43e | 77 | #include <kern/bits.h> |
f427ee49 | 78 | #include <kern/startup.h> |
91447636 | 79 | #include <kern/kern_types.h> |
1c79356b | 80 | #include <kern/assert.h> |
39037602 | 81 | #include <kern/backtrace.h> |
91447636 | 82 | #include <kern/host.h> |
1c79356b A |
83 | #include <kern/macro_help.h> |
84 | #include <kern/sched.h> | |
b0d623f7 | 85 | #include <kern/locks.h> |
1c79356b A |
86 | #include <kern/sched_prim.h> |
87 | #include <kern/misc_protos.h> | |
0b4e3aa0 | 88 | #include <kern/thread_call.h> |
f427ee49 | 89 | #include <kern/zalloc_internal.h> |
91447636 A |
90 | #include <kern/kalloc.h> |
91 | ||
5c9f4661 A |
92 | #include <prng/random.h> |
93 | ||
91447636 A |
94 | #include <vm/pmap.h> |
95 | #include <vm/vm_map.h> | |
1c79356b | 96 | #include <vm/vm_kern.h> |
91447636 | 97 | #include <vm/vm_page.h> |
c3c9b80d | 98 | #include <vm/vm_pageout.h> |
f427ee49 | 99 | #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */ |
91447636 | 100 | |
316670eb A |
101 | #include <pexpert/pexpert.h> |
102 | ||
1c79356b | 103 | #include <machine/machparam.h> |
39236c6e | 104 | #include <machine/machine_routines.h> /* ml_cpu_get_info */ |
1c79356b | 105 | |
f427ee49 A |
106 | #include <os/atomic.h> |
107 | ||
2d21ac55 | 108 | #include <libkern/OSDebug.h> |
7ddcb079 | 109 | #include <libkern/OSAtomic.h> |
d9a64523 | 110 | #include <libkern/section_keywords.h> |
2d21ac55 A |
111 | #include <sys/kdebug.h> |
112 | ||
5ba3f43e A |
113 | #include <san/kasan.h> |
114 | ||
f427ee49 | 115 | #if KASAN_ZALLOC |
c3c9b80d A |
116 | /* |
117 | * Set to 0 to debug poisoning and ZC_ZFREE_CLEARMEM validation under kasan. | |
118 | * Otherwise they are double-duty with what kasan already does. | |
119 | */ | |
120 | #define ZALLOC_ENABLE_POISONING 0 | |
f427ee49 A |
121 | #define ZONE_ENABLE_LOGGING 0 |
122 | #elif DEBUG || DEVELOPMENT | |
c3c9b80d | 123 | #define ZALLOC_ENABLE_POISONING 1 |
f427ee49 A |
124 | #define ZONE_ENABLE_LOGGING 1 |
125 | #else | |
c3c9b80d | 126 | #define ZALLOC_ENABLE_POISONING 1 |
f427ee49 A |
127 | #define ZONE_ENABLE_LOGGING 0 |
128 | #endif | |
129 | ||
c3c9b80d A |
130 | #if __LP64__ |
131 | #define ZALLOC_EARLY_GAPS 1 | |
132 | #else | |
133 | #define ZALLOC_EARLY_GAPS 0 | |
134 | #endif | |
135 | ||
136 | #if DEBUG | |
137 | #define z_debug_assert(expr) assert(expr) | |
138 | #else | |
139 | #define z_debug_assert(expr) (void)(expr) | |
140 | #endif | |
141 | ||
f427ee49 A |
142 | extern void vm_pageout_garbage_collect(int collect); |
143 | ||
144 | /* Returns pid of the task with the largest number of VM map entries. */ | |
145 | extern pid_t find_largest_process_vm_map_entries(void); | |
146 | ||
147 | /* | |
148 | * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills. | |
149 | * For any other pid we try to kill that process synchronously. | |
150 | */ | |
151 | extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid); | |
152 | ||
153 | extern zone_t vm_map_entry_zone; | |
154 | extern zone_t vm_object_zone; | |
f427ee49 | 155 | |
c3c9b80d | 156 | #define ZONE_MIN_ELEM_SIZE sizeof(uint64_t) |
f427ee49 A |
157 | #define ZONE_MAX_ALLOC_SIZE (32 * 1024) |
158 | ||
f427ee49 A |
159 | struct zone_page_metadata { |
160 | /* The index of the zone this metadata page belongs to */ | |
c3c9b80d | 161 | zone_id_t zm_index : 11; |
f427ee49 | 162 | |
c3c9b80d A |
163 | /* Whether `zm_bitmap` is an inline bitmap or a packed bitmap reference */ |
164 | uint16_t zm_inline_bitmap : 1; | |
f427ee49 A |
165 | |
166 | /* | |
c3c9b80d A |
167 | * Zones allocate in "chunks" of zone_t::z_chunk_pages consecutive |
168 | * pages, or zpercpu_count() pages if the zone is percpu. | |
f427ee49 | 169 | * |
c3c9b80d A |
170 | * The first page of it has its metadata set with: |
171 | * - 0 if none of the pages are currently wired | |
172 | * - the number of wired pages in the chunk (not scaled for percpu). | |
f427ee49 | 173 | * |
c3c9b80d A |
174 | * Other pages in the chunk have their zm_chunk_len set to |
175 | * ZM_SECONDARY_PAGE or ZM_SECONDARY_PCPU_PAGE depending on whether | |
176 | * the zone is percpu or not. For those, zm_page_index holds the | |
177 | * index of that page in the run. | |
f427ee49 | 178 | */ |
c3c9b80d A |
179 | uint16_t zm_chunk_len : 4; |
180 | #define ZM_CHUNK_LEN_MAX 0x8 | |
181 | #define ZM_SECONDARY_PAGE 0xe | |
182 | #define ZM_SECONDARY_PCPU_PAGE 0xf | |
183 | ||
184 | union { | |
185 | #define ZM_ALLOC_SIZE_LOCK 1u | |
186 | uint16_t zm_alloc_size; /* first page only */ | |
187 | uint16_t zm_page_index; /* secondary pages only */ | |
188 | }; | |
189 | union { | |
190 | uint32_t zm_bitmap; /* most zones */ | |
191 | uint32_t zm_bump; /* permanent zones */ | |
192 | }; | |
f427ee49 A |
193 | |
194 | zone_pva_t zm_page_next; | |
195 | zone_pva_t zm_page_prev; | |
f427ee49 | 196 | }; |
c3c9b80d | 197 | static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing"); |
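/*
 * Illustrative sketch (hypothetical helper, not used elsewhere in this file)
 * of how the chunk layout described above is navigated: secondary pages
 * record their position in the run, so the chunk's head metadata is always
 * a fixed number of entries back.
 */
static inline struct zone_page_metadata *
zone_meta_chunk_head_sketch(struct zone_page_metadata *meta)
{
	if (meta->zm_chunk_len == ZM_SECONDARY_PAGE ||
	    meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) {
		/* walk back zm_page_index entries to the chunk's first page */
		meta -= meta->zm_page_index;
	}
	return meta;
}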
f427ee49 | 198 | |
c3c9b80d A |
199 | __enum_closed_decl(zone_addr_kind_t, bool, { |
200 | ZONE_ADDR_FOREIGN, | |
201 | ZONE_ADDR_NATIVE, | |
202 | }); | |
203 | #define ZONE_ADDR_KIND_COUNT 2 | |
f427ee49 | 204 | |
c3c9b80d A |
205 | /*! |
206 | * @typedef zone_element_t | |
207 | * | |
208 | * @brief | |
209 | * Type that represents a "resolved" zone element. | |
210 | * | |
211 | * @description | |
212 | * This type encodes an element pointer as a tuple of: | |
213 | * { chunk base, element index, element protection }. | |
214 | * | |
215 | * The chunk base is extracted with @c trunc_page() | |
216 | * as it is always page aligned, and occupies the bits above @c PAGE_SHIFT. | |
217 | * | |
218 | * The low two bits encode the protection mode (see @c zprot_mode_t). | |
219 | * | |
220 | * The other bits encode the element index in the chunk rather than its address. | |
221 | */ | |
222 | typedef struct zone_element { | |
223 | vm_offset_t ze_value; | |
224 | } zone_element_t; | |
f427ee49 | 225 | |
c3c9b80d A |
226 | /*! |
227 | * @typedef zone_magazine_t | |
228 | * | |
229 | * @brief | |
230 | * Magazine of cached allocations. | |
231 | * | |
232 | * @field zm_cur how many elements this magazine holds (unused while loaded). | |
233 | * @field zm_link linkage used by magazine depots. | |
234 | * @field zm_elems an array of @c zc_mag_size() elements. | |
235 | */ | |
236 | typedef struct zone_magazine { | |
237 | uint16_t zm_cur; | |
238 | STAILQ_ENTRY(zone_magazine) zm_link; | |
239 | zone_element_t zm_elems[0]; | |
240 | } *zone_magazine_t; | |
241 | ||
242 | /*! | |
243 | * @typedef zone_cache_t | |
244 | * | |
245 | * @brief | |
246 | * Per-CPU layer of the zone caching system. |
247 | * | |
248 | * @discussion | |
249 | * Below is a diagram of the caching system. This design is inspired by the | |
250 | * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and | |
251 | * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams and the FreeBSD UMA | |
252 | * zone allocator (itself derived from this seminal work). | |
253 | * | |
254 | * It is divided into 3 layers: | |
255 | * - the per-cpu layer, | |
256 | * - the recirculation depot layer, | |
257 | * - the Zone Allocator. | |
258 | * | |
259 | * The per-cpu and recirculation depot layers use magazines (@c zone_magazine_t), |
260 | * which are stacks of up to @c zc_mag_size() elements. | |
261 | * | |
262 | * <h2>CPU layer</h2> | |
263 | * | |
264 | * The CPU layer (@c zone_cache_t) looks like this: | |
265 | * | |
266 | * ╭─ a ─ f ─┬───────── zm_depot ──────────╮ |
267 | * │ ╭─╮ ╭─╮ │ ╭─╮ ╭─╮ ╭─╮ ╭─╮ ╭─╮ │ |
268 | * │ │#│ │#│ │ │#│ │#│ │#│ │#│ │#│ │ | |
269 | * │ │#│ │ │ │ │#│ │#│ │#│ │#│ │#│ │ | |
270 | * │ │ │ │ │ │ │#│ │#│ │#│ │#│ │#│ │ | |
271 | * │ ╰─╯ ╰─╯ │ ╰─╯ ╰─╯ ╰─╯ ╰─╯ ╰─╯ │ | |
272 | * ╰─────────┴─────────────────────────────╯ | |
273 | * | |
274 | * It has two pre-loaded magazines (a)lloc and (f)ree which we allocate from, | |
275 | * or free to. Serialization is achieved through disabling preemption, and only | |
276 | * the current CPU can access those allocations. This is represented on the left |
277 | * hand side of the diagram above. | |
278 | * | |
279 | * The right hand side is the per-cpu depot. It consists of @c zm_depot_count | |
280 | * full magazines, and is protected by the @c zm_depot_lock for access. | |
281 | * The lock is expected to absolutely never be contended, as only the local CPU | |
282 | * tends to access the local per-cpu depot in regular operation mode. | |
283 | * | |
284 | * However unlike UMA, our implementation allows for the zone GC to reclaim | |
285 | * per-CPU magazines aggressively, which is serialized with the @c zm_depot_lock. |
286 | * | |
287 | * | |
288 | * <h2>Recirculation Depot</h2> | |
289 | * | |
290 | * The recirculation depot layer is a list similar to the per-cpu depot, | |
291 | * however it is different in two fundamental ways: | |
292 | * | |
293 | * - it is protected by the regular zone lock, | |
294 | * - elements referenced by the magazines in that layer appear free | |
295 | * to the zone layer. | |
296 | * | |
297 | * | |
298 | * <h2>Magazine circulation and sizing</h2> | |
299 | * | |
300 | * The caching system sizes itself dynamically. Operations that allocate/free | |
301 | * a single element call @c zone_lock_nopreempt_check_contention() which records | |
302 | * contention on the lock by doing a trylock and recording its success. | |
303 | * | |
304 | * This information is stored in the @c z_contention_cur field of the zone, | |
305 | * and a windowed moving average is maintained in @c z_contention_wma. |
306 | * Each time a CPU registers any contention, it will also allow its own per-cpu | |
307 | * cache to grow, incrementing @c zc_depot_max, which is how the per-cpu layer | |
308 | * might grow into using its local depot. | |
309 | * | |
310 | * Note that @c zc_depot_max assumes that the (a) and (f) pre-loaded magazines |
311 | * on average contain @c zc_mag_size() elements. | |
312 | * | |
313 | * When a per-cpu layer cannot hold more full magazines in its depot, | |
314 | * then it will overflow about 1/3 of its depot into the recirculation depot | |
315 | * (see @c zfree_cached_slow()). Conversely, when a depot is empty, then it will |
316 | * refill its per-cpu depot to about 1/3 of its size from the recirculation | |
317 | * depot (see @c zalloc_cached_slow()). | |
318 | * | |
319 | * Lastly, the zone layer keeps track of the high and low watermark of how many | |
320 | * elements have been free per period of time (including being part of the | |
321 | * recirculation depot) in the @c z_elems_free_min and @c z_elems_free_max | |
322 | * fields. A weighted moving average of the amplitude of this is maintained in | |
323 | * the @c z_elems_free_wss which informs the zone GC on how to gently trim | |
324 | * zones without hurting performance. | |
325 | * | |
326 | * | |
327 | * <h2>Security considerations</h2> | |
328 | * | |
329 | * The zone caching layer has been designed to avoid returning elements in | |
330 | * a strict LIFO behavior: @c zalloc() will allocate from the (a) magazine, | |
331 | * and @c zfree() free to the (f) magazine, and only swap them when the | |
332 | * requested operation cannot be fulfilled. | |
333 | * | |
334 | * The per-cpu overflow depot or the recirculation depots are similarly used | |
335 | * in FIFO order. | |
336 | * | |
337 | * More importantly, when magazines flow through the recirculation depot, | |
338 | * the elements they contain are marked as "free" in the zone layer bitmaps. | |
339 | * Because allocations out of per-cpu caches verify the bitmaps at allocation | |
340 | * time, this acts as a poor man's double-free quarantine. The magazines | |
341 | * allow us to avoid the cost of the bit-scanning involved in the zone-level |
342 | * @c zalloc_item() codepath. | |
343 | * | |
344 | * | |
345 | * @field zc_alloc_cur denormalized number of elements in the (a) magazine | |
346 | * @field zc_free_cur denormalized number of elements in the (f) magazine | |
347 | * @field zc_alloc_elems a pointer to the array of elements in (a) | |
348 | * @field zc_free_elems a pointer to the array of elements in (f) | |
349 | * | |
350 | * @field zc_depot_lock a lock to access @c zc_depot, @c zc_depot_cur. | |
351 | * @field zc_depot a list of @c zc_depot_cur full magazines | |
352 | * @field zc_depot_cur number of magazines in @c zc_depot | |
353 | * @field zc_depot_max the maximum number of elements in @c zc_depot, | |
354 | * protected by the zone lock. | |
355 | */ | |
356 | typedef struct zone_cache { | |
357 | uint16_t zc_alloc_cur; | |
358 | uint16_t zc_free_cur; | |
359 | uint16_t zc_depot_cur; | |
360 | uint16_t __zc_padding; | |
361 | zone_element_t *zc_alloc_elems; | |
362 | zone_element_t *zc_free_elems; | |
363 | hw_lock_bit_t zc_depot_lock; | |
364 | uint32_t zc_depot_max; | |
365 | struct zone_depot zc_depot; | |
366 | } *zone_cache_t; | |
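/*
 * Illustrative sketch (hypothetical helper, not the actual fast path) of the
 * (a)/(f) magazine discipline described above: allocate from (a), and when it
 * runs dry, swap it with (f) before falling back to the depots. The real
 * zalloc()/zfree() fast paths also manage preemption, statistics and the
 * per-cpu depot, which this sketch leaves out.
 */
static inline bool
zone_cache_alloc_sketch(zone_cache_t zc, zone_element_t *out)
{
	if (zc->zc_alloc_cur == 0) {
		if (zc->zc_free_cur == 0) {
			return false;   /* fall back to the depot / zone layer */
		}
		/* swap the (a) and (f) magazines instead of refilling */
		zone_element_t *elems = zc->zc_alloc_elems;
		uint16_t cur = zc->zc_alloc_cur;

		zc->zc_alloc_elems = zc->zc_free_elems;
		zc->zc_free_elems = elems;
		zc->zc_alloc_cur = zc->zc_free_cur;
		zc->zc_free_cur = cur;
	}
	*out = zc->zc_alloc_elems[--zc->zc_alloc_cur];
	return true;
}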
f427ee49 A |
367 | |
368 | static __security_const_late struct { | |
c3c9b80d A |
369 | struct zone_map_range zi_map_range[ZONE_ADDR_KIND_COUNT]; |
370 | struct zone_map_range zi_meta_range; /* debugging only */ | |
371 | struct zone_map_range zi_bits_range; /* bits buddy allocator */ | |
f427ee49 A |
372 | |
373 | /* | |
374 | * The metadata lives within the zi_meta_range address range. | |
375 | * | |
376 | * The correct formula to find a metadata index is: | |
c3c9b80d | 377 | * absolute_page_index - page_index(MIN(zi_map_range[*].min_address)) |
f427ee49 A |
378 | * |
379 | * And then this index is used to dereference zi_meta_range.min_address | |
380 | * as a `struct zone_page_metadata` array. | |
381 | * | |
382 | * To avoid doing that subtraction all the time in the various fast-paths, |
c3c9b80d A |
383 | * zi_meta_base is pre-offset with that minimum page index to avoid redoing |
384 | * that math all the time. | |
385 | * | |
386 | * Do note that the array might have a hole punched in the middle, | |
387 | * see zone_metadata_init(). | |
f427ee49 | 388 | */ |
c3c9b80d | 389 | struct zone_page_metadata *zi_meta_base; |
f427ee49 A |
390 | } zone_info; |
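/*
 * Worked note on the pre-biasing described above (illustrative only):
 * because zi_meta_base is already offset by the minimum page index of the
 * zone map ranges, indexing it with a raw page number is equivalent to the
 * documented formula, i.e.
 *
 *   &zi_meta_base[atop(addr)]
 *   == (struct zone_page_metadata *)zi_meta_range.min_address
 *      + (atop(addr) - min_page_index)
 *
 * which is what zone_pva_to_meta() / zone_meta_from_addr() rely on below.
 */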
391 | ||
c3c9b80d A |
392 | /* |
393 | * Initial array of metadata for stolen memory. | |
394 | * | |
395 | * The numbers here have to be kept in sync with vm_map_steal_memory() | |
396 | * so that we have reserved enough metadata. | |
397 | * | |
398 | * After zone_init() has run (which happens while the kernel is still single | |
399 | * threaded), the metadata is moved to its final dynamic location, and | |
400 | * this array is unmapped with the rest of __startup_data at lockdown. | |
401 | */ | |
402 | #if CONFIG_GZALLOC | |
403 | #define ZONE_FOREIGN_META_INLINE_COUNT 20032 | |
404 | #else | |
405 | #define ZONE_FOREIGN_META_INLINE_COUNT 64 | |
406 | #endif | |
407 | __startup_data | |
408 | static struct zone_page_metadata | |
409 | zone_foreign_meta_array_startup[ZONE_FOREIGN_META_INLINE_COUNT]; | |
410 | ||
0a7de745 A |
411 | /* |
412 | * The zone_locks_grp allows for collecting lock statistics. | |
413 | * All locks are associated to this group in zinit. | |
414 | * Look at tools/lockstat for debugging lock contention. | |
415 | */ | |
c3c9b80d A |
416 | static LCK_GRP_DECLARE(zone_locks_grp, "zone_locks"); |
417 | static LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp); | |
f427ee49 A |
418 | |
419 | /* | |
420 | * Exclude more than one concurrent garbage collection | |
421 | */ | |
c3c9b80d A |
422 | static LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc"); |
423 | static LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp); | |
0a7de745 | 424 | |
c3c9b80d | 425 | bool panic_include_zprint = FALSE; |
f427ee49 A |
426 | mach_memory_info_t *panic_kext_memory_info = NULL; |
427 | vm_size_t panic_kext_memory_size = 0; | |
0a7de745 | 428 | |
39236c6e | 429 | /* |
f427ee49 A |
430 | * Protects zone_array, num_zones, num_zones_in_use, and |
431 | * zone_destroyed_bitmap | |
39236c6e | 432 | */ |
f427ee49 | 433 | static SIMPLE_LOCK_DECLARE(all_zones_lock, 0); |
c3c9b80d A |
434 | static zone_id_t num_zones_in_use; |
435 | zone_id_t _Atomic num_zones; | |
f427ee49 | 436 | SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count; |
39236c6e | 437 | |
f427ee49 A |
438 | #if KASAN_ZALLOC |
439 | #define MAX_ZONES 566 | |
440 | #else /* !KASAN_ZALLOC */ | |
441 | #define MAX_ZONES 402 | |
442 | #endif/* !KASAN_ZALLOC */ | |
c3c9b80d A |
443 | |
444 | /* | |
445 | * Initial globals for zone stats until we can allocate the real ones. | |
446 | * Those get migrated inside the per-CPU ones during zone_init() and | |
447 | * this array is unmapped with the rest of __startup_data at lockdown. | |
448 | */ | |
449 | ||
450 | /* zone to allocate zone_magazine structs from */ | |
451 | static SECURITY_READ_ONLY_LATE(zone_t) zc_magazine_zone; | |
452 | /* | |
453 | * Until pid1 is made, zone caching is off; it stays disabled |
454 | * until compute_zone_working_set_size() runs for the first time. |
455 | * | |
456 | * -1 represents the "never enabled yet" value. | |
457 | */ | |
458 | static int8_t zone_caching_disabled = -1; | |
459 | ||
460 | __startup_data | |
461 | static struct zone_cache zone_cache_startup[MAX_ZONES]; | |
462 | __startup_data | |
463 | static struct zone_stats zone_stats_startup[MAX_ZONES]; | |
464 | struct zone zone_array[MAX_ZONES]; | |
f427ee49 A |
465 | |
466 | /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */ | |
467 | static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count; | |
468 | ||
469 | /* Used to keep track of destroyed slots in the zone_array */ | |
470 | static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)]; | |
471 | ||
f427ee49 A |
472 | /* number of zone mapped pages used by all zones */ |
473 | static long _Atomic zones_phys_page_mapped_count; | |
474 | ||
f427ee49 A |
475 | /* |
476 | * Turn ZSECURITY_OPTIONS_STRICT_IOKIT_FREE off on x86 so as not | |
477 | * to break third party kexts that haven't yet been recompiled |
478 | * to use the new iokit macros. | |
479 | */ | |
480 | #if XNU_TARGET_OS_OSX && __x86_64__ | |
481 | #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT 0 | |
482 | #else | |
483 | #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT \ | |
484 | ZSECURITY_OPTIONS_STRICT_IOKIT_FREE | |
485 | #endif | |
486 | ||
487 | #define ZSECURITY_DEFAULT ( \ | |
2a1bd2d3 | 488 | ZSECURITY_OPTIONS_SEQUESTER | \ |
f427ee49 A |
489 | ZSECURITY_OPTIONS_SUBMAP_USER_DATA | \ |
490 | ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC | \ | |
491 | ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT | \ | |
492 | 0) | |
493 | TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT); | |
494 | ||
495 | #if VM_MAX_TAG_ZONES | |
496 | /* enable tags for zones that ask for it */ | |
c3c9b80d | 497 | static TUNABLE(bool, zone_tagging_on, "-zt", false); |
f427ee49 A |
498 | #endif /* VM_MAX_TAG_ZONES */ |
499 | ||
500 | #if DEBUG || DEVELOPMENT | |
501 | TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false); | |
f427ee49 A |
502 | #endif /* DEBUG || DEVELOPMENT */ |
503 | #if CONFIG_ZLEAKS | |
504 | /* Making pointer scanning leaks detection possible for all zones */ | |
c3c9b80d | 505 | static TUNABLE(bool, zone_leaks_scan_enable, "-zl", false); |
f427ee49 A |
506 | #else |
507 | #define zone_leaks_scan_enable false | |
508 | #endif | |
509 | ||
c3c9b80d | 510 | /*! @enum zprot_mode_t |
f427ee49 | 511 | * |
c3c9b80d A |
512 | * @brief |
513 | * Zone element corruption detection mode. | |
c910b4d9 | 514 | * |
c3c9b80d | 515 | * @discussion |
f427ee49 | 516 | * We use four techniques to detect modification of a zone element |
39236c6e | 517 | * after it's been freed. |
316670eb | 518 | * |
c3c9b80d A |
519 | * Elements that are in zones can be in 3 possible states: |
520 | * - zeroed out (@c ZPM_ZERO) | |
521 | * - poisoned (@c ZPM_POISON) with the @c ZONE_POISON pattern | |
522 | * - with a left and right canary (@c ZPM_CANARY). | |
523 | * | |
524 | * @c ZPM_AUTO is used when the actual protection for the element is unknown, | |
525 | * and will be detected by looking at the last word of the allocation at validation |
526 | * time. | |
527 | * | |
528 | * The mode of an element in zones is discovered by looking at its last | |
529 | * pointer-sized value: | |
530 | * - 0 means that it is zeroed out | |
531 | * - @c ZONE_POISON means it is poisoned | |
532 | * - any other value means it is using canaries. | |
533 | * | |
534 | * Elements are zeroed if: | |
535 | * - the element size is smaller than @c zp_min_size, | |
536 | * - the owning zone has the @c z_free_zeroes flag set, | |
537 | * - the chunk backing store is fresh (and was just allocated). | |
538 | * | |
539 | * Elements are poisoned periodically for every N frees (counted per-zone), | |
540 | * if the elements aren't otherwise zeroed out. | |
f427ee49 A |
541 | * If -zp is passed as a boot arg, poisoning occurs for every free. |
542 | * | |
c3c9b80d A |
543 | * Else elements use canaries. When canaries are used, the first and last |
544 | * pointer sized values in the allocation are set to values derived from the | |
545 | * element address and the @c zp_canary nonce. The first @c zp_min_size | |
546 | * bytes of the element are also cleared. |
c910b4d9 | 547 | * |
39236c6e A |
548 | * Performance slowdown is inversely proportional to the frequency of poisoning, |
549 | * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32 | |
550 | * and higher. You can expect to find a 100% reproducible bug in an average of | |
551 | * N tries, with a standard deviation of about N, but you will want to set | |
552 | * "-zp" to always poison every free if you are attempting to reproduce | |
553 | * a known bug. | |
316670eb | 554 | * |
39236c6e A |
555 | * For a more heavyweight, but finer-grained method of detecting misuse |
556 | * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c. | |
316670eb | 557 | */ |
c3c9b80d A |
558 | __enum_closed_decl(zprot_mode_t, vm_offset_t, { |
559 | ZPM_AUTO, /* element is indeterminate */ | |
560 | ZPM_ZERO, /* element is zeroed */ | |
561 | ZPM_POISON, /* element is poisoned */ | |
562 | ZPM_CANARY, /* element extremities have a canary */ | |
563 | }); | |
564 | #define ZPM_MASK ((zprot_mode_t)0x3) | |
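/*
 * Illustrative sketch (hypothetical helper) of how a ZPM_AUTO element's
 * effective protection is resolved from its last pointer-sized word, per the
 * rules above (0, the ZONE_POISON pattern, or anything else). The validation
 * code later in this file is authoritative.
 */
static inline zprot_mode_t
zprot_mode_resolve_sketch(vm_offset_t elem, vm_size_t esize)
{
	vm_offset_t last = *(vm_offset_t *)(elem + esize - sizeof(vm_offset_t));

	if (last == 0) {
		return ZPM_ZERO;        /* element was zeroed on free */
	}
	if (last == ZONE_POISON) {
		return ZPM_POISON;      /* element carries the poison pattern */
	}
	return ZPM_CANARY;              /* anything else implies canaries */
}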
316670eb | 565 | |
fe8ab488 | 566 | |
f427ee49 A |
567 | /* |
568 | * set by zp-factor=N boot arg | |
569 | * | |
570 | * A zp_factor of 0 indicates zone poisoning is disabled and can also be set by | |
571 | * passing the -no-zp boot-arg. | |
572 | * | |
573 | * A zp_factor of 1 indicates zone poisoning is on for all elements and can be | |
574 | * set by passing the -zp boot-arg. | |
575 | */ | |
c3c9b80d | 576 | static TUNABLE(uint32_t, zp_factor, "zp-factor", 16); |
fe8ab488 | 577 | |
f427ee49 | 578 | /* set by zp-scale=N boot arg, scales zp_factor by zone size */ |
c3c9b80d | 579 | static TUNABLE(uint32_t, zp_scale, "zp-scale", 4); |
316670eb | 580 | |
39236c6e | 581 | /* |
c3c9b80d A |
582 | * Zone caching tunables |
583 | * | |
584 | * zc_mag_size(): | |
585 | * size of magazines, larger to reduce contention at the expense of memory | |
586 | * | |
587 | * zc_auto_enable_threshold | |
588 | * number of contentions per second after which zone caching engages | |
589 | * automatically. | |
590 | * | |
591 | * 0 to disable. | |
592 | * | |
593 | * zc_grow_threshold | |
594 | * number of contentions per second after which the per-cpu depot layer |
595 | * grows at each newly observed contention without restriction. | |
596 | * | |
597 | * 0 to disable. | |
598 | * | |
599 | * zc_recirc_denom | |
600 | * denominator of the fraction of per-cpu depot to migrate to/from | |
601 | * the recirculation depot layer at a time. Default 3 (1/3). | |
602 | * | |
603 | * zc_defrag_ratio | |
604 | * percentage of the working set to recirc size below which | |
605 | * the zone is defragmented. Default is 50%. | |
606 | * | |
607 | * zc_free_batch_size | |
608 | * The size of batches of frees/reclaim that can be done keeping | |
609 | * the zone lock held (and preemption disabled). | |
610 | */ | |
611 | static TUNABLE(uint16_t, zc_magazine_size, "zc_mag_size", 8); |
612 | static TUNABLE(uint32_t, zc_auto_threshold, "zc_auto_enable_threshold", 20); | |
613 | static TUNABLE(uint32_t, zc_grow_threshold, "zc_grow_threshold", 8); | |
614 | static TUNABLE(uint32_t, zc_recirc_denom, "zc_recirc_denom", 3); | |
615 | static TUNABLE(uint32_t, zc_defrag_ratio, "zc_defrag_ratio", 50); | |
616 | static TUNABLE(uint32_t, zc_free_batch_size, "zc_free_batch_size", 1024); | |
617 | ||
618 | static SECURITY_READ_ONLY_LATE(uintptr_t) zp_canary; | |
619 | /* | |
620 | * Perf results for zeroing all non-data zones and 2K of data zones |
621 | * showed little regression, so zp_min_size defaults to 2048. |
316670eb | 622 | */ |
c3c9b80d A |
623 | static TUNABLE(uint32_t, zp_min_size, "zclear_size", 2048); |
624 | static SECURITY_READ_ONLY_LATE(uint32_t) zone_phys_mapped_max_pages; | |
625 | static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT]; | |
626 | static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx; | |
39236c6e | 627 | |
c3c9b80d | 628 | static zone_t zone_find_largest(void); |
39236c6e | 629 | |
c3c9b80d | 630 | #endif /* !ZALLOC_TEST */ |
f427ee49 | 631 | #pragma mark Zone metadata |
c3c9b80d | 632 | #if !ZALLOC_TEST |
39236c6e | 633 | |
f427ee49 A |
634 | static inline zone_id_t |
635 | zone_index(zone_t z) | |
636 | { | |
637 | return (zone_id_t)(z - zone_array); | |
638 | } | |
39236c6e | 639 | |
f427ee49 A |
640 | static inline bool |
641 | zone_has_index(zone_t z, zone_id_t zid) | |
642 | { | |
643 | return zone_array + zid == z; | |
644 | } | |
316670eb | 645 | |
c3c9b80d A |
646 | static zone_element_t |
647 | zone_element_encode(vm_offset_t base, vm_offset_t eidx, zprot_mode_t zpm) | |
f427ee49 | 648 | { |
c3c9b80d A |
649 | return (zone_element_t){ .ze_value = base | (eidx << 2) | zpm }; |
650 | } | |
651 | ||
652 | static vm_offset_t | |
653 | zone_element_base(zone_element_t ze) | |
654 | { | |
655 | return trunc_page(ze.ze_value); | |
656 | } | |
657 | ||
658 | static vm_offset_t | |
659 | zone_element_idx(zone_element_t ze) | |
660 | { | |
661 | return (ze.ze_value & PAGE_MASK) >> 2; | |
662 | } | |
663 | ||
664 | #if ZALLOC_ENABLE_POISONING | |
665 | static zprot_mode_t | |
666 | zone_element_prot(zone_element_t ze) | |
667 | { | |
668 | return (zprot_mode_t)(ze.ze_value & ZPM_MASK); | |
669 | } | |
670 | #endif | |
671 | ||
672 | static vm_offset_t | |
673 | zone_element_addr(zone_element_t ze, vm_offset_t esize) | |
674 | { | |
675 | return zone_element_base(ze) + esize * zone_element_idx(ze); | |
f427ee49 | 676 | } |
5c9f4661 | 677 | |
f427ee49 A |
678 | __abortlike |
679 | static void | |
680 | zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta, | |
681 | const char *kind) | |
39236c6e | 682 | { |
f427ee49 A |
683 | panic("zone metadata corruption: %s (meta %p, zone %s%s)", |
684 | kind, meta, zone_heap_name(zone), zone->z_name); | |
685 | } | |
39236c6e | 686 | |
f427ee49 A |
687 | __abortlike |
688 | static void | |
689 | zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr) | |
690 | { | |
691 | panic("zone element pointer validation failed (addr: %p, zone %s%s)", | |
692 | (void *)addr, zone_heap_name(zone), zone->z_name); | |
693 | } | |
39236c6e | 694 | |
c3c9b80d A |
695 | __abortlike |
696 | static void | |
697 | zone_invalid_element_panic(zone_t zone, zone_element_t ze) | |
698 | { | |
699 | panic("zone element pointer validation failed (elem: %p,%d, zone %s%s)", | |
700 | (void *)zone_element_base(ze), (int)zone_element_idx(ze), | |
701 | zone_heap_name(zone), zone->z_name); | |
702 | } | |
703 | ||
f427ee49 A |
704 | __abortlike |
705 | static void | |
706 | zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr, | |
707 | struct zone_page_metadata *meta) | |
708 | { | |
709 | panic("%p not in the expected zone %s%s (%d != %d)", | |
710 | (void *)addr, zone_heap_name(zone), zone->z_name, | |
711 | meta->zm_index, zone_index(zone)); | |
712 | } | |
39236c6e | 713 | |
f427ee49 A |
714 | __abortlike |
715 | static void | |
716 | zone_page_metadata_native_queue_corruption(zone_t zone, zone_pva_t *queue) | |
717 | { | |
718 | panic("foreign metadata index %d enqueued in native head %p from zone %s%s", | |
719 | queue->packed_address, queue, zone_heap_name(zone), | |
720 | zone->z_name); | |
721 | } | |
39236c6e | 722 | |
f427ee49 A |
723 | __abortlike |
724 | static void | |
725 | zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta) | |
726 | { | |
727 | panic("metadata list corruption through element %p detected in zone %s%s", | |
728 | meta, zone_heap_name(zone), zone->z_name); | |
729 | } | |
39236c6e | 730 | |
f427ee49 A |
731 | __abortlike __unused |
732 | static void | |
733 | zone_invalid_foreign_addr_panic(zone_t zone, vm_offset_t addr) | |
734 | { | |
735 | panic("addr %p being freed to foreign zone %s%s not from foreign range", | |
736 | (void *)addr, zone_heap_name(zone), zone->z_name); | |
737 | } | |
39236c6e | 738 | |
f427ee49 A |
739 | __abortlike |
740 | static void | |
741 | zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta, | |
742 | const char *kind) | |
743 | { | |
744 | panic("accounting mismatch (%s) for zone %s%s, meta %p", kind, | |
745 | zone_heap_name(zone), zone->z_name, meta); | |
746 | } | |
39236c6e | 747 | |
c3c9b80d A |
748 | __abortlike |
749 | static void | |
750 | zone_meta_double_free_panic(zone_t zone, zone_element_t ze, const char *caller) | |
751 | { | |
752 | panic("%s: double free of %p to zone %s%s", caller, | |
753 | (void *)zone_element_addr(ze, zone_elem_size(zone)), | |
754 | zone_heap_name(zone), zone->z_name); | |
755 | } | |
756 | ||
f427ee49 A |
757 | __abortlike |
758 | static void | |
759 | zone_accounting_panic(zone_t zone, const char *kind) | |
760 | { | |
761 | panic("accounting mismatch (%s) for zone %s%s", kind, | |
762 | zone_heap_name(zone), zone->z_name); | |
763 | } | |
fe8ab488 | 764 | |
c3c9b80d A |
765 | #define zone_counter_sub(z, stat, value) ({ \ |
766 | if (os_sub_overflow((z)->stat, value, &(z)->stat)) { \ | |
767 | zone_accounting_panic(z, #stat " wrap-around"); \ | |
768 | } \ | |
769 | (z)->stat; \ | |
770 | }) | |
771 | ||
772 | static inline void | |
773 | zone_elems_free_add(zone_t z, uint32_t count) | |
774 | { | |
775 | uint32_t n = (z->z_elems_free += count); | |
776 | if (z->z_elems_free_max < n) { | |
777 | z->z_elems_free_max = n; | |
778 | } | |
779 | } | |
780 | ||
781 | static inline void | |
782 | zone_elems_free_sub(zone_t z, uint32_t count) | |
783 | { | |
784 | uint32_t n = zone_counter_sub(z, z_elems_free, count); | |
785 | ||
786 | if (z->z_elems_free_min > n) { | |
787 | z->z_elems_free_min = n; | |
788 | } | |
789 | } | |
790 | ||
791 | static inline uint16_t | |
792 | zone_meta_alloc_size_add(zone_t z, struct zone_page_metadata *m, | |
793 | vm_offset_t esize) | |
794 | { | |
795 | if (os_add_overflow(m->zm_alloc_size, (uint16_t)esize, &m->zm_alloc_size)) { | |
796 | zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around"); | |
797 | } | |
798 | return m->zm_alloc_size; | |
799 | } | |
800 | ||
801 | static inline uint16_t | |
802 | zone_meta_alloc_size_sub(zone_t z, struct zone_page_metadata *m, | |
803 | vm_offset_t esize) | |
804 | { | |
805 | if (os_sub_overflow(m->zm_alloc_size, esize, &m->zm_alloc_size)) { | |
806 | zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around"); | |
807 | } | |
808 | return m->zm_alloc_size; | |
809 | } | |
810 | ||
f427ee49 A |
811 | __abortlike |
812 | static void | |
813 | zone_nofail_panic(zone_t zone) | |
814 | { | |
815 | panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)", | |
816 | zone_heap_name(zone), zone->z_name); | |
817 | } | |
39236c6e | 818 | |
f427ee49 A |
819 | #if __arm64__ |
820 | // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to | |
821 | #define zone_range_load(r, rmin, rmax) \ | |
822 | asm("ldp %[rmin], %[rmax], [%[range]]" \ | |
823 | : [rmin] "=r"(rmin), [rmax] "=r"(rmax) \ | |
824 | : [range] "r"(r)) | |
825 | #else | |
826 | #define zone_range_load(r, rmin, rmax) \ | |
827 | ({ rmin = (r)->min_address; rmax = (r)->max_address; }) | |
39236c6e A |
828 | #endif |
829 | ||
f427ee49 A |
830 | __header_always_inline bool |
831 | zone_range_contains(const struct zone_map_range *r, vm_offset_t addr, vm_offset_t size) | |
832 | { | |
833 | vm_offset_t rmin, rmax; | |
39236c6e | 834 | |
39236c6e | 835 | /* |
f427ee49 A |
836 | * The `&` is not a typo: we really expect the check to pass, |
837 | * so encourage the compiler to eagerly load and test without branches | |
39236c6e | 838 | */ |
f427ee49 A |
839 | zone_range_load(r, rmin, rmax); |
840 | return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax); | |
841 | } | |
39236c6e | 842 | |
f427ee49 A |
843 | __header_always_inline vm_size_t |
844 | zone_range_size(const struct zone_map_range *r) | |
845 | { | |
846 | vm_offset_t rmin, rmax; | |
39236c6e | 847 | |
f427ee49 A |
848 | zone_range_load(r, rmin, rmax); |
849 | return rmax - rmin; | |
39236c6e A |
850 | } |
851 | ||
c3c9b80d A |
852 | #define from_zone_map(addr, size, kind) \ |
853 | zone_range_contains(&zone_info.zi_map_range[kind], \ | |
854 | (vm_offset_t)(addr), size) | |
39236c6e | 855 | |
c3c9b80d A |
856 | #define zone_native_size() \ |
857 | zone_range_size(&zone_info.zi_map_range[ZONE_ADDR_NATIVE]) | |
39037602 | 858 | |
c3c9b80d A |
859 | #define zone_foreign_size() \ |
860 | zone_range_size(&zone_info.zi_map_range[ZONE_ADDR_FOREIGN]) | |
39236c6e | 861 | |
f427ee49 A |
862 | __header_always_inline bool |
863 | zone_pva_is_null(zone_pva_t page) | |
d9a64523 | 864 | { |
f427ee49 | 865 | return page.packed_address == 0; |
d9a64523 A |
866 | } |
867 | ||
f427ee49 A |
868 | __header_always_inline bool |
869 | zone_pva_is_queue(zone_pva_t page) | |
870 | { | |
871 | // actual kernel pages have the top bit set | |
872 | return (int32_t)page.packed_address > 0; | |
873 | } | |
39037602 | 874 | |
f427ee49 A |
875 | __header_always_inline bool |
876 | zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2) | |
877 | { | |
878 | return pva1.packed_address == pva2.packed_address; | |
879 | } | |
39236c6e | 880 | |
f427ee49 A |
881 | __header_always_inline void |
882 | zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv, | |
883 | struct zone_page_metadata *meta) | |
884 | { | |
885 | zone_pva_t *queue_head = &((zone_pva_t *)zone_array)[queue.packed_address]; | |
39037602 | 886 | |
f427ee49 A |
887 | if (!zone_pva_is_equal(*queue_head, oldv)) { |
888 | zone_page_metadata_list_corruption(z, meta); | |
889 | } | |
890 | *queue_head = meta->zm_page_next; | |
891 | } | |
39037602 | 892 | |
f427ee49 A |
893 | __header_always_inline zone_pva_t |
894 | zone_queue_encode(zone_pva_t *headp) | |
895 | { | |
896 | return (zone_pva_t){ (uint32_t)(headp - (zone_pva_t *)zone_array) }; | |
897 | } | |
39037602 | 898 | |
f427ee49 A |
899 | __header_always_inline zone_pva_t |
900 | zone_pva_from_addr(vm_address_t addr) | |
901 | { | |
902 | // cannot use atop() because we want to maintain the sign bit | |
903 | return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) }; | |
904 | } | |
39037602 | 905 | |
c3c9b80d A |
906 | __header_always_inline zone_pva_t |
907 | zone_pva_from_element(zone_element_t ze) | |
908 | { | |
909 | return zone_pva_from_addr(ze.ze_value); | |
910 | } | |
911 | ||
f427ee49 A |
912 | __header_always_inline vm_address_t |
913 | zone_pva_to_addr(zone_pva_t page) | |
914 | { | |
915 | // cause sign extension so that we end up with the right address | |
916 | return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT; | |
917 | } | |
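/*
 * Worked example of the round-trip above (illustrative, PAGE_SHIFT == 12):
 * packing the kernel address 0xffffff8012345000 yields
 * (uint32_t)(0xffffff8012345000 >> 12) == 0xf8012345; unpacking sign-extends
 * (int32_t)0xf8012345 and shifts left by 12, recovering 0xffffff8012345000.
 * The preserved sign bit is what lets 32 bits cover both high kernel page
 * addresses and the small positive queue indices made by zone_queue_encode().
 */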
39037602 | 918 | |
f427ee49 | 919 | __header_always_inline struct zone_page_metadata * |
c3c9b80d | 920 | zone_pva_to_meta(zone_pva_t page) |
f427ee49 | 921 | { |
c3c9b80d | 922 | return &zone_info.zi_meta_base[page.packed_address]; |
f427ee49 | 923 | } |
39037602 | 924 | |
f427ee49 | 925 | __header_always_inline zone_pva_t |
c3c9b80d | 926 | zone_pva_from_meta(struct zone_page_metadata *meta) |
f427ee49 | 927 | { |
c3c9b80d | 928 | return (zone_pva_t){ (uint32_t)(meta - zone_info.zi_meta_base) }; |
f427ee49 | 929 | } |
5ba3f43e | 930 | |
f427ee49 | 931 | __header_always_inline struct zone_page_metadata * |
c3c9b80d | 932 | zone_meta_from_addr(vm_offset_t addr) |
39037602 | 933 | { |
c3c9b80d A |
934 | return zone_pva_to_meta(zone_pva_from_addr(addr)); |
935 | } | |
936 | ||
937 | __header_always_inline struct zone_page_metadata * | |
938 | zone_meta_from_element(zone_element_t ze) | |
939 | { | |
940 | return zone_pva_to_meta(zone_pva_from_element(ze)); | |
39037602 A |
941 | } |
942 | ||
c3c9b80d A |
943 | __header_always_inline zone_id_t |
944 | zone_index_from_ptr(const void *ptr) | |
945 | { | |
946 | return zone_pva_to_meta(zone_pva_from_addr((vm_offset_t)ptr))->zm_index; | |
947 | } | |
f427ee49 A |
948 | |
949 | __header_always_inline vm_offset_t | |
c3c9b80d | 950 | zone_meta_to_addr(struct zone_page_metadata *meta) |
39037602 | 951 | { |
c3c9b80d | 952 | return ptoa((int32_t)(meta - zone_info.zi_meta_base)); |
39037602 A |
953 | } |
954 | ||
f427ee49 A |
955 | __header_always_inline void |
956 | zone_meta_queue_push(zone_t z, zone_pva_t *headp, | |
c3c9b80d | 957 | struct zone_page_metadata *meta) |
39037602 | 958 | { |
f427ee49 A |
959 | zone_pva_t head = *headp; |
960 | zone_pva_t queue_pva = zone_queue_encode(headp); | |
961 | struct zone_page_metadata *tmp; | |
962 | ||
963 | meta->zm_page_next = head; | |
964 | if (!zone_pva_is_null(head)) { | |
c3c9b80d | 965 | tmp = zone_pva_to_meta(head); |
f427ee49 A |
966 | if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) { |
967 | zone_page_metadata_list_corruption(z, meta); | |
968 | } | |
c3c9b80d | 969 | tmp->zm_page_prev = zone_pva_from_meta(meta); |
f427ee49 A |
970 | } |
971 | meta->zm_page_prev = queue_pva; | |
c3c9b80d | 972 | *headp = zone_pva_from_meta(meta); |
39037602 A |
973 | } |
974 | ||
f427ee49 | 975 | __header_always_inline struct zone_page_metadata * |
c3c9b80d | 976 | zone_meta_queue_pop_native(zone_t z, zone_pva_t *headp, vm_offset_t *page_addrp) |
39037602 | 977 | { |
f427ee49 | 978 | zone_pva_t head = *headp; |
c3c9b80d | 979 | struct zone_page_metadata *meta = zone_pva_to_meta(head); |
f427ee49 A |
980 | vm_offset_t page_addr = zone_pva_to_addr(head); |
981 | struct zone_page_metadata *tmp; | |
982 | ||
c3c9b80d | 983 | if (!from_zone_map(page_addr, 1, ZONE_ADDR_NATIVE)) { |
f427ee49 A |
984 | zone_page_metadata_native_queue_corruption(z, headp); |
985 | } | |
f427ee49 A |
986 | |
987 | if (!zone_pva_is_null(meta->zm_page_next)) { | |
c3c9b80d | 988 | tmp = zone_pva_to_meta(meta->zm_page_next); |
f427ee49 A |
989 | if (!zone_pva_is_equal(tmp->zm_page_prev, head)) { |
990 | zone_page_metadata_list_corruption(z, meta); | |
991 | } | |
992 | tmp->zm_page_prev = meta->zm_page_prev; | |
993 | } | |
994 | *headp = meta->zm_page_next; | |
995 | ||
c3c9b80d | 996 | meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 }; |
f427ee49 | 997 | *page_addrp = page_addr; |
c3c9b80d A |
998 | |
999 | if (!zone_has_index(z, meta->zm_index)) { | |
1000 | zone_page_metadata_index_confusion_panic(z, | |
1001 | zone_meta_to_addr(meta), meta); | |
1002 | } | |
f427ee49 | 1003 | return meta; |
39037602 A |
1004 | } |
1005 | ||
f427ee49 | 1006 | __header_always_inline void |
c3c9b80d | 1007 | zone_meta_remqueue(zone_t z, struct zone_page_metadata *meta) |
39236c6e | 1008 | { |
c3c9b80d | 1009 | zone_pva_t meta_pva = zone_pva_from_meta(meta); |
f427ee49 A |
1010 | struct zone_page_metadata *tmp; |
1011 | ||
1012 | if (!zone_pva_is_null(meta->zm_page_next)) { | |
c3c9b80d | 1013 | tmp = zone_pva_to_meta(meta->zm_page_next); |
f427ee49 A |
1014 | if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) { |
1015 | zone_page_metadata_list_corruption(z, meta); | |
1016 | } | |
1017 | tmp->zm_page_prev = meta->zm_page_prev; | |
1018 | } | |
1019 | if (zone_pva_is_queue(meta->zm_page_prev)) { | |
1020 | zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta); | |
1021 | } else { | |
c3c9b80d | 1022 | tmp = zone_pva_to_meta(meta->zm_page_prev); |
f427ee49 A |
1023 | if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) { |
1024 | zone_page_metadata_list_corruption(z, meta); | |
1025 | } | |
1026 | tmp->zm_page_next = meta->zm_page_next; | |
1027 | } | |
1028 | ||
c3c9b80d A |
1029 | meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 }; |
1030 | } | |
1031 | ||
1032 | __header_always_inline void | |
1033 | zone_meta_requeue(zone_t z, zone_pva_t *headp, | |
1034 | struct zone_page_metadata *meta) | |
1035 | { | |
1036 | zone_meta_remqueue(z, meta); | |
1037 | zone_meta_queue_push(z, headp, meta); | |
1038 | } | |
1039 | ||
1040 | /* prevents a given metadata from ever reaching the z_pageq_empty queue */ | |
1041 | static inline void | |
1042 | zone_meta_lock_in_partial(zone_t z, struct zone_page_metadata *m, uint32_t len) | |
1043 | { | |
1044 | uint16_t new_size = zone_meta_alloc_size_add(z, m, ZM_ALLOC_SIZE_LOCK); | |
1045 | ||
1046 | assert(new_size % sizeof(vm_offset_t) == ZM_ALLOC_SIZE_LOCK); | |
1047 | if (new_size == ZM_ALLOC_SIZE_LOCK) { | |
1048 | zone_meta_requeue(z, &z->z_pageq_partial, m); | |
1049 | zone_counter_sub(z, z_wired_empty, len); | |
1050 | } | |
1051 | } | |
1052 | ||
1053 | /* allows a given metadata to reach the z_pageq_empty queue again */ | |
1054 | static inline void | |
1055 | zone_meta_unlock_from_partial(zone_t z, struct zone_page_metadata *m, uint32_t len) | |
1056 | { | |
1057 | uint16_t new_size = zone_meta_alloc_size_sub(z, m, ZM_ALLOC_SIZE_LOCK); | |
1058 | ||
1059 | assert(new_size % sizeof(vm_offset_t) == 0); | |
1060 | if (new_size == 0) { | |
1061 | zone_meta_requeue(z, &z->z_pageq_empty, m); | |
1062 | z->z_wired_empty += len; | |
1063 | } | |
39236c6e A |
1064 | } |
1065 | ||
0a7de745 | 1066 | /* |
39037602 | 1067 | * Routine to populate a page backing metadata in the zone_metadata_region. |
0a7de745 | 1068 | * Must be called without the zone lock held as it might potentially block. |
39037602 | 1069 | */ |
f427ee49 | 1070 | static void |
c3c9b80d | 1071 | zone_meta_populate(vm_offset_t base, vm_size_t size) |
39037602 | 1072 | { |
c3c9b80d A |
1073 | struct zone_page_metadata *from = zone_meta_from_addr(base); |
1074 | struct zone_page_metadata *to = from + atop(size); | |
f427ee49 | 1075 | vm_offset_t page_addr = trunc_page(from); |
d9a64523 | 1076 | |
f427ee49 A |
1077 | for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) { |
1078 | #if !KASAN_ZALLOC | |
d9a64523 A |
1079 | /* |
1080 | * This can race with another thread doing a populate on the same metadata | |
1081 | * page, where we see an updated pmap but unmapped KASan shadow, causing a | |
1082 | * fault in the shadow when we first access the metadata page. Avoid this | |
1083 | * by always synchronizing on the zone_metadata_region lock with KASan. | |
1084 | */ | |
f427ee49 | 1085 | if (pmap_find_phys(kernel_pmap, page_addr)) { |
39037602 | 1086 | continue; |
0a7de745 | 1087 | } |
d9a64523 | 1088 | #endif |
f427ee49 A |
1089 | |
1090 | for (;;) { | |
1091 | kern_return_t ret = KERN_SUCCESS; | |
1092 | ||
1093 | /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */ | |
1094 | lck_mtx_lock(&zone_metadata_region_lck); | |
1095 | if (0 == pmap_find_phys(kernel_pmap, page_addr)) { | |
1096 | ret = kernel_memory_populate(kernel_map, page_addr, | |
1097 | PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO, | |
1098 | VM_KERN_MEMORY_OSFMK); | |
1099 | } | |
1100 | lck_mtx_unlock(&zone_metadata_region_lck); | |
1101 | ||
1102 | if (ret == KERN_SUCCESS) { | |
1103 | break; | |
1104 | } | |
1105 | ||
1106 | /* | |
1107 | * We can't pass KMA_NOPAGEWAIT under a global lock as it leads | |
1108 | * to bad system deadlocks, so if the allocation failed, | |
1109 | * we need to do the VM_PAGE_WAIT() outside of the lock. | |
1110 | */ | |
1111 | VM_PAGE_WAIT(); | |
39037602 | 1112 | } |
39037602 | 1113 | } |
39037602 A |
1114 | } |
1115 | ||
c3c9b80d A |
1116 | __header_always_inline |
1117 | struct zone_page_metadata * | |
1118 | zone_element_validate(zone_t zone, zone_element_t ze) | |
39037602 | 1119 | { |
c3c9b80d A |
1120 | struct zone_page_metadata *meta; |
1121 | vm_offset_t page = zone_element_base(ze); | |
f427ee49 | 1122 | |
c3c9b80d A |
1123 | if (!from_zone_map(page, 1, ZONE_ADDR_NATIVE) && |
1124 | !from_zone_map(page, 1, ZONE_ADDR_FOREIGN)) { | |
1125 | zone_invalid_element_panic(zone, ze); | |
1126 | } | |
1127 | meta = zone_meta_from_addr(page); | |
1128 | ||
1129 | if (meta->zm_chunk_len > ZM_CHUNK_LEN_MAX) { | |
1130 | zone_invalid_element_panic(zone, ze); | |
1131 | } | |
1132 | if (zone_element_idx(ze) >= zone->z_chunk_elems) { | |
1133 | zone_invalid_element_panic(zone, ze); | |
1134 | } | |
1135 | ||
1136 | if (!zone_has_index(zone, meta->zm_index)) { | |
1137 | vm_offset_t addr = zone_element_addr(ze, zone_elem_size(zone)); | |
1138 | zone_page_metadata_index_confusion_panic(zone, addr, meta); | |
f427ee49 | 1139 | } |
c3c9b80d A |
1140 | |
1141 | return meta; | |
39037602 A |
1142 | } |
1143 | ||
f427ee49 A |
1144 | __attribute__((always_inline)) |
1145 | static struct zone_page_metadata * | |
c3c9b80d A |
1146 | zone_element_resolve(zone_t zone, vm_offset_t addr, vm_offset_t esize, |
1147 | zone_element_t *ze) | |
39037602 | 1148 | { |
f427ee49 | 1149 | struct zone_page_metadata *meta; |
c3c9b80d | 1150 | vm_offset_t page, eidx; |
f427ee49 | 1151 | |
c3c9b80d A |
1152 | if (!from_zone_map(addr, esize, ZONE_ADDR_NATIVE) && |
1153 | !from_zone_map(addr, esize, ZONE_ADDR_FOREIGN)) { | |
1154 | zone_invalid_element_addr_panic(zone, addr); | |
1155 | } | |
f427ee49 | 1156 | page = trunc_page(addr); |
c3c9b80d | 1157 | meta = zone_meta_from_addr(addr); |
39037602 | 1158 | |
c3c9b80d | 1159 | if (meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) { |
f427ee49 | 1160 | zone_invalid_element_addr_panic(zone, addr); |
c3c9b80d A |
1161 | } |
1162 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { | |
1163 | page -= ptoa(meta->zm_page_index); | |
1164 | meta -= meta->zm_page_index; | |
f427ee49 A |
1165 | } |
1166 | ||
c3c9b80d A |
1167 | eidx = (addr - page) / esize; |
1168 | if ((addr - page) % esize) { | |
f427ee49 A |
1169 | zone_invalid_element_addr_panic(zone, addr); |
1170 | } | |
1171 | ||
1172 | if (!zone_has_index(zone, meta->zm_index)) { | |
1173 | zone_page_metadata_index_confusion_panic(zone, addr, meta); | |
1174 | } | |
1175 | ||
c3c9b80d | 1176 | *ze = zone_element_encode(page, eidx, ZPM_AUTO); |
f427ee49 A |
1177 | return meta; |
1178 | } | |
1179 | ||
f427ee49 A |
1180 | /* Routine to get the size of a zone-allocated address. |
1181 | * If the address doesn't belong to the zone maps, returns 0. |
1182 | */ | |
1183 | vm_size_t | |
1184 | zone_element_size(void *addr, zone_t *z) | |
1185 | { | |
f427ee49 A |
1186 | struct zone *src_zone; |
1187 | ||
c3c9b80d A |
1188 | if (from_zone_map(addr, sizeof(void *), ZONE_ADDR_NATIVE) || |
1189 | from_zone_map(addr, sizeof(void *), ZONE_ADDR_FOREIGN)) { | |
1190 | src_zone = &zone_array[zone_index_from_ptr(addr)]; | |
f427ee49 A |
1191 | if (z) { |
1192 | *z = src_zone; | |
1193 | } | |
1194 | return zone_elem_size(src_zone); | |
1195 | } | |
c3c9b80d | 1196 | |
f427ee49 A |
1197 | #if CONFIG_GZALLOC |
1198 | if (__improbable(gzalloc_enabled())) { | |
1199 | vm_size_t gzsize; | |
1200 | if (gzalloc_element_size(addr, z, &gzsize)) { | |
1201 | return gzsize; | |
1202 | } | |
1203 | } | |
1204 | #endif /* CONFIG_GZALLOC */ | |
1205 | ||
1206 | return 0; | |
1207 | } | |
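/*
 * Usage sketch (hypothetical caller): query the owning zone and fixed element
 * size backing an arbitrary pointer, treating a 0 return as "not zone memory".
 */
static bool
zone_element_size_usage_sketch(void *ptr, zone_t *owner, vm_size_t *size)
{
	*size = zone_element_size(ptr, owner);
	return *size != 0;      /* 0 means `ptr` is not zone-backed */
}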
1208 | ||
1209 | /* This function just formats the reason for the panics by redoing the checks */ | |
1210 | __abortlike | |
1211 | static void | |
1212 | zone_require_panic(zone_t zone, void *addr) | |
39236c6e | 1213 | { |
f427ee49 A |
1214 | uint32_t zindex; |
1215 | zone_t other; | |
1216 | ||
c3c9b80d | 1217 | if (!from_zone_map(addr, zone_elem_size(zone), ZONE_ADDR_NATIVE)) { |
f427ee49 A |
1218 | panic("zone_require failed: address not in a zone (addr: %p)", addr); |
1219 | } | |
1220 | ||
c3c9b80d | 1221 | zindex = zone_index_from_ptr(addr); |
f427ee49 A |
1222 | other = &zone_array[zindex]; |
1223 | if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) { | |
1224 | panic("zone_require failed: invalid zone index %d " | |
1225 | "(addr: %p, expected: %s%s)", zindex, | |
1226 | addr, zone_heap_name(zone), zone->z_name); | |
0a7de745 | 1227 | } else { |
f427ee49 A |
1228 | panic("zone_require failed: address in unexpected zone id %d (%s%s) " |
1229 | "(addr: %p, expected: %s%s)", | |
1230 | zindex, zone_heap_name(other), other->z_name, | |
1231 | addr, zone_heap_name(zone), zone->z_name); | |
0a7de745 | 1232 | } |
39037602 A |
1233 | } |
1234 | ||
f427ee49 A |
1235 | __abortlike |
1236 | static void | |
1237 | zone_id_require_panic(zone_id_t zid, void *addr) | |
1238 | { | |
1239 | zone_require_panic(&zone_array[zid], addr); | |
1240 | } | |
1241 | ||
cb323159 | 1242 | /* |
f427ee49 | 1243 | * Routines to panic if a pointer is not mapped to an expected zone. |
cb323159 A |
1244 | * This can be used as a means of pinning an object to the zone it is expected |
1245 | * to be a part of. Causes a panic if the address does not belong to any | |
1246 | * specified zone, does not belong to any zone, has been freed and therefore | |
1247 | * unmapped from the zone, or the pointer contains an uninitialized value that | |
1248 | * does not belong to any zone. | |
f427ee49 A |
1249 | * |
1250 | * Note that this can only work with collectable zones without foreign pages. | |
cb323159 | 1251 | */ |
cb323159 | 1252 | void |
f427ee49 | 1253 | zone_require(zone_t zone, void *addr) |
cb323159 | 1254 | { |
c3c9b80d A |
1255 | vm_size_t esize = zone_elem_size(zone); |
1256 | ||
1257 | if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) { | |
1258 | if (zone_has_index(zone, zone_index_from_ptr(addr))) { | |
1259 | return; | |
1260 | } | |
f427ee49 | 1261 | #if CONFIG_GZALLOC |
c3c9b80d | 1262 | } else if (__probable(zone->gzalloc_tracked)) { |
f427ee49 | 1263 | return; |
f427ee49 | 1264 | #endif |
c3c9b80d | 1265 | } |
f427ee49 A |
1266 | zone_require_panic(zone, addr); |
1267 | } | |
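/*
 * Usage sketch (hypothetical caller): pin an object to the zone it is
 * expected to come from before trusting it, as described in the comment
 * above. zone_id_require() below is the variant to use when the zone id and
 * element size are known constants.
 */
static void
zone_require_usage_sketch(zone_t expected_zone, void *obj)
{
	zone_require(expected_zone, obj);       /* panics on type confusion */
	/* safe to treat `obj` as an element of `expected_zone` from here on */
}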
cb323159 | 1268 | |
f427ee49 A |
1269 | void |
1270 | zone_id_require(zone_id_t zid, vm_size_t esize, void *addr) | |
1271 | { | |
c3c9b80d A |
1272 | if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) { |
1273 | if (zid == zone_index_from_ptr(addr)) { | |
1274 | return; | |
1275 | } | |
1276 | #if CONFIG_GZALLOC | |
1277 | } else if (__probable(zone_array[zid].gzalloc_tracked)) { | |
f427ee49 | 1278 | return; |
c3c9b80d | 1279 | #endif |
eb6b6ca3 | 1280 | } |
c3c9b80d A |
1281 | zone_id_require_panic(zid, addr); |
1282 | } | |
1283 | ||
1284 | void | |
1285 | zone_id_require_allow_foreign(zone_id_t zid, vm_size_t esize, void *addr) | |
1286 | { | |
1287 | if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE) || | |
1288 | from_zone_map(addr, esize, ZONE_ADDR_FOREIGN))) { | |
1289 | if (zid == zone_index_from_ptr(addr)) { | |
1290 | return; | |
1291 | } | |
f427ee49 | 1292 | #if CONFIG_GZALLOC |
c3c9b80d | 1293 | } else if (__probable(zone_array[zid].gzalloc_tracked)) { |
f427ee49 | 1294 | return; |
f427ee49 | 1295 | #endif |
c3c9b80d | 1296 | } |
f427ee49 | 1297 | zone_id_require_panic(zid, addr); |
cb323159 A |
1298 | } |
1299 | ||
f427ee49 A |
1300 | bool |
1301 | zone_owns(zone_t zone, void *addr) | |
1302 | { | |
c3c9b80d A |
1303 | vm_size_t esize = zone_elem_size(zone); |
1304 | ||
1305 | if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) { | |
1306 | return zone_has_index(zone, zone_index_from_ptr(addr)); | |
f427ee49 | 1307 | #if CONFIG_GZALLOC |
c3c9b80d | 1308 | } else if (__probable(zone->gzalloc_tracked)) { |
f427ee49 | 1309 | return true; |
f427ee49 | 1310 | #endif |
c3c9b80d | 1311 | } |
f427ee49 A |
1312 | return false; |
1313 | } | |
5ba3f43e | 1314 | |
c3c9b80d A |
1315 | #endif /* !ZALLOC_TEST */ |
1316 | #pragma mark Zone bits allocator | |
5ba3f43e | 1317 | |
c3c9b80d A |
1318 | /*! |
1319 | * @defgroup Zone Bitmap allocator | |
1320 | * @{ | |
1321 | * | |
1322 | * @brief | |
1323 | * Functions implementing the zone bitmap allocator | |
1324 | * | |
1325 | * @discussion | |
1326 | * The zone allocator maintains which elements are allocated or free in bitmaps. | |
1327 | * | |
1328 | * When the number of elements per page is smaller than 32, it is stored inline | |
1329 | * on the @c zone_page_metadata structure (@c zm_inline_bitmap is set, | |
1330 | * and @c zm_bitmap used for storage). | |
1331 | * | |
1332 | * When the number of elements is larger, then a bitmap is allocated from | |
1333 | * a buddy allocator (implemented under the @c zba_* namespace). Pointers |
1334 | * to bitmaps are implemented as a packed 32 bit bitmap reference, stored in | |
1335 | * @c zm_bitmap. The low 3 bits encode the scale (order) of the allocation in | |
1336 | * @c ZBA_GRANULE units, and hence actual allocations encoded with that scheme | |
1337 | * cannot be larger than 1024 bytes (8192 bits). | |
1338 | * | |
1339 | * This buddy allocator can actually accommodate allocations as large | |
1340 | * as 8k on 16k systems and 2k on 4k systems. | |
1341 | * | |
1342 | * Note: @c zba_* functions are implementation details not meant to be used | |
1343 | * outside of the implementation of the allocator itself. Interfaces to the rest of | |
1344 | * the zone allocator are documented and not @c zba_* prefixed. | |
1345 | */ | |
5ba3f43e | 1346 | |
c3c9b80d A |
1347 | #define ZBA_CHUNK_SIZE PAGE_MAX_SIZE |
1348 | #define ZBA_GRANULE sizeof(uint64_t) | |
1349 | #define ZBA_GRANULE_BITS (8 * sizeof(uint64_t)) | |
1350 | #define ZBA_MAX_ORDER (PAGE_MAX_SHIFT - 4) | |
1351 | #define ZBA_MAX_ALLOC_ORDER 7 | |
1352 | #define ZBA_SLOTS (ZBA_CHUNK_SIZE / ZBA_GRANULE) | |
1353 | static_assert(2ul * ZBA_GRANULE << ZBA_MAX_ORDER == ZBA_CHUNK_SIZE, "chunk sizes"); | |
1354 | static_assert(ZBA_MAX_ALLOC_ORDER <= ZBA_MAX_ORDER, "ZBA_MAX_ORDER is enough"); | |
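/*
 * Editor's illustration (added; not part of the original allocator): the
 * packed 32-bit bitmap reference described above keeps the buddy order in
 * its low 3 bits, so the largest encodable bitmap is
 * ZBA_GRANULE << ZBA_MAX_ALLOC_ORDER bytes. The check below only restates
 * that arithmetic.
 */
static_assert((ZBA_GRANULE << ZBA_MAX_ALLOC_ORDER) == 1024,
    "packed bitmap references top out at 1024 bytes (8192 bits)");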
1355 | ||
1356 | struct zone_bits_chain { | |
1357 | uint32_t zbc_next; | |
1358 | uint32_t zbc_prev; | |
1359 | } __attribute__((aligned(ZBA_GRANULE))); | |
1360 | ||
1361 | struct zone_bits_head { | |
1362 | uint32_t zbh_next; | |
1363 | uint32_t zbh_unused; | |
1364 | } __attribute__((aligned(ZBA_GRANULE))); | |
1365 | ||
1366 | static_assert(sizeof(struct zone_bits_chain) == ZBA_GRANULE, "zbc size"); | |
1367 | static_assert(sizeof(struct zone_bits_head) == ZBA_GRANULE, "zbh size"); | |
1368 | ||
1369 | struct zone_bits_allocator_meta { | |
1370 | uint32_t zbam_chunks; | |
1371 | uint32_t __zbam_padding; | |
1372 | struct zone_bits_head zbam_lists[ZBA_MAX_ORDER + 1]; | |
1373 | }; | |
5ba3f43e | 1374 | |
c3c9b80d A |
1375 | struct zone_bits_allocator_header { |
1376 | uint64_t zbah_bits[ZBA_SLOTS / (8 * sizeof(uint64_t))]; | |
1377 | }; | |
5ba3f43e | 1378 | |
c3c9b80d A |
1379 | #if ZALLOC_TEST |
1380 | static struct zalloc_bits_allocator_test_setup { | |
1381 | vm_offset_t zbats_base; | |
1382 | void (*zbats_populate)(vm_address_t addr, vm_size_t size); | |
1383 | } zba_test_info; | |
5ba3f43e | 1384 | |
c3c9b80d A |
1385 | static struct zone_bits_allocator_header * |
1386 | zba_base_header(void) | |
1387 | { | |
1388 | return (struct zone_bits_allocator_header *)zba_test_info.zbats_base; | |
1389 | } | |
5ba3f43e | 1390 | |
c3c9b80d A |
1391 | static void |
1392 | zba_populate(uint32_t n) | |
1393 | { | |
1394 | vm_address_t base = zba_test_info.zbats_base; | |
1395 | zba_test_info.zbats_populate(base + n * ZBA_CHUNK_SIZE, ZBA_CHUNK_SIZE); | |
1396 | } | |
1397 | #else | |
1398 | __startup_data | |
1399 | static uint8_t zba_chunk_startup[ZBA_CHUNK_SIZE] | |
1400 | __attribute__((aligned(ZBA_CHUNK_SIZE))); | |
1401 | static LCK_MTX_EARLY_DECLARE(zba_mtx, &zone_locks_grp); | |
5ba3f43e | 1402 | |
c3c9b80d A |
1403 | static struct zone_bits_allocator_header * |
1404 | zba_base_header(void) | |
1405 | { | |
1406 | return (struct zone_bits_allocator_header *)zone_info.zi_bits_range.min_address; | |
1407 | } | |
5ba3f43e | 1408 | |
c3c9b80d A |
1409 | static void |
1410 | zba_lock(void) | |
1411 | { | |
1412 | lck_mtx_lock(&zba_mtx); | |
1413 | } | |
5ba3f43e | 1414 | |
c3c9b80d A |
1415 | static void |
1416 | zba_unlock(void) | |
1417 | { | |
1418 | lck_mtx_unlock(&zba_mtx); | |
1419 | } | |
f427ee49 | 1420 | |
c3c9b80d A |
1421 | static void |
1422 | zba_populate(uint32_t n) | |
1423 | { | |
1424 | vm_size_t size = ZBA_CHUNK_SIZE; | |
1425 | vm_address_t addr; | |
5ba3f43e | 1426 | |
c3c9b80d A |
1427 | addr = zone_info.zi_bits_range.min_address + n * size; |
1428 | if (addr >= zone_info.zi_bits_range.max_address) { | |
1429 | zone_t z = zone_find_largest(); | |
1430 | panic("zba_populate: out of bitmap space, " | |
1431 | "likely due to memory leak in zone [%s%s] " | |
1432 | "(%luM, %d elements allocated)", | |
1433 | zone_heap_name(z), zone_name(z), | |
1434 | (unsigned long)zone_size_wired(z) >> 20, | |
1435 | zone_count_allocated(z)); | |
1436 | } | |
1437 | ||
1438 | for (;;) { | |
1439 | kern_return_t kr = KERN_SUCCESS; | |
1440 | ||
1441 | if (0 == pmap_find_phys(kernel_pmap, addr)) { | |
1442 | kr = kernel_memory_populate(kernel_map, addr, size, | |
1443 | KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO, | |
1444 | VM_KERN_MEMORY_OSFMK); | |
1445 | } | |
1446 | ||
1447 | if (kr == KERN_SUCCESS) { | |
1448 | return; | |
1449 | } | |
1450 | ||
1451 | zba_unlock(); | |
1452 | VM_PAGE_WAIT(); | |
1453 | zba_lock(); | |
1454 | } | |
1455 | } | |
5ba3f43e | 1456 | #endif |
5ba3f43e | 1457 | |
c3c9b80d A |
1458 | __pure2 |
1459 | static struct zone_bits_allocator_meta * | |
1460 | zba_meta(void) | |
1461 | { | |
1462 | return (struct zone_bits_allocator_meta *)&zba_base_header()[1]; | |
1463 | } | |
1464 | ||
1465 | __pure2 | |
1466 | static uint64_t * | |
1467 | zba_slot_base(void) | |
1468 | { | |
1469 | return (uint64_t *)zba_base_header(); | |
1470 | } | |
1471 | ||
1472 | __pure2 | |
1473 | static vm_address_t | |
1474 | zba_page_addr(uint32_t n) | |
1475 | { | |
1476 | return (vm_address_t)zba_base_header() + n * ZBA_CHUNK_SIZE; | |
1477 | } | |
1478 | ||
1479 | __pure2 | |
1480 | static struct zone_bits_head * | |
1481 | zba_head(uint32_t order) | |
1482 | { | |
1483 | return &zba_meta()->zbam_lists[order]; | |
1484 | } | |
5ba3f43e | 1485 | |
c3c9b80d | 1486 | __pure2 |
5ba3f43e | 1487 | static uint32_t |
c3c9b80d | 1488 | zba_head_index(uint32_t order) |
5ba3f43e | 1489 | { |
c3c9b80d A |
1490 | uint32_t hdr_size = sizeof(struct zone_bits_allocator_header) + |
1491 | offsetof(struct zone_bits_allocator_meta, zbam_lists); | |
1492 | return (hdr_size / ZBA_GRANULE) + order; | |
5ba3f43e A |
1493 | } |
1494 | ||
c3c9b80d A |
1495 | __pure2 |
1496 | static struct zone_bits_chain * | |
1497 | zba_chain_for_index(uint32_t index) | |
1498 | { | |
1499 | return (struct zone_bits_chain *)(zba_slot_base() + index); | |
1500 | } | |
1501 | ||
1502 | __pure2 | |
5ba3f43e | 1503 | static uint32_t |
c3c9b80d | 1504 | zba_chain_to_index(const struct zone_bits_chain *zbc) |
5ba3f43e | 1505 | { |
c3c9b80d | 1506 | return (uint32_t)((const uint64_t *)zbc - zba_slot_base()); |
5ba3f43e A |
1507 | } |
1508 | ||
c3c9b80d | 1509 | __abortlike |
5ba3f43e | 1510 | static void |
c3c9b80d | 1511 | zba_head_corruption_panic(uint32_t order) |
5ba3f43e | 1512 | { |
c3c9b80d A |
1513 | panic("zone bits allocator head[%d:%p] is corrupt", order, |
1514 | zba_head(order)); | |
1515 | } | |
0a7de745 | 1516 | |
c3c9b80d A |
1517 | __abortlike |
1518 | static void | |
1519 | zba_chain_corruption_panic(struct zone_bits_chain *a, struct zone_bits_chain *b) | |
1520 | { | |
1521 | panic("zone bits allocator freelist is corrupt (%p <-> %p)", a, b); | |
1522 | } | |
0a7de745 | 1523 | |
c3c9b80d A |
1524 | static void |
1525 | zba_push_block(struct zone_bits_chain *zbc, uint32_t order) | |
1526 | { | |
1527 | struct zone_bits_head *hd = zba_head(order); | |
1528 | uint32_t hd_index = zba_head_index(order); | |
1529 | uint32_t index = zba_chain_to_index(zbc); | |
1530 | struct zone_bits_chain *next; | |
1531 | ||
1532 | if (hd->zbh_next) { | |
1533 | next = zba_chain_for_index(hd->zbh_next); | |
1534 | if (next->zbc_prev != hd_index) { | |
1535 | zba_head_corruption_panic(order); | |
0a7de745 | 1536 | } |
c3c9b80d | 1537 | next->zbc_prev = index; |
0a7de745 | 1538 | } |
c3c9b80d A |
1539 | zbc->zbc_next = hd->zbh_next; |
1540 | zbc->zbc_prev = hd_index; | |
1541 | hd->zbh_next = index; | |
5ba3f43e A |
1542 | } |
1543 | ||
c3c9b80d A |
1544 | static void |
1545 | zba_remove_block(struct zone_bits_chain *zbc) | |
5ba3f43e | 1546 | { |
c3c9b80d A |
1547 | struct zone_bits_chain *prev = zba_chain_for_index(zbc->zbc_prev); |
1548 | uint32_t index = zba_chain_to_index(zbc); | |
0a7de745 | 1549 | |
c3c9b80d A |
1550 | if (prev->zbc_next != index) { |
1551 | zba_chain_corruption_panic(prev, zbc); | |
1552 | } | |
1553 | if ((prev->zbc_next = zbc->zbc_next)) { | |
1554 | struct zone_bits_chain *next = zba_chain_for_index(zbc->zbc_next); | |
1555 | if (next->zbc_prev != index) { | |
1556 | zba_chain_corruption_panic(zbc, next); | |
0a7de745 | 1557 | } |
c3c9b80d | 1558 | next->zbc_prev = zbc->zbc_prev; |
0a7de745 | 1559 | } |
5ba3f43e A |
1560 | } |
1561 | ||
c3c9b80d A |
1562 | static vm_address_t |
1563 | zba_try_pop_block(uint32_t order) | |
5ba3f43e | 1564 | { |
c3c9b80d A |
1565 | struct zone_bits_head *hd = zba_head(order); |
1566 | struct zone_bits_chain *zbc; | |
0a7de745 | 1567 | |
c3c9b80d A |
1568 | if (hd->zbh_next == 0) { |
1569 | return 0; | |
0a7de745 | 1570 | } |
c3c9b80d A |
1571 | |
1572 | zbc = zba_chain_for_index(hd->zbh_next); | |
1573 | zba_remove_block(zbc); | |
1574 | return (vm_address_t)zbc; | |
5ba3f43e A |
1575 | } |
1576 | ||
c3c9b80d A |
1577 | static struct zone_bits_allocator_header * |
1578 | zba_header(vm_offset_t addr) | |
1579 | { | |
1580 | addr &= -(vm_offset_t)ZBA_CHUNK_SIZE; | |
1581 | return (struct zone_bits_allocator_header *)addr; | |
1582 | } | |
5ba3f43e | 1583 | |
c3c9b80d A |
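/*
 * Editor's note on the node numbering used below (descriptive only):
 * every chunk is covered by an implicit binary tree whose split bits live
 * in zbah_bits. Node 0 stands for the whole chunk, the children of node n
 * are 2n + 1 and 2n + 2, and buddies are siblings that differ only in the
 * low bit of (n - 1). Blocks of order o start at index
 * (1 << (ZBA_MAX_ORDER - o + 1)) - 1, which is the mapping that zba_node()
 * and zba_chain_for_node() convert to and from chunk offsets.
 */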
1584 | static size_t |
1585 | zba_node_parent(size_t node) | |
1586 | { | |
1587 | return (node - 1) / 2; | |
1588 | } | |
5ba3f43e | 1589 | |
c3c9b80d A |
1590 | static size_t |
1591 | zba_node_left_child(size_t node) | |
1592 | { | |
1593 | return node * 2 + 1; | |
1594 | } | |
5ba3f43e | 1595 | |
c3c9b80d A |
1596 | static size_t |
1597 | zba_node_buddy(size_t node) | |
5ba3f43e | 1598 | { |
c3c9b80d A |
1599 | return ((node - 1) ^ 1) + 1; |
1600 | } | |
0a7de745 | 1601 | |
c3c9b80d A |
1602 | static size_t |
1603 | zba_node(vm_offset_t addr, uint32_t order) | |
1604 | { | |
1605 | vm_offset_t offs = (addr % ZBA_CHUNK_SIZE) / ZBA_GRANULE; | |
1606 | return (offs >> order) + (1 << (ZBA_MAX_ORDER - order + 1)) - 1; | |
1607 | } | |
0a7de745 | 1608 | |
c3c9b80d A |
1609 | static struct zone_bits_chain * |
1610 | zba_chain_for_node(struct zone_bits_allocator_header *zbah, size_t node, uint32_t order) | |
1611 | { | |
1612 | vm_offset_t offs = (node - (1 << (ZBA_MAX_ORDER - order + 1)) + 1) << order; | |
1613 | return (struct zone_bits_chain *)((vm_offset_t)zbah + offs * ZBA_GRANULE); | |
5ba3f43e A |
1614 | } |
1615 | ||
c3c9b80d A |
1616 | static void |
1617 | zba_node_flip_split(struct zone_bits_allocator_header *zbah, size_t node) | |
5ba3f43e | 1618 | { |
c3c9b80d A |
1619 | zbah->zbah_bits[node / 64] ^= 1ull << (node % 64); |
1620 | } | |
0a7de745 | 1621 | |
c3c9b80d A |
1622 | static bool |
1623 | zba_node_is_split(struct zone_bits_allocator_header *zbah, size_t node) | |
1624 | { | |
1625 | return zbah->zbah_bits[node / 64] & (1ull << (node % 64)); | |
5ba3f43e A |
1626 | } |
1627 | ||
1628 | static void | |
c3c9b80d | 1629 | zba_free(vm_offset_t addr, uint32_t order) |
5ba3f43e | 1630 | { |
c3c9b80d A |
1631 | struct zone_bits_allocator_header *zbah = zba_header(addr); |
1632 | struct zone_bits_chain *zbc; | |
1633 | size_t node = zba_node(addr, order); | |
0a7de745 | 1634 | |
c3c9b80d A |
1635 | while (node) { |
1636 | size_t parent = zba_node_parent(node); | |
0a7de745 | 1637 | |
c3c9b80d A |
1638 | zba_node_flip_split(zbah, parent); |
1639 | if (zba_node_is_split(zbah, parent)) { | |
1640 | break; | |
1641 | } | |
0a7de745 | 1642 | |
c3c9b80d A |
1643 | zbc = zba_chain_for_node(zbah, zba_node_buddy(node), order); |
1644 | zba_remove_block(zbc); | |
1645 | order++; | |
1646 | node = parent; | |
0a7de745 | 1647 | } |
0a7de745 | 1648 | |
c3c9b80d A |
1649 | zba_push_block(zba_chain_for_node(zbah, node, order), order); |
1650 | } | |
0a7de745 | 1651 | |
c3c9b80d A |
1652 | static vm_size_t |
1653 | zba_chunk_header_size(uint32_t n) | |
1654 | { | |
1655 | vm_size_t hdr_size = sizeof(struct zone_bits_allocator_header); | |
1656 | if (n == 0) { | |
1657 | hdr_size += sizeof(struct zone_bits_allocator_meta); | |
0a7de745 | 1658 | } |
c3c9b80d | 1659 | return hdr_size; |
5ba3f43e A |
1660 | } |
1661 | ||
1662 | static void | |
c3c9b80d | 1663 | zba_init_chunk(uint32_t n) |
5ba3f43e | 1664 | { |
c3c9b80d A |
1665 | vm_size_t hdr_size = zba_chunk_header_size(n); |
1666 | vm_offset_t page = zba_page_addr(n); | |
1667 | struct zone_bits_allocator_header *zbah = zba_header(page); | |
1668 | vm_size_t size = ZBA_CHUNK_SIZE; | |
1669 | size_t node; | |
0a7de745 | 1670 | |
c3c9b80d A |
1671 | for (uint32_t o = ZBA_MAX_ORDER + 1; o-- > 0;) { |
1672 | if (size < hdr_size + (ZBA_GRANULE << o)) { | |
1673 | continue; | |
0a7de745 | 1674 | } |
c3c9b80d A |
1675 | size -= ZBA_GRANULE << o; |
1676 | node = zba_node(page + size, o); | |
1677 | zba_node_flip_split(zbah, zba_node_parent(node)); | |
1678 | zba_push_block(zba_chain_for_node(zbah, node, o), o); | |
0a7de745 A |
1679 | } |
1680 | ||
c3c9b80d | 1681 | zba_meta()->zbam_chunks = n + 1; |
5ba3f43e A |
1682 | } |
1683 | ||
c3c9b80d | 1684 | __attribute__((noinline)) |
5ba3f43e | 1685 | static void |
c3c9b80d | 1686 | zba_grow(void) |
5ba3f43e | 1687 | { |
c3c9b80d | 1688 | uint32_t chunk = zba_meta()->zbam_chunks; |
0a7de745 | 1689 | |
c3c9b80d A |
1690 | zba_populate(chunk); |
1691 | if (zba_meta()->zbam_chunks == chunk) { | |
1692 | zba_init_chunk(chunk); | |
0a7de745 | 1693 | } |
5ba3f43e A |
1694 | } |
1695 | ||
c3c9b80d A |
1696 | static vm_offset_t |
1697 | zba_alloc(uint32_t order) | |
5ba3f43e | 1698 | { |
c3c9b80d A |
1699 | struct zone_bits_allocator_header *zbah; |
1700 | uint32_t cur = order; | |
1701 | vm_address_t addr; | |
1702 | size_t node; | |
5ba3f43e | 1703 | |
c3c9b80d A |
1704 | while ((addr = zba_try_pop_block(cur)) == 0) { |
1705 | if (cur++ >= ZBA_MAX_ORDER) { | |
1706 | zba_grow(); | |
1707 | cur = order; | |
0a7de745 | 1708 | } |
0a7de745 | 1709 | } |
5ba3f43e | 1710 | |
c3c9b80d A |
1711 | zbah = zba_header(addr); |
1712 | node = zba_node(addr, cur); | |
1713 | zba_node_flip_split(zbah, zba_node_parent(node)); | |
1714 | while (cur > order) { | |
1715 | cur--; | |
1716 | zba_node_flip_split(zbah, node); | |
1717 | node = zba_node_left_child(node); | |
1718 | zba_push_block(zba_chain_for_node(zbah, node + 1, cur), cur); | |
1719 | } | |
5ba3f43e | 1720 | |
c3c9b80d | 1721 | return addr; |
5ba3f43e A |
1722 | } |
1723 | ||
c3c9b80d A |
1724 | #define zba_map_index(type, n) (n / (8 * sizeof(type))) |
1725 | #define zba_map_bit(type, n) ((type)1 << (n % (8 * sizeof(type)))) | |
1726 | #define zba_map_mask_lt(type, n) (zba_map_bit(type, n) - 1) | |
1727 | #define zba_map_mask_ge(type, n) ((type)-zba_map_bit(type, n)) | |
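/*
 * Editor's illustration (added; not in the original): a worked example of
 * the mask macros above for n == 5 on a 32-bit map word.
 */
static_assert(zba_map_mask_lt(uint32_t, 5) == 0x0000001fu,
    "zba_map_mask_lt(5) selects bits 0..4");
static_assert(zba_map_mask_ge(uint32_t, 5) == 0xffffffe0u,
    "zba_map_mask_ge(5) selects bits 5..31");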
5ba3f43e | 1728 | |
c3c9b80d A |
1729 | #if !ZALLOC_TEST |
1730 | static uint32_t | |
1731 | zba_bits_ref_order(uint32_t bref) | |
39037602 | 1732 | { |
c3c9b80d | 1733 | return bref & 0x7; |
f427ee49 | 1734 | } |
39037602 | 1735 | |
c3c9b80d A |
1736 | static bitmap_t * |
1737 | zba_bits_ref_ptr(uint32_t bref) | |
f427ee49 | 1738 | { |
c3c9b80d | 1739 | return zba_slot_base() + (bref >> 3); |
39236c6e A |
1740 | } |
1741 | ||
c3c9b80d A |
1742 | static vm_offset_t |
1743 | zba_scan_bitmap_inline(zone_t zone, struct zone_page_metadata *meta, | |
1744 | vm_offset_t eidx) | |
5ba3f43e | 1745 | { |
c3c9b80d A |
1746 | size_t i = eidx / 32; |
1747 | uint32_t map; | |
5ba3f43e | 1748 | |
c3c9b80d A |
1749 | if (eidx % 32) { |
1750 | map = meta[i].zm_bitmap & zba_map_mask_ge(uint32_t, eidx); | |
1751 | if (map) { | |
1752 | eidx = __builtin_ctz(map); | |
1753 | meta[i].zm_bitmap ^= 1u << eidx; | |
1754 | return i * 32 + eidx; | |
1755 | } | |
1756 | i++; | |
5ba3f43e | 1757 | } |
f427ee49 | 1758 | |
c3c9b80d A |
1759 | uint32_t chunk_len = meta->zm_chunk_len; |
1760 | if (chunk_len == 1 && zone->z_percpu) { | |
1761 | chunk_len = zpercpu_count(); | |
1762 | } | |
1763 | for (int j = 0; j < chunk_len; j++, i++) { | |
1764 | if (i >= chunk_len) { | |
1765 | i = 0; | |
1766 | } | |
1767 | if (__probable(map = meta[i].zm_bitmap)) { | |
1768 | meta[i].zm_bitmap &= map - 1; | |
1769 | return i * 32 + __builtin_ctz(map); | |
1770 | } | |
1771 | } | |
5ba3f43e | 1772 | |
c3c9b80d | 1773 | zone_page_meta_accounting_panic(zone, meta, "zm_bitmap"); |
f427ee49 | 1774 | } |
5ba3f43e | 1775 | |
c3c9b80d A |
1776 | static vm_offset_t |
1777 | zba_scan_bitmap_ref(zone_t zone, struct zone_page_metadata *meta, | |
1778 | vm_offset_t eidx) | |
39236c6e | 1779 | { |
c3c9b80d A |
1780 | uint32_t bits_size = 1 << zba_bits_ref_order(meta->zm_bitmap); |
1781 | bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); | |
1782 | size_t i = eidx / 64; | |
1783 | uint64_t map; | |
39236c6e | 1784 | |
c3c9b80d A |
1785 | if (eidx % 64) { |
1786 | map = bits[i] & zba_map_mask_ge(uint64_t, eidx); | |
1787 | if (map) { | |
1788 | eidx = __builtin_ctzll(map); | |
1789 | bits[i] ^= 1ull << eidx; | |
1790 | return i * 64 + eidx; | |
1791 | } | |
1792 | i++; | |
1793 | } | |
1794 | ||
1795 | for (int j = 0; j < bits_size; i++, j++) { | |
1796 | if (i >= bits_size) { | |
1797 | i = 0; | |
1798 | } | |
1799 | if (__probable(map = bits[i])) { | |
1800 | bits[i] &= map - 1; | |
1801 | return i * 64 + __builtin_ctzll(map); | |
1802 | } | |
1803 | } | |
1804 | ||
1805 | zone_page_meta_accounting_panic(zone, meta, "zm_bitmap"); | |
f427ee49 | 1806 | } |
39236c6e | 1807 | |
c3c9b80d A |
1808 | /*! |
1809 | * @function zone_meta_find_and_clear_bit | |
1810 | * | |
1811 | * @brief | |
1812 | * The core of the bitmap allocator: find a bit set in the bitmaps. | |
1813 | * | |
1814 | * @discussion | |
1815 | * This method will round robin through available allocations, | |
1816 | * with a per-core memory of the last element index allocated. | |
1817 | * | |
1818 | * This is done in order to avoid a fully LIFO behavior which makes exploiting | |
1819 | * double-free bugs way too practical. | |
1820 | * | |
1821 | * @param zone The zone we're allocating from. | |
1822 | * @param meta The main metadata for the chunk being allocated from. | |
1823 | */ | |
1824 | static vm_offset_t | |
1825 | zone_meta_find_and_clear_bit(zone_t zone, struct zone_page_metadata *meta) | |
f427ee49 | 1826 | { |
c3c9b80d A |
1827 | zone_stats_t zs = zpercpu_get(zone->z_stats); |
1828 | vm_offset_t eidx = zs->zs_alloc_rr + 1; | |
1829 | ||
1830 | if (meta->zm_inline_bitmap) { | |
1831 | eidx = zba_scan_bitmap_inline(zone, meta, eidx); | |
1832 | } else { | |
1833 | eidx = zba_scan_bitmap_ref(zone, meta, eidx); | |
1834 | } | |
1835 | zs->zs_alloc_rr = (uint16_t)eidx; | |
1836 | return eidx; | |
f427ee49 A |
1837 | } |
1838 | ||
c3c9b80d A |
1839 | /*! |
1840 | * @function zone_meta_bits_init | |
1841 | * | |
1842 | * @brief | |
1843 | * Initializes the zm_bitmap field(s) for a newly assigned chunk. | |
1844 | * | |
1845 | * @param meta The main metadata for the initialized chunk. | |
1846 | * @param count The number of elements the chunk can hold | |
1847 | * (which might be partial for partially populated chunks). | |
1848 | * @param nbits The maximum number of bits that will be used. | |
1849 | */ | |
1850 | static void | |
1851 | zone_meta_bits_init(struct zone_page_metadata *meta, | |
1852 | uint32_t count, uint32_t nbits) | |
f427ee49 | 1853 | { |
c3c9b80d A |
1854 | static_assert(ZONE_MAX_ALLOC_SIZE / ZONE_MIN_ELEM_SIZE <= |
1855 | ZBA_GRANULE_BITS << ZBA_MAX_ORDER, "bitmaps will be large enough"); | |
f427ee49 | 1856 | |
c3c9b80d | 1857 | if (meta->zm_inline_bitmap) { |
f427ee49 | 1858 | /* |
c3c9b80d A |
1859 | * We're called with the metadata zm_bitmap fields already |
1860 | * zeroed out. | |
f427ee49 | 1861 | */ |
c3c9b80d A |
1862 | for (size_t i = 0; 32 * i < count; i++) { |
1863 | if (32 * i + 32 <= count) { | |
1864 | meta[i].zm_bitmap = ~0u; | |
1865 | } else { | |
1866 | meta[i].zm_bitmap = zba_map_mask_lt(uint32_t, count); | |
1867 | } | |
0a7de745 | 1868 | } |
c3c9b80d A |
1869 | } else { |
1870 | uint32_t order = flsll((nbits - 1) / ZBA_GRANULE_BITS); | |
1871 | uint64_t *bits; | |
39236c6e | 1872 | |
c3c9b80d A |
1873 | assert(order <= ZBA_MAX_ALLOC_ORDER); |
1874 | assert(count <= ZBA_GRANULE_BITS << order); | |
f427ee49 | 1875 | |
c3c9b80d A |
1876 | zba_lock(); |
1877 | bits = (uint64_t *)zba_alloc(order); | |
1878 | zba_unlock(); | |
1879 | ||
1880 | for (size_t i = 0; i < 1u << order; i++) { | |
1881 | if (64 * i + 64 <= count) { | |
1882 | bits[i] = ~0ull; | |
1883 | } else if (64 * i < count) { | |
1884 | bits[i] = zba_map_mask_lt(uint64_t, count); | |
1885 | } else { | |
1886 | bits[i] = 0ull; | |
1887 | } | |
f427ee49 | 1888 | } |
39236c6e | 1889 | |
c3c9b80d A |
1890 | meta->zm_bitmap = (uint32_t)((vm_offset_t)bits - |
1891 | (vm_offset_t)zba_slot_base()) + order; | |
1892 | } | |
39236c6e A |
1893 | } |
1894 | ||
c3c9b80d A |
1895 | /*! |
1896 | * @function zone_meta_bits_merge | |
1897 | * | |
1898 | * @brief | |
1899 | * Adds elements <code>[start, end)</code> to a chunk being extended. | |
1900 | * | |
1901 | * @param meta The main metadata for the extended chunk. | |
1902 | * @param start The index of the first element to add to the chunk. | |
1903 | * @param end The index one past the last element to add (exclusive). | |
1904 | */ | |
1905 | static void | |
1906 | zone_meta_bits_merge(struct zone_page_metadata *meta, | |
1907 | uint32_t start, uint32_t end) | |
39236c6e | 1908 | { |
c3c9b80d A |
1909 | if (meta->zm_inline_bitmap) { |
1910 | while (start < end) { | |
1911 | size_t s_i = start / 32; | |
1912 | size_t s_e = end / 32; | |
f427ee49 | 1913 | |
c3c9b80d A |
1914 | if (s_i == s_e) { |
1915 | meta[s_i].zm_bitmap |= zba_map_mask_lt(uint32_t, end) & | |
1916 | zba_map_mask_ge(uint32_t, start); | |
1917 | break; | |
1918 | } | |
1919 | ||
1920 | meta[s_i].zm_bitmap |= zba_map_mask_ge(uint32_t, start); | |
1921 | start += 32 - (start % 32); | |
f427ee49 | 1922 | } |
f427ee49 | 1923 | } else { |
c3c9b80d | 1924 | uint64_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); |
0a7de745 | 1925 | |
c3c9b80d A |
1926 | while (start < end) { |
1927 | size_t s_i = start / 64; | |
1928 | size_t s_e = end / 64; | |
f427ee49 | 1929 | |
c3c9b80d A |
1930 | if (s_i == s_e) { |
1931 | bits[s_i] |= zba_map_mask_lt(uint64_t, end) & | |
1932 | zba_map_mask_ge(uint64_t, start); | |
1933 | break; | |
1934 | } | |
1935 | bits[s_i] |= zba_map_mask_ge(uint64_t, start); | |
1936 | start += 64 - (start % 64); | |
1937 | } | |
1938 | } | |
39236c6e A |
1939 | } |
1940 | ||
c3c9b80d A |
1941 | /*! |
1942 | * @function zone_bits_free | |
1943 | * | |
1944 | * @brief | |
1945 | * Frees a bitmap to the zone bitmap allocator. | |
1946 | * | |
1947 | * @param bref | |
1948 | * A bitmap reference set by @c zone_meta_bits_init() in a @c zm_bitmap field. | |
1949 | */ | |
1950 | static void | |
1951 | zone_bits_free(uint32_t bref) | |
f427ee49 | 1952 | { |
c3c9b80d A |
1953 | zba_lock(); |
1954 | zba_free((vm_offset_t)zba_bits_ref_ptr(bref), zba_bits_ref_order(bref)); | |
1955 | zba_unlock(); | |
f427ee49 A |
1956 | } |
1957 | ||
c3c9b80d A |
1958 | /*! |
1959 | * @function zone_meta_is_free | |
1960 | * | |
1961 | * @brief | |
1962 | * Returns whether a given element appears free. | |
39236c6e | 1963 | */ |
c3c9b80d A |
1964 | static bool |
1965 | zone_meta_is_free(struct zone_page_metadata *meta, zone_element_t ze) | |
1966 | { | |
1967 | vm_offset_t eidx = zone_element_idx(ze); | |
1968 | if (meta->zm_inline_bitmap) { | |
1969 | uint32_t bit = zba_map_bit(uint32_t, eidx); | |
1970 | return meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit; | |
316670eb | 1971 | } else { |
c3c9b80d A |
1972 | bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); |
1973 | uint64_t bit = zba_map_bit(uint64_t, eidx); | |
1974 | return bits[zba_map_index(uint64_t, eidx)] & bit; | |
0a7de745 | 1975 | } |
316670eb A |
1976 | } |
1977 | ||
c3c9b80d A |
1978 | /*! |
1979 | * @function zone_meta_mark_free | |
1980 | * | |
1981 | * @brief | |
1982 | * Marks an element as free and returns whether it was marked as used. | |
39236c6e | 1983 | */ |
c3c9b80d A |
1984 | static bool |
1985 | zone_meta_mark_free(struct zone_page_metadata *meta, zone_element_t ze) | |
39236c6e | 1986 | { |
c3c9b80d | 1987 | vm_offset_t eidx = zone_element_idx(ze); |
39236c6e | 1988 | |
c3c9b80d A |
1989 | if (meta->zm_inline_bitmap) { |
1990 | uint32_t bit = zba_map_bit(uint32_t, eidx); | |
1991 | if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) { | |
1992 | return false; | |
1993 | } | |
1994 | meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit; | |
1995 | } else { | |
1996 | bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); | |
1997 | uint64_t bit = zba_map_bit(uint64_t, eidx); | |
1998 | if (bits[zba_map_index(uint64_t, eidx)] & bit) { | |
1999 | return false; | |
f427ee49 | 2000 | } |
c3c9b80d | 2001 | bits[zba_map_index(uint64_t, eidx)] ^= bit; |
0a7de745 | 2002 | } |
c3c9b80d | 2003 | return true; |
f427ee49 | 2004 | } |
39236c6e | 2005 | |
c3c9b80d A |
2006 | /*! |
2007 | * @function zone_meta_mark_used | |
2008 | * | |
2009 | * @brief | |
2010 | * Marks an element as used and returns whether it was marked as free. | |
f427ee49 | 2011 | */ |
c3c9b80d A |
2012 | static bool |
2013 | zone_meta_mark_used(struct zone_page_metadata *meta, zone_element_t ze) | |
f427ee49 | 2014 | { |
c3c9b80d | 2015 | vm_offset_t eidx = zone_element_idx(ze); |
39236c6e | 2016 | |
c3c9b80d A |
2017 | if (meta->zm_inline_bitmap) { |
2018 | uint32_t bit = zba_map_bit(uint32_t, eidx); | |
2019 | if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) { | |
2020 | meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit; | |
2021 | return true; | |
2022 | } | |
2023 | } else { | |
2024 | bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); | |
2025 | uint64_t bit = zba_map_bit(uint64_t, eidx); | |
2026 | if (bits[zba_map_index(uint64_t, eidx)] & bit) { | |
2027 | bits[zba_map_index(uint64_t, eidx)] ^= bit; | |
2028 | return true; | |
2029 | } | |
39236c6e | 2030 | } |
c3c9b80d | 2031 | return false; |
39236c6e A |
2032 | } |
2033 | ||
c3c9b80d A |
2034 | #endif /* !ZALLOC_TEST */ |
2035 | /*! @} */ | |
2036 | #pragma mark ZTAGS | |
2037 | #if !ZALLOC_TEST | |
2038 | #if VM_MAX_TAG_ZONES | |
39236c6e | 2039 | /* |
c3c9b80d A |
2040 | * Zone tagging allows for per "tag" accounting of allocations for the kalloc |
2041 | * zones only. | |
2042 | * | |
2043 | * There are 3 kinds of tags that can be used: | |
2044 | * - pre-registered VM_KERN_MEMORY_* | |
2045 | * - dynamic tags allocated per call sites in core-kernel (using vm_tag_alloc()) | |
2046 | * - per-kext tags computed by IOKit (using the magic VM_TAG_BT marker). | |
2047 | * | |
2048 | * The VM tracks the statistics in lazily allocated structures. | |
2049 | * See vm_tag_will_update_zone(), vm_tag_update_zone_size(). | |
2050 | * | |
2051 | * If for some reason the requested tag cannot be accounted for, | |
2052 | * the tag is forced to VM_KERN_MEMORY_KALLOC which is pre-allocated. | |
2053 | * | |
2054 | * Each allocated element also remembers the tag it was assigned, | |
2055 | * in its ztSlot() which lets zalloc/zfree update statistics correctly. | |
39236c6e | 2056 | */ |
39236c6e | 2057 | |
c3c9b80d | 2058 | // for zones with tagging enabled: |
fe8ab488 | 2059 | |
c3c9b80d A |
2060 | // calculate a pointer to the tag base entry, |
2061 | // holding either a uint32_t the first tag offset for a page in the zone map, | |
2062 | // or two uint16_t tags if the page can only hold one or two elements | |
f427ee49 | 2063 | |
c3c9b80d A |
2064 | #define ZTAGBASE(zone, element) \ |
2065 | (&((uint32_t *)zone_tagbase_min)[atop((element) - \ | |
2066 | zone_info.zi_map_range[ZONE_ADDR_NATIVE].min_address)]) | |
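/*
 * Editor's illustration (added): the "inline" layout described above relies
 * on two vm_tag_t values packing into a single 32-bit tagbase entry, which
 * is what ztSlot() assumes when it walks a vm_tag_t pointer over ZTAGBASE().
 */
static_assert(2 * sizeof(vm_tag_t) == sizeof(uint32_t),
    "two inline tags fit in one tagbase entry");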
39236c6e | 2067 | |
c3c9b80d A |
2068 | static vm_offset_t zone_tagbase_min; |
2069 | static vm_offset_t zone_tagbase_max; | |
2070 | static vm_offset_t zone_tagbase_map_size; | |
2071 | static vm_map_t zone_tagbase_map; | |
f427ee49 | 2072 | |
c3c9b80d A |
2073 | static vm_offset_t zone_tags_min; |
2074 | static vm_offset_t zone_tags_max; | |
2075 | static vm_offset_t zone_tags_map_size; | |
2076 | static vm_map_t zone_tags_map; | |
39236c6e | 2077 | |
c3c9b80d | 2078 | // simple heap allocator for allocating the tags for new memory |
39236c6e | 2079 | |
c3c9b80d | 2080 | static LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */ |
f427ee49 | 2081 | |
c3c9b80d A |
2082 | enum { | |
2083 | ztFreeIndexCount = 8, | |
2084 | ztFreeIndexMax = (ztFreeIndexCount - 1), | |
2085 | ztTagsPerBlock = 4 | |
2086 | }; | |
f427ee49 | 2087 | |
c3c9b80d A |
2088 | struct ztBlock { |
2089 | #if __LITTLE_ENDIAN__ | |
2090 | uint64_t free:1, | |
2091 | next:21, | |
2092 | prev:21, | |
2093 | size:21; | |
2094 | #else | |
2095 | // ztBlock needs free bit least significant | |
2096 | #error !__LITTLE_ENDIAN__ | |
f427ee49 | 2097 | #endif |
c3c9b80d A |
2098 | }; |
2099 | typedef struct ztBlock ztBlock; | |
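/*
 * Editor's check (added): the bit-fields above (1 + 21 + 21 + 21 bits) are
 * sized to pack a block descriptor into a single 64-bit word.
 */
static_assert(sizeof(ztBlock) == sizeof(uint64_t), "ztBlock packs into 64 bits");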
39236c6e | 2100 | |
c3c9b80d A |
2101 | static ztBlock * ztBlocks; |
2102 | static uint32_t ztBlocksCount; | |
2103 | static uint32_t ztBlocksFree; | |
f427ee49 | 2104 | |
c3c9b80d A |
2105 | static uint32_t |
2106 | ztLog2up(uint32_t size) | |
f427ee49 | 2107 | { |
c3c9b80d A |
2108 | if (1 == size) { |
2109 | size = 0; | |
2110 | } else { | |
2111 | size = 32 - __builtin_clz(size - 1); | |
316670eb | 2112 | } |
c3c9b80d | 2113 | return size; |
f427ee49 | 2114 | } |
39236c6e | 2115 | |
c3c9b80d A |
2116 | // pointer to the tag for an element |
2117 | static vm_tag_t * | |
2118 | ztSlot(zone_t zone, vm_offset_t element) | |
f427ee49 | 2119 | { |
c3c9b80d A |
2120 | vm_tag_t *result; |
2121 | if (zone->tags_inline) { | |
2122 | result = (vm_tag_t *)ZTAGBASE(zone, element); | |
2123 | if ((PAGE_MASK & element) >= zone_elem_size(zone)) { | |
2124 | result++; | |
f427ee49 A |
2125 | } |
2126 | } else { | |
c3c9b80d A |
2127 | result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE(zone, element)[0] + |
2128 | (element & PAGE_MASK) / zone_elem_size(zone)]; | |
0a7de745 | 2129 | } |
c3c9b80d | 2130 | return result; |
316670eb | 2131 | } |
1c79356b | 2132 | |
c3c9b80d A |
2133 | static uint32_t |
2134 | ztLog2down(uint32_t size) | |
f427ee49 | 2135 | { |
c3c9b80d A |
2136 | size = 31 - __builtin_clz(size); |
2137 | return size; | |
f427ee49 | 2138 | } |
0b4e3aa0 | 2139 | |
c3c9b80d A |
2140 | static void |
2141 | ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags) | |
2142 | { | |
2143 | vm_map_offset_t addr = (vm_map_offset_t) address; | |
2144 | vm_map_offset_t page, end; | |
2145 | ||
2146 | page = trunc_page(addr); | |
2147 | end = round_page(addr + size); | |
2148 | ||
2149 | for (; page < end; page += page_size) { | |
2150 | if (!pmap_find_phys(kernel_pmap, page)) { | |
2151 | kern_return_t __unused | |
2152 | ret = kernel_memory_populate(map, page, PAGE_SIZE, | |
2153 | KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG); | |
2154 | assert(ret == KERN_SUCCESS); | |
2155 | } | |
2156 | } | |
2157 | } | |
2158 | ||
2159 | static boolean_t | |
2160 | ztPresent(const void * address, size_t size) | |
2161 | { | |
2162 | vm_map_offset_t addr = (vm_map_offset_t) address; | |
2163 | vm_map_offset_t page, end; | |
2164 | boolean_t result; | |
2165 | ||
2166 | page = trunc_page(addr); | |
2167 | end = round_page(addr + size); | |
2168 | for (result = TRUE; (page < end); page += page_size) { | |
2169 | result = pmap_find_phys(kernel_pmap, page); | |
2170 | if (!result) { | |
2171 | break; | |
2172 | } | |
2173 | } | |
2174 | return result; | |
2175 | } | |
2176 | ||
2177 | ||
2178 | void __unused | |
2179 | ztDump(boolean_t sanity); | |
2180 | void __unused | |
2181 | ztDump(boolean_t sanity) | |
2182 | { | |
2183 | uint32_t q, cq, p; | |
2184 | ||
2185 | for (q = 0; q <= ztFreeIndexMax; q++) { | |
2186 | p = q; | |
2187 | do { | |
2188 | if (sanity) { | |
2189 | cq = ztLog2down(ztBlocks[p].size); | |
2190 | if (cq > ztFreeIndexMax) { | |
2191 | cq = ztFreeIndexMax; | |
2192 | } | |
2193 | if (!ztBlocks[p].free | |
2194 | || ((p != q) && (q != cq)) | |
2195 | || (ztBlocks[ztBlocks[p].next].prev != p) | |
2196 | || (ztBlocks[ztBlocks[p].prev].next != p)) { | |
2197 | kprintf("zterror at %d", p); | |
2198 | ztDump(FALSE); | |
2199 | kprintf("zterror at %d", p); | |
2200 | assert(FALSE); | |
2201 | } | |
2202 | continue; | |
2203 | } | |
2204 | kprintf("zt[%03d]%c %d, %d, %d\n", | |
2205 | p, ztBlocks[p].free ? 'F' : 'A', | |
2206 | ztBlocks[p].next, ztBlocks[p].prev, | |
2207 | ztBlocks[p].size); | |
2208 | p = ztBlocks[p].next; | |
2209 | if (p == q) { | |
2210 | break; | |
2211 | } | |
2212 | } while (p != q); | |
2213 | if (!sanity) { | |
2214 | printf("\n"); | |
2215 | } | |
2216 | } | |
2217 | if (!sanity) { | |
2218 | printf("-----------------------\n"); | |
2219 | } | |
2220 | } | |
2221 | ||
2222 | ||
2223 | ||
2224 | #define ZTBDEQ(idx) \ | |
2225 | ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \ | |
2226 | ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev; | |
2227 | ||
2228 | static void | |
2229 | ztFree(zone_t zone __unused, uint32_t index, uint32_t count) | |
2230 | { | |
2231 | uint32_t q, w, p, size, merge; | |
2232 | ||
2233 | assert(count); | |
2234 | ztBlocksFree += count; | |
2235 | ||
2236 | // merge with the following block (the one starting at index + count) | |
2237 | merge = (index + count); | |
2238 | if ((merge < ztBlocksCount) | |
2239 | && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge])) | |
2240 | && ztBlocks[merge].free) { | |
2241 | ZTBDEQ(merge); | |
2242 | count += ztBlocks[merge].size; | |
2243 | } | |
2244 | ||
2245 | // merge with the preceding block (the one ending at index - 1) | |
2246 | merge = (index - 1); | |
2247 | if ((merge > ztFreeIndexMax) | |
2248 | && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge])) | |
2249 | && ztBlocks[merge].free) { | |
2250 | size = ztBlocks[merge].size; | |
2251 | count += size; | |
2252 | index -= size; | |
2253 | ZTBDEQ(index); | |
2254 | } | |
2255 | ||
2256 | q = ztLog2down(count); | |
2257 | if (q > ztFreeIndexMax) { | |
2258 | q = ztFreeIndexMax; | |
2259 | } | |
2260 | w = q; | |
2261 | // queue in order of size | |
2262 | while (TRUE) { | |
2263 | p = ztBlocks[w].next; | |
2264 | if (p == q) { | |
2265 | break; | |
2266 | } | |
2267 | if (ztBlocks[p].size >= count) { | |
2268 | break; | |
2269 | } | |
2270 | w = p; | |
2271 | } | |
2272 | ztBlocks[p].prev = index; | |
2273 | ztBlocks[w].next = index; | |
2274 | ||
2275 | // fault in first | |
2276 | ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0); | |
2277 | ||
2278 | // mark first & last with free flag and size | |
2279 | ztBlocks[index].free = TRUE; | |
2280 | ztBlocks[index].size = count; | |
2281 | ztBlocks[index].prev = w; | |
2282 | ztBlocks[index].next = p; | |
2283 | if (count > 1) { | |
2284 | index += (count - 1); | |
2285 | // fault in last | |
2286 | ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0); | |
2287 | ztBlocks[index].free = TRUE; | |
2288 | ztBlocks[index].size = count; | |
2289 | } | |
2290 | } | |
2291 | ||
2292 | static uint32_t | |
2293 | ztAlloc(zone_t zone, uint32_t count) | |
2294 | { | |
2295 | uint32_t q, w, p, leftover; | |
2296 | ||
2297 | assert(count); | |
2298 | ||
2299 | q = ztLog2up(count); | |
2300 | if (q > ztFreeIndexMax) { | |
2301 | q = ztFreeIndexMax; | |
2302 | } | |
2303 | do { | |
2304 | w = q; | |
2305 | while (TRUE) { | |
2306 | p = ztBlocks[w].next; | |
2307 | if (p == q) { | |
2308 | break; | |
2309 | } | |
2310 | if (ztBlocks[p].size >= count) { | |
2311 | // dequeue, mark both ends allocated | |
2312 | ztBlocks[w].next = ztBlocks[p].next; | |
2313 | ztBlocks[ztBlocks[p].next].prev = w; | |
2314 | ztBlocks[p].free = FALSE; | |
2315 | ztBlocksFree -= ztBlocks[p].size; | |
2316 | if (ztBlocks[p].size > 1) { | |
2317 | ztBlocks[p + ztBlocks[p].size - 1].free = FALSE; | |
2318 | } | |
2319 | ||
2320 | // fault all the allocation | |
2321 | ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0); | |
2322 | // mark last as allocated | |
2323 | if (count > 1) { | |
2324 | ztBlocks[p + count - 1].free = FALSE; | |
2325 | } | |
2326 | // free remainder | |
2327 | leftover = ztBlocks[p].size - count; | |
2328 | if (leftover) { | |
2329 | ztFree(zone, p + ztBlocks[p].size - leftover, leftover); | |
2330 | } | |
2331 | ||
2332 | return p; | |
2333 | } | |
2334 | w = p; | |
2335 | } | |
2336 | q++; | |
2337 | } while (q <= ztFreeIndexMax); | |
2338 | ||
2339 | return -1U; | |
2340 | } | |
2341 | ||
2342 | __startup_func | |
2343 | static void | |
2344 | zone_tagging_init(vm_size_t max_zonemap_size) | |
2345 | { | |
2346 | kern_return_t ret; | |
2347 | vm_map_kernel_flags_t vmk_flags; | |
2348 | uint32_t idx; | |
2349 | ||
2350 | // allocate submaps VM_KERN_MEMORY_DIAG | |
2351 | ||
2352 | zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t); | |
2353 | vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; | |
2354 | vmk_flags.vmkf_permanent = TRUE; | |
2355 | ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size, | |
2356 | FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG, | |
2357 | &zone_tagbase_map); | |
2358 | ||
2359 | if (ret != KERN_SUCCESS) { | |
2360 | panic("zone_init: kmem_suballoc failed"); | |
2361 | } | |
2362 | zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size); | |
2363 | ||
2364 | zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t); | |
2365 | vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; | |
2366 | vmk_flags.vmkf_permanent = TRUE; | |
2367 | ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size, | |
2368 | FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG, | |
2369 | &zone_tags_map); | |
2370 | ||
2371 | if (ret != KERN_SUCCESS) { | |
2372 | panic("zone_init: kmem_suballoc failed"); | |
2373 | } | |
2374 | zone_tags_max = zone_tags_min + round_page(zone_tags_map_size); | |
2375 | ||
2376 | ztBlocks = (ztBlock *) zone_tags_min; | |
2377 | ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock)); | |
2378 | ||
2379 | // initialize the qheads | |
2380 | lck_mtx_lock(&ztLock); | |
2381 | ||
2382 | ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0); | |
2383 | for (idx = 0; idx < ztFreeIndexCount; idx++) { | |
2384 | ztBlocks[idx].free = TRUE; | |
2385 | ztBlocks[idx].next = idx; | |
2386 | ztBlocks[idx].prev = idx; | |
2387 | ztBlocks[idx].size = 0; | |
2388 | } | |
2389 | // free remaining space | |
2390 | ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount); | |
2391 | ||
2392 | lck_mtx_unlock(&ztLock); | |
2393 | } | |
2394 | ||
2395 | static void | |
2396 | ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size) | |
2397 | { | |
2398 | uint32_t * tagbase; | |
2399 | uint32_t count, block, blocks, idx; | |
2400 | size_t pages; | |
2401 | ||
2402 | pages = atop(size); | |
2403 | tagbase = ZTAGBASE(zone, mem); | |
2404 | ||
2405 | lck_mtx_lock(&ztLock); | |
2406 | ||
2407 | // fault tagbase | |
2408 | ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0); | |
2409 | ||
2410 | if (!zone->tags_inline) { | |
2411 | // allocate tags | |
2412 | count = (uint32_t)(size / zone_elem_size(zone)); | |
2413 | blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock); | |
2414 | block = ztAlloc(zone, blocks); | |
2415 | if (-1U == block) { | |
2416 | ztDump(false); | |
2417 | } | |
2418 | assert(-1U != block); | |
2419 | } | |
2420 | ||
2421 | lck_mtx_unlock(&ztLock); | |
2422 | ||
2423 | if (!zone->tags_inline) { | |
2424 | // set tag base for each page | |
2425 | block *= ztTagsPerBlock; | |
2426 | for (idx = 0; idx < pages; idx++) { | |
2427 | vm_offset_t esize = zone_elem_size(zone); | |
2428 | tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize); | |
2429 | } | |
2430 | } | |
2431 | } | |
2432 | ||
2433 | static void | |
2434 | ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size) | |
2435 | { | |
2436 | uint32_t * tagbase; | |
2437 | uint32_t count, block, blocks, idx; | |
2438 | size_t pages; | |
2439 | ||
2440 | // invalidate the tag base entry for each page | |
2441 | pages = atop(size); | |
2442 | tagbase = ZTAGBASE(zone, mem); | |
2443 | block = tagbase[0]; | |
2444 | for (idx = 0; idx < pages; idx++) { | |
2445 | tagbase[idx] = 0xFFFFFFFF; | |
2446 | } | |
2447 | ||
2448 | lck_mtx_lock(&ztLock); | |
2449 | if (!zone->tags_inline) { | |
2450 | count = (uint32_t)(size / zone_elem_size(zone)); | |
2451 | blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock); | |
2452 | assert(block != 0xFFFFFFFF); | |
2453 | block /= ztTagsPerBlock; | |
2454 | ztFree(NULL /* zone is unlocked */, block, blocks); | |
2455 | } | |
2456 | ||
2457 | lck_mtx_unlock(&ztLock); | |
2458 | } | |
2459 | ||
2460 | uint32_t | |
2461 | zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size) | |
2462 | { | |
2463 | simple_lock(&all_zones_lock, &zone_locks_grp); | |
2464 | ||
2465 | zone_index_foreach(idx) { | |
2466 | zone_t z = &zone_array[idx]; | |
2467 | if (!z->tags) { | |
2468 | continue; | |
2469 | } | |
2470 | if (tag_zone_index != z->tag_zone_index) { | |
2471 | continue; | |
2472 | } | |
2473 | ||
2474 | *elem_size = zone_elem_size(z); | |
2475 | simple_unlock(&all_zones_lock); | |
2476 | return idx; | |
2477 | } | |
2478 | ||
2479 | simple_unlock(&all_zones_lock); | |
2480 | ||
2481 | return -1U; | |
2482 | } | |
2483 | ||
2484 | #endif /* VM_MAX_TAG_ZONES */ | |
2485 | #endif /* !ZALLOC_TEST */ | |
2486 | #pragma mark zalloc helpers | |
2487 | #if !ZALLOC_TEST | |
2488 | ||
2489 | __pure2 | |
2490 | static inline uint16_t | |
2491 | zc_mag_size(void) | |
2492 | { | |
2493 | return zc_magazine_size; | |
2494 | } | |
2495 | ||
2496 | __attribute__((noinline, cold)) | |
2497 | static void | |
2498 | zone_lock_was_contended(zone_t zone, zone_cache_t zc) | |
2499 | { | |
2500 | lck_spin_lock_nopreempt(&zone->z_lock); | |
2501 | ||
2502 | /* | |
2503 | * If zone caching has been disabled due to memory pressure, | |
2504 | * then recording contention is not useful, give the system | |
2505 | * time to recover. | |
2506 | */ | |
2507 | if (__improbable(zone_caching_disabled)) { | |
2508 | return; | |
2509 | } | |
2510 | ||
2511 | zone->z_contention_cur++; | |
2512 | ||
2513 | if (zc == NULL || zc->zc_depot_max >= INT16_MAX * zc_mag_size()) { | |
2514 | return; | |
2515 | } | |
2516 | ||
2517 | /* | |
2518 | * Let the depot grow based on how bad the contention is, | |
2519 | * and how populated the zone is. | |
2520 | */ | |
2521 | if (zone->z_contention_wma < 2 * Z_CONTENTION_WMA_UNIT) { | |
2522 | if (zc->zc_depot_max * zpercpu_count() * 20u >= | |
2523 | zone->z_elems_avail) { | |
2524 | return; | |
2525 | } | |
2526 | } | |
2527 | if (zone->z_contention_wma < 4 * Z_CONTENTION_WMA_UNIT) { | |
2528 | if (zc->zc_depot_max * zpercpu_count() * 10u >= | |
2529 | zone->z_elems_avail) { | |
2530 | return; | |
2531 | } | |
2532 | } | |
2533 | if (!zc_grow_threshold || zone->z_contention_wma < | |
2534 | zc_grow_threshold * Z_CONTENTION_WMA_UNIT) { | |
2535 | return; | |
2536 | } | |
2537 | ||
2538 | zc->zc_depot_max++; | |
2539 | } | |
2540 | ||
2541 | static inline void | |
2542 | zone_lock_nopreempt_check_contention(zone_t zone, zone_cache_t zc) | |
2543 | { | |
2544 | if (lck_spin_try_lock_nopreempt(&zone->z_lock)) { | |
2545 | return; | |
2546 | } | |
2547 | ||
2548 | zone_lock_was_contended(zone, zc); | |
2549 | } | |
2550 | ||
2551 | static inline void | |
2552 | zone_lock_check_contention(zone_t zone, zone_cache_t zc) | |
2553 | { | |
2554 | disable_preemption(); | |
2555 | zone_lock_nopreempt_check_contention(zone, zc); | |
2556 | } | |
2557 | ||
2558 | static inline void | |
2559 | zone_unlock_nopreempt(zone_t zone) | |
2560 | { | |
2561 | lck_spin_unlock_nopreempt(&zone->z_lock); | |
2562 | } | |
2563 | ||
2564 | static inline void | |
2565 | zone_depot_lock_nopreempt(zone_cache_t zc) | |
2566 | { | |
2567 | hw_lock_bit_nopreempt(&zc->zc_depot_lock, 0, &zone_locks_grp); | |
2568 | } | |
2569 | ||
2570 | static inline void | |
2571 | zone_depot_unlock_nopreempt(zone_cache_t zc) | |
2572 | { | |
2573 | hw_unlock_bit_nopreempt(&zc->zc_depot_lock, 0); | |
2574 | } | |
2575 | ||
2576 | static inline void | |
2577 | zone_depot_lock(zone_cache_t zc) | |
2578 | { | |
2579 | hw_lock_bit(&zc->zc_depot_lock, 0, &zone_locks_grp); | |
2580 | } | |
2581 | ||
2582 | static inline void | |
2583 | zone_depot_unlock(zone_cache_t zc) | |
2584 | { | |
2585 | hw_unlock_bit(&zc->zc_depot_lock, 0); | |
2586 | } | |
2587 | ||
2588 | const char * | |
2589 | zone_name(zone_t z) | |
2590 | { | |
2591 | return z->z_name; | |
2592 | } | |
2593 | ||
2594 | const char * | |
2595 | zone_heap_name(zone_t z) | |
2596 | { | |
2597 | if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) { | |
2598 | return kalloc_heap_names[z->kalloc_heap]; | |
2599 | } | |
2600 | return "invalid"; | |
2601 | } | |
2602 | ||
2603 | static uint32_t | |
2604 | zone_alloc_pages_for_nelems(zone_t z, vm_size_t max_elems) | |
2605 | { | |
2606 | vm_size_t elem_count, chunks; | |
2607 | ||
2608 | elem_count = ptoa(z->z_percpu ? 1 : z->z_chunk_pages) / zone_elem_size(z); | |
2609 | chunks = (max_elems + elem_count - 1) / elem_count; | |
2610 | ||
2611 | return (uint32_t)MIN(UINT32_MAX, chunks * z->z_chunk_pages); | |
2612 | } | |
2613 | ||
2614 | static inline vm_size_t | |
2615 | zone_submaps_approx_size(void) | |
2616 | { | |
2617 | vm_size_t size = 0; | |
2618 | ||
2619 | for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) { | |
2620 | size += zone_submaps[idx]->size; | |
2621 | } | |
2622 | ||
2623 | return size; | |
2624 | } | |
2625 | ||
2626 | static void | |
2627 | zone_cache_swap_magazines(zone_cache_t cache) | |
2628 | { | |
2629 | uint16_t count_a = cache->zc_alloc_cur; | |
2630 | uint16_t count_f = cache->zc_free_cur; | |
2631 | zone_element_t *elems_a = cache->zc_alloc_elems; | |
2632 | zone_element_t *elems_f = cache->zc_free_elems; | |
2633 | ||
2634 | z_debug_assert(count_a <= zc_mag_size()); | |
2635 | z_debug_assert(count_f <= zc_mag_size()); | |
2636 | ||
2637 | cache->zc_alloc_cur = count_f; | |
2638 | cache->zc_free_cur = count_a; | |
2639 | cache->zc_alloc_elems = elems_f; | |
2640 | cache->zc_free_elems = elems_a; | |
2641 | } | |
2642 | ||
2643 | /*! | |
2644 | * @function zone_magazine_load | |
2645 | * | |
2646 | * @brief | |
2647 | * Cache the value of @c zm_cur in the per-CPU cache to avoid a dependent load | |
2648 | * on the allocation fastpath. | |
2649 | */ | |
2650 | static void | |
2651 | zone_magazine_load(uint16_t *count, zone_element_t **elems, zone_magazine_t mag) | |
2652 | { | |
2653 | z_debug_assert(mag->zm_cur <= zc_mag_size()); | |
2654 | *count = mag->zm_cur; | |
2655 | *elems = mag->zm_elems; | |
2656 | } | |
2657 | ||
2658 | /*! | |
2659 | * @function zone_magazine_replace | |
2660 | * | |
2661 | * @brief | |
2662 | * Unload a magazine and load a new one in its place. | |
2663 | */ | |
2664 | static zone_magazine_t | |
2665 | zone_magazine_replace(uint16_t *count, zone_element_t **elems, | |
2666 | zone_magazine_t mag) | |
2667 | { | |
2668 | zone_magazine_t old; | |
2669 | ||
2670 | old = (zone_magazine_t)((uintptr_t)*elems - | |
2671 | offsetof(struct zone_magazine, zm_elems)); | |
2672 | old->zm_cur = *count; | |
2673 | z_debug_assert(old->zm_cur <= zc_mag_size()); | |
2674 | zone_magazine_load(count, elems, mag); | |
2675 | ||
2676 | return old; | |
2677 | } | |
2678 | ||
2679 | static zone_magazine_t | |
2680 | zone_magazine_alloc(zalloc_flags_t flags) | |
2681 | { | |
2682 | return zalloc_ext(zc_magazine_zone, zc_magazine_zone->z_stats, | |
2683 | flags | Z_ZERO); | |
2684 | } | |
2685 | ||
2686 | static void | |
2687 | zone_magazine_free(zone_magazine_t mag) | |
2688 | { | |
2689 | zfree_ext(zc_magazine_zone, zc_magazine_zone->z_stats, mag); | |
2690 | } | |
2691 | ||
2692 | static void | |
2693 | zone_enable_caching(zone_t zone) | |
2694 | { | |
2695 | zone_cache_t caches; | |
2696 | ||
2697 | caches = zalloc_percpu_permanent_type(struct zone_cache); | |
2698 | zpercpu_foreach(zc, caches) { | |
2699 | zone_magazine_load(&zc->zc_alloc_cur, &zc->zc_alloc_elems, | |
2700 | zone_magazine_alloc(Z_WAITOK | Z_NOFAIL)); | |
2701 | zone_magazine_load(&zc->zc_free_cur, &zc->zc_free_elems, | |
2702 | zone_magazine_alloc(Z_WAITOK | Z_NOFAIL)); | |
2703 | STAILQ_INIT(&zc->zc_depot); | |
2704 | } | |
2705 | ||
2706 | if (os_atomic_xchg(&zone->z_pcpu_cache, caches, release)) { | |
2707 | panic("allocating caches for zone %s twice", zone->z_name); | |
2708 | } | |
2709 | } | |
2710 | ||
2711 | bool | |
2712 | zone_maps_owned(vm_address_t addr, vm_size_t size) | |
2713 | { | |
2714 | return from_zone_map(addr, size, ZONE_ADDR_NATIVE); | |
2715 | } | |
2716 | ||
2717 | void | |
2718 | zone_map_sizes( | |
2719 | vm_map_size_t *psize, | |
2720 | vm_map_size_t *pfree, | |
2721 | vm_map_size_t *plargest_free) | |
2722 | { | |
2723 | vm_map_size_t size, free, largest; | |
2724 | ||
2725 | vm_map_sizes(zone_submaps[0], psize, pfree, plargest_free); | |
2726 | ||
2727 | for (uint32_t i = 1; i <= zone_last_submap_idx; i++) { | |
2728 | vm_map_sizes(zone_submaps[i], &size, &free, &largest); | |
2729 | *psize += size; | |
2730 | *pfree += free; | |
2731 | *plargest_free = MAX(*plargest_free, largest); | |
2732 | } | |
2733 | } | |
2734 | ||
2735 | __attribute__((always_inline)) | |
2736 | vm_map_t | |
2737 | zone_submap(zone_t zone) | |
2738 | { | |
2739 | return zone_submaps[zone->z_submap_idx]; | |
2740 | } | |
2741 | ||
2742 | unsigned | |
2743 | zpercpu_count(void) | |
2744 | { | |
2745 | return zpercpu_early_count; | |
2746 | } | |
2747 | ||
2748 | int | |
2749 | track_this_zone(const char *zonename, const char *logname) | |
2750 | { | |
2751 | unsigned int len; | |
2752 | const char *zc = zonename; | |
2753 | const char *lc = logname; | |
2754 | ||
2755 | /* | |
2756 | * Compare the strings. We bound the compare by MAX_ZONE_NAME. | |
2757 | */ | |
2758 | ||
2759 | for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) { | |
2760 | /* | |
2761 | * If the current characters don't match, check for a space in | |
2762 | * the zone name and a corresponding period in the log name. | |
2763 | * If that's not there, then the strings don't match. | |
2764 | */ | |
2765 | ||
2766 | if (*zc != *lc && !(*zc == ' ' && *lc == '.')) { | |
2767 | break; | |
2768 | } | |
2769 | ||
2770 | /* | |
2771 | * The strings are equal so far. If we're at the end, then it's a match. | |
2772 | */ | |
2773 | ||
2774 | if (*zc == '\0') { | |
2775 | return TRUE; | |
2776 | } | |
2777 | } | |
2778 | ||
2779 | return FALSE; | |
2780 | } | |
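/*
 * Editor's example (hypothetical names, added for clarity): a '.' in the
 * log name stands for a ' ' in the zone name, so both of the calls below
 * would return TRUE:
 *
 *	track_this_zone("data.alloc.96", "data.alloc.96");
 *	track_this_zone("vm objects", "vm.objects");
 */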
2781 | ||
2782 | #if DEBUG || DEVELOPMENT | |
2783 | ||
2784 | vm_size_t | |
2785 | zone_element_info(void *addr, vm_tag_t * ptag) | |
2786 | { | |
2787 | vm_size_t size = 0; | |
2788 | vm_tag_t tag = VM_KERN_MEMORY_NONE; | |
2789 | struct zone *src_zone; | |
2790 | ||
2791 | if (from_zone_map(addr, sizeof(void *), ZONE_ADDR_NATIVE) || | |
2792 | from_zone_map(addr, sizeof(void *), ZONE_ADDR_FOREIGN)) { | |
2793 | src_zone = &zone_array[zone_index_from_ptr(addr)]; | |
2794 | #if VM_MAX_TAG_ZONES | |
2795 | if (__improbable(src_zone->tags)) { | |
2796 | tag = *ztSlot(src_zone, (vm_offset_t)addr) >> 1; | |
2797 | } | |
2798 | #endif /* VM_MAX_TAG_ZONES */ | |
2799 | size = zone_elem_size(src_zone); | |
2800 | } else { | |
2801 | #if CONFIG_GZALLOC | |
2802 | gzalloc_element_size(addr, NULL, &size); | |
2803 | #endif /* CONFIG_GZALLOC */ | |
2804 | } | |
2805 | *ptag = tag; | |
2806 | return size; | |
2807 | } | |
2808 | ||
2809 | #endif /* DEBUG || DEVELOPMENT */ | |
2810 | ||
2811 | /* The backup pointer is stored in the last pointer-sized location in an element. */ | |
2812 | __header_always_inline vm_offset_t * | |
2813 | get_primary_ptr(vm_offset_t elem) | |
2814 | { | |
2815 | return (vm_offset_t *)elem; | |
2816 | } | |
2817 | ||
2818 | __header_always_inline vm_offset_t * | |
2819 | get_backup_ptr(vm_offset_t elem, vm_size_t elem_size) | |
2820 | { | |
2821 | return (vm_offset_t *)(elem + elem_size - sizeof(vm_offset_t)); | |
2822 | } | |
2823 | ||
2824 | #endif /* !ZALLOC_TEST */ | |
2825 | #pragma mark Zone poisoning/zeroing and early random | |
2826 | #if !ZALLOC_TEST | |
2827 | ||
2828 | #define ZONE_ENTROPY_CNT 2 | |
2829 | static struct zone_bool_gen { | |
2830 | struct bool_gen zbg_bg; | |
2831 | uint32_t zbg_entropy[ZONE_ENTROPY_CNT]; | |
2832 | } zone_bool_gen[MAX_CPUS]; | |
2833 | ||
2834 | /* | |
2835 | * Initialize zone poisoning; | |
2836 | * called from zone_bootstrap before any allocations are made from zalloc. | |
2837 | */ | |
2838 | __startup_func | |
2839 | static void | |
2840 | zp_bootstrap(void) | |
2841 | { | |
2842 | char temp_buf[16]; | |
2843 | ||
2844 | /* | |
2845 | * Initialize canary random cookie. | |
2846 | * | |
2847 | * Make sure that (zp_canary ^ pointer) has non-zero low bits (01) | |
2848 | * different from ZONE_POISON (11). | |
2849 | * | |
2850 | * On LP64, make (zp_canary ^ pointer) have high bits equal to 0xC0FFEE... | |
2851 | */ | |
2852 | static_assert(ZONE_POISON % 4 == 3); | |
2853 | zp_canary = (uintptr_t)early_random(); | |
2854 | #if __LP64__ | |
2855 | zp_canary &= 0x000000fffffffffc; | |
2856 | zp_canary |= 0xc0ffee0000000001 ^ 0xffffff0000000000; | |
2857 | #else | |
2858 | zp_canary &= 0xfffffffc; | |
2859 | zp_canary |= 0x00000001; | |
2860 | #endif | |
2861 | ||
2862 | /* -zp: enable poisoning for every alloc and free */ | |
2863 | if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) { | |
2864 | zp_factor = 1; | |
2865 | } | |
2866 | ||
2867 | /* -no-zp: disable poisoning */ | |
2868 | if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) { | |
2869 | zp_factor = 0; | |
2870 | printf("Zone poisoning disabled\n"); | |
2871 | } | |
2872 | ||
2873 | zpercpu_foreach_cpu(cpu) { | |
2874 | random_bool_init(&zone_bool_gen[cpu].zbg_bg); | |
2875 | } | |
2876 | } | |
2877 | ||
2878 | static inline uint32_t | |
2879 | zone_poison_count_init(zone_t zone) | |
2880 | { | |
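	/*
	 * Editor's note (added): '+' binds tighter than '^' below, so this
	 * computes (zp_factor + (esize >> zp_scale)) ^ (time & 0x7); the low
	 * bits of mach_absolute_time() only jitter the resulting count.
	 */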
2881 | return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^ | |
2882 | (mach_absolute_time() & 0x7); | |
2883 | } | |
2884 | ||
2885 | /* | |
2886 | * Zero the element if the zone has the z_free_zeroes flag set; else poison | |
2887 | * the element if zs_poison_seqno hits 0. | |
2888 | */ | |
2889 | static zprot_mode_t | |
2890 | zfree_clear_or_poison(zone_t zone, vm_offset_t addr, vm_offset_t elem_size) | |
2891 | { | |
2892 | if (zone->z_free_zeroes) { | |
2893 | if (zone->z_percpu) { | |
2894 | zpercpu_foreach_cpu(i) { | |
2895 | bzero((void *)(addr + ptoa(i)), elem_size); | |
2896 | } | |
2897 | } else { | |
2898 | bzero((void *)addr, elem_size); | |
2899 | } | |
2900 | return ZPM_ZERO; | |
2901 | } | |
2902 | ||
2903 | zprot_mode_t poison = ZPM_AUTO; | |
2904 | #if ZALLOC_ENABLE_POISONING | |
2905 | if (__improbable(zp_factor == 1)) { | |
2906 | poison = ZPM_POISON; | |
2907 | } else if (__probable(zp_factor != 0)) { | |
2908 | uint32_t *seqnop = &zpercpu_get(zone->z_stats)->zs_poison_seqno; | |
2909 | uint32_t seqno = os_atomic_load(seqnop, relaxed); | |
2910 | if (seqno == 0) { | |
2911 | os_atomic_store(seqnop, zone_poison_count_init(zone), relaxed); | |
2912 | poison = ZPM_POISON; | |
2913 | } else { | |
2914 | os_atomic_store(seqnop, seqno - 1, relaxed); | |
2915 | } | |
2916 | } | |
2917 | if (poison == ZPM_POISON) { | |
2918 | /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */ | |
2919 | for (size_t i = 0; i < elem_size / sizeof(vm_offset_t); i++) { | |
2920 | ((vm_offset_t *)addr)[i] = ZONE_POISON; | |
2921 | } | |
2922 | } else { | |
2923 | /* | |
2924 | * Set a canary at the extremities. | |
2925 | * | |
2926 | * Zero the first zp_min_size bytes of elements that aren't being | |
2927 | * poisoned. | |
2928 | * | |
2929 | * Element size is larger than zp_min_size in this path; | |
2930 | * zones with smaller elements have z_free_zeroes set. | |
2931 | */ | |
2932 | *get_primary_ptr(addr) = zp_canary ^ (uintptr_t)addr; | |
2933 | bzero((void *)addr + sizeof(vm_offset_t), | |
2934 | zp_min_size - sizeof(vm_offset_t)); | |
2935 | *get_backup_ptr(addr, elem_size) = zp_canary ^ (uintptr_t)addr; | |
2936 | ||
2937 | poison = ZPM_CANARY; | |
2938 | } | |
2939 | #endif /* ZALLOC_ENABLE_POISONING */ | |
2940 | ||
2941 | return poison; | |
2942 | } | |
2943 | ||
2944 | #if ZALLOC_ENABLE_POISONING | |
2945 | ||
2946 | __abortlike | |
2947 | static void | |
2948 | zalloc_uaf_panic(zone_t z, uintptr_t elem, size_t size, zprot_mode_t zpm) | |
2949 | { | |
2950 | uint32_t esize = (uint32_t)zone_elem_size(z); | |
2951 | uint32_t first_offs = ~0u; | |
2952 | uintptr_t first_bits = 0, v; | |
2953 | char buf[1024]; | |
2954 | int pos = 0; | |
2955 | const char *how; | |
2956 | ||
2957 | #if __LP64__ | |
2958 | #define ZPF "0x%016lx" | |
2959 | #else | |
2960 | #define ZPF "0x%08lx" | |
2961 | #endif | |
2962 | ||
2963 | buf[0] = '\0'; | |
2964 | ||
2965 | if (zpm == ZPM_CANARY) { | |
2966 | how = "canaries"; | |
2967 | ||
2968 | v = *get_primary_ptr(elem); | |
2969 | if (v != (elem ^ zp_canary)) { | |
2970 | pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" | |
2971 | "%5d: got "ZPF", want "ZPF" (xor: "ZPF")", | |
2972 | 0, v, (elem ^ zp_canary), (v ^ elem ^ zp_canary)); | |
2973 | if (first_offs > 0) { | |
2974 | first_offs = 0; | |
2975 | first_bits = v; | |
2976 | } | |
2977 | } | |
2978 | ||
2979 | v = *get_backup_ptr(elem, esize); | |
2980 | if (v != (elem ^ zp_canary)) { | |
2981 | pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" | |
2982 | "%5d: got "ZPF", want "ZPF" (xor: "ZPF")", | |
2983 | esize - (int)sizeof(v), v, (elem ^ zp_canary), | |
2984 | (v ^ elem ^ zp_canary)); | |
2985 | if (first_offs > esize - sizeof(v)) { | |
2986 | first_offs = esize - sizeof(v); | |
2987 | first_bits = v; | |
2988 | } | |
2989 | } | |
2990 | ||
2991 | for (uint32_t o = sizeof(v); o < zp_min_size; o += sizeof(v)) { | |
2992 | if ((v = *(uintptr_t *)(elem + o)) == 0) { | |
2993 | continue; | |
2994 | } | |
2995 | pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" | |
2996 | "%5d: "ZPF, o, v); | |
2997 | if (first_offs > o) { | |
2998 | first_offs = o; | |
2999 | first_bits = v; | |
3000 | } | |
3001 | } | |
3002 | } else if (zpm == ZPM_ZERO) { | |
3003 | how = "zero"; | |
3004 | ||
3005 | for (uint32_t o = 0; o < size; o += sizeof(v)) { | |
3006 | if ((v = *(uintptr_t *)(elem + o)) == 0) { | |
3007 | continue; | |
3008 | } | |
3009 | pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" | |
3010 | "%5d: "ZPF, o, v); | |
3011 | if (first_offs > o) { | |
3012 | first_offs = o; | |
3013 | first_bits = v; | |
3014 | } | |
3015 | } | |
3016 | } else { | |
3017 | how = "poison"; | |
3018 | ||
3019 | for (uint32_t o = 0; o < size; o += sizeof(v)) { | |
3020 | if ((v = *(uintptr_t *)(elem + o)) == ZONE_POISON) { | |
3021 | continue; | |
3022 | } | |
3023 | pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" | |
3024 | "%5d: "ZPF" (xor: "ZPF")", | |
3025 | o, v, (v ^ ZONE_POISON)); | |
3026 | if (first_offs > o) { | |
3027 | first_offs = o; | |
3028 | first_bits = v; | |
3029 | } | |
3030 | } | |
3031 | } | |
3032 | ||
3033 | (panic)("[%s%s]: element modified after free " | |
3034 | "(off:%d, val:"ZPF", sz:%d, ptr:%p, prot:%s)%s", | |
3035 | zone_heap_name(z), zone_name(z), | |
3036 | first_offs, first_bits, esize, (void *)elem, how, buf); | |
3037 | ||
3038 | #undef ZPF | |
3039 | } | |
3040 | ||
3041 | static void | |
3042 | zalloc_validate_element_zero(zone_t zone, vm_offset_t elem, vm_size_t size) | |
3043 | { | |
3044 | if (memcmp_zero_ptr_aligned((void *)elem, size)) { | |
3045 | zalloc_uaf_panic(zone, elem, size, ZPM_ZERO); | |
3046 | } | |
3047 | if (!zone->z_percpu) { | |
3048 | return; | |
3049 | } | |
3050 | for (size_t i = zpercpu_count(); --i > 0;) { | |
3051 | elem += PAGE_SIZE; | |
3052 | if (memcmp_zero_ptr_aligned((void *)elem, size)) { | |
3053 | zalloc_uaf_panic(zone, elem, size, ZPM_ZERO); | |
3054 | } | |
3055 | } | |
3056 | } | |
3057 | ||
3058 | #if __arm64__ || __arm__ | |
3059 | typedef __attribute__((ext_vector_type(2))) vm_offset_t zpair_t; | |
3060 | #else | |
3061 | typedef struct { | |
3062 | vm_offset_t x; | |
3063 | vm_offset_t y; | |
3064 | } zpair_t; | |
3065 | #endif | |
3066 | ||
3067 | ||
3068 | __attribute__((noinline)) | |
3069 | static void | |
3070 | zalloc_validate_element_poison(zone_t zone, vm_offset_t elem, vm_size_t size) | |
3071 | { | |
3072 | vm_offset_t p = elem; | |
3073 | vm_offset_t end = elem + size; | |
3074 | ||
3075 | const zpair_t poison = { ZONE_POISON, ZONE_POISON }; | |
3076 | zpair_t a, b; | |
3077 | ||
3078 | a.x = *(const vm_offset_t *)p; | |
3079 | a.y = *(const vm_offset_t *)(end - sizeof(vm_offset_t)); | |
3080 | ||
3081 | a.x ^= poison.x; | |
3082 | a.y ^= poison.y; | |
3083 | ||
3084 | /* | |
3085 | * align p to the next double-wide boundary | |
3086 | * align end to the previous double-wide boundary | |
3087 | */ | |
3088 | p = (p + sizeof(zpair_t) - 1) & -sizeof(zpair_t); | |
3089 | end &= -sizeof(zpair_t); | |
3090 | ||
3091 | if ((end - p) % (2 * sizeof(zpair_t)) == 0) { | |
3092 | b.x = 0; | |
3093 | b.y = 0; | |
3094 | } else { | |
3095 | end -= sizeof(zpair_t); | |
3096 | b.x = ((zpair_t *)end)[0].x ^ poison.x; | |
3097 | b.y = ((zpair_t *)end)[0].y ^ poison.y; | |
3098 | } | |
3099 | ||
3100 | for (; p < end; p += 2 * sizeof(zpair_t)) { | |
3101 | a.x |= ((zpair_t *)p)[0].x ^ poison.x; | |
3102 | a.y |= ((zpair_t *)p)[0].y ^ poison.y; | |
3103 | b.x |= ((zpair_t *)p)[1].x ^ poison.x; | |
3104 | b.y |= ((zpair_t *)p)[1].y ^ poison.y; | |
3105 | } | |
3106 | ||
3107 | a.x |= b.x; | |
3108 | a.y |= b.y; | |
3109 | ||
3110 | if (a.x || a.y) { | |
3111 | zalloc_uaf_panic(zone, elem, size, ZPM_POISON); | |
3112 | } | |
3113 | } | |
3114 | ||
3115 | static void | |
3116 | zalloc_validate_element(zone_t zone, vm_offset_t elem, vm_size_t size, | |
3117 | zprot_mode_t zpm) | |
3118 | { | |
3119 | vm_offset_t *primary = get_primary_ptr(elem); | |
3120 | vm_offset_t *backup = get_backup_ptr(elem, size); | |
3121 | ||
3122 | #if CONFIG_GZALLOC | |
3123 | if (zone->gzalloc_tracked) { | |
3124 | return; | |
3125 | } | |
3126 | #endif /* CONFIG_GZALLOC */ | |
3127 | ||
3128 | if (zone->z_free_zeroes) { | |
3129 | return zalloc_validate_element_zero(zone, elem, size); | |
3130 | } | |
3131 | ||
3132 | switch (zpm) { | |
3133 | case ZPM_AUTO: | |
3134 | if (*backup == 0) { | |
3135 | size -= sizeof(vm_size_t); | |
3136 | return zalloc_validate_element_zero(zone, elem, size); | |
3137 | } | |
3138 | if (*backup == ZONE_POISON) { | |
3139 | size -= sizeof(vm_size_t); | |
3140 | return zalloc_validate_element_poison(zone, elem, size); | |
3141 | } | |
3142 | OS_FALLTHROUGH; | |
3143 | ||
3144 | case ZPM_CANARY: | |
3145 | if ((*primary ^ zp_canary) != elem || (*backup ^ zp_canary) != elem) { | |
3146 | zalloc_uaf_panic(zone, elem, size, ZPM_CANARY); | |
3147 | } | |
3148 | *primary = *backup = 0; | |
3149 | size = zp_min_size; | |
3150 | OS_FALLTHROUGH; | |
3151 | ||
3152 | case ZPM_ZERO: | |
3153 | return zalloc_validate_element_zero(zone, elem, size); | |
3154 | ||
3155 | case ZPM_POISON: | |
3156 | return zalloc_validate_element_poison(zone, elem, size); | |
3157 | } | |
3158 | } | |
3159 | ||
3160 | #endif /* ZALLOC_ENABLE_POISONING */ | |
3161 | #if ZALLOC_EARLY_GAPS | |
3162 | ||
3163 | __attribute__((noinline)) | |
3164 | static void | |
3165 | zone_early_gap_drop(int n) | |
3166 | { | |
3167 | while (n-- > 0) { | |
3168 | zone_t zone0 = &zone_array[0]; | |
3169 | struct zone_page_metadata *meta = NULL; | |
3170 | vm_offset_t addr; | |
3171 | uint16_t pages; | |
3172 | vm_map_t map; | |
3173 | ||
3174 | lck_mtx_lock(&zone_metadata_region_lck); | |
3175 | ||
3176 | if (!zone_pva_is_null(zone0->z_pageq_va)) { | |
3177 | meta = zone_meta_queue_pop_native(zone0, | |
3178 | &zone0->z_pageq_va, &addr); | |
3179 | map = zone_submaps[meta->zm_chunk_len]; | |
3180 | pages = meta->zm_alloc_size; | |
3181 | __builtin_bzero(meta, sizeof(struct zone_page_metadata)); | |
3182 | } | |
3183 | ||
3184 | lck_mtx_unlock(&zone_metadata_region_lck); | |
3185 | ||
3186 | if (!meta) { | |
3187 | break; | |
3188 | } | |
3189 | ||
3190 | kmem_free(map, addr, ptoa(pages)); | |
3191 | } | |
3192 | } | |
3193 | ||
3194 | static void | |
3195 | zone_early_gap_add(zone_t z, uint16_t pages) | |
3196 | { | |
3197 | struct zone_page_metadata *meta = NULL; | |
3198 | zone_t zone0 = &zone_array[0]; | |
3199 | kern_return_t kr; | |
3200 | vm_offset_t addr; | |
3201 | ||
3202 | kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO | KMA_VAONLY; | |
3203 | if (z->z_submap_idx == Z_SUBMAP_IDX_GENERAL && | |
3204 | z->kalloc_heap != KHEAP_ID_NONE) { | |
3205 | kmaflags |= KMA_KHEAP; | |
3206 | } | |
3207 | ||
3208 | kr = kernel_memory_allocate(zone_submap(z), &addr, ptoa(pages), 0, | |
3209 | kmaflags, VM_KERN_MEMORY_ZONE); | |
3210 | ||
3211 | if (kr != KERN_SUCCESS) { | |
3212 | panic("unable to allocate early gap (%d pages): %d", pages, kr); | |
3213 | } | |
3214 | ||
3215 | zone_meta_populate(addr, ptoa(pages)); | |
3216 | ||
3217 | meta = zone_meta_from_addr(addr); | |
3218 | meta->zm_alloc_size = pages; | |
3219 | meta->zm_chunk_len = z->z_submap_idx; | |
3220 | ||
3221 | lck_mtx_lock(&zone_metadata_region_lck); | |
3222 | zone_meta_queue_push(zone0, &zone0->z_pageq_va, meta); | |
3223 | lck_mtx_unlock(&zone_metadata_region_lck); | |
3224 | } | |
3225 | ||
3226 | /* | |
3227 | * Roughly until pd1 is made, introduce random gaps | |
3228 | * between allocated pages. | |
3229 | * | |
3230 | * This way the early boot allocations are not in a completely | |
3231 | * predictable order and relative position. | |
3232 | * | |
3233 | * Those gaps are returned to the maps afterwards. | |
3234 | * | |
3235 | * We abuse the zone 0 (which is unused) "va" pageq to remember | |
3236 | * those ranges. | |
3237 | */ | |
3238 | __attribute__((noinline)) | |
3239 | static void | |
3240 | zone_allocate_random_early_gap(zone_t z) | |
3241 | { | |
3242 | int16_t pages = early_random() % 16; | |
3243 | ||
3244 | /* | |
3245 | * 6% of the time: drop 2 gaps | |
3246 | * 25% of the time: drop 1 gap | |
3247 | * 37% of the time: do nothing | |
3248 | * 18% of the time: add 1 gap | |
3249 | * 12% of the time: add 2 gaps | |
3250 | */ | |
3251 | if (pages > 10) { | |
3252 | zone_early_gap_drop(pages == 15 ? 2 : 1); | |
3253 | } | |
3254 | if (pages < 5) { | |
3255 | /* values are 6, 8, 10, 12 or 14 */ | |
3256 | zone_early_gap_add(z, 6 + 2 * pages); | |
3257 | } | |
3258 | if (pages < 2) { | |
3259 | zone_early_gap_add(z, 6 + early_random() % 16); | |
3260 | } | |
3261 | } | |
3262 | ||
3263 | static inline void | |
3264 | zone_cleanup_early_gaps_if_needed(void) | |
3265 | { | |
3266 | if (__improbable(!zone_pva_is_null(zone_array[0].z_pageq_va))) { | |
3267 | zone_early_gap_drop(10); | |
3268 | } | |
3269 | } | |
3270 | ||
3271 | #endif /* ZALLOC_EARLY_GAPS */ | |
3272 | ||
3273 | static void | |
3274 | zone_early_scramble_rr(zone_t zone, zone_stats_t zstats) | |
3275 | { | |
3276 | int cpu = cpu_number(); | |
3277 | zone_stats_t zs = zpercpu_get_cpu(zstats, cpu); | |
3278 | uint32_t bits; | |
3279 | ||
3280 | bits = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg, | |
3281 | zone_bool_gen[cpu].zbg_entropy, ZONE_ENTROPY_CNT, 8); | |
3282 | ||
3283 | zs->zs_alloc_rr += bits; | |
3284 | zs->zs_alloc_rr %= zone->z_chunk_elems; | |
3285 | } | |
3286 | ||
3287 | #endif /* !ZALLOC_TEST */ | |
3288 | #pragma mark Zone Leak Detection | |
3289 | #if !ZALLOC_TEST | |
3290 | ||
3291 | /* | |
3292 | * Zone leak debugging code | |
3293 | * | |
3294 | * When enabled, this code keeps a log to track allocations to a particular zone that have not | |
3295 | * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated | |
3296 | * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is | |
3297 | * off by default. | |
3298 | * | |
3299 | * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone> | |
3300 | * is the name of the zone you wish to log. | |
3301 | * | |
3302 | * This code only tracks one zone, so you need to identify which one is leaking first. | |
3303 | * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone | |
3304 | * garbage collector. Note that the zone name printed in the panic message is not necessarily the one | |
3305 | * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This | |
3306 | * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The | |
3307 | * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs. | |
3308 | * See the help in the kgmacros for usage info. | |
3309 | * | |
3310 | * | |
3311 | * Zone corruption logging | |
3312 | * | |
3313 | * Logging can also be used to help identify the source of a zone corruption. First, identify the zone | |
3314 | * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction | |
3315 | * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the | |
3316 | * corruption is detected, examining the log will show you the stack traces of the callers who last allocated | |
3317 | * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been | |
3318 | * corrupted to examine its history. This should lead to the source of the corruption. | |
3319 | */ | |
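For illustration, with a hypothetical zone named "my leaky zone" (periods in the boot-arg stand in for spaces in the zone name, as noted further below), the two styles map to boot-args along these lines:

    leak logging:        zlog=my.leaky.zone zrecs=2000
    corruption logging:  zlog=my.leaky.zone zrecs=2000 -zc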
3320 | ||
3321 | /* Returns TRUE if we rolled over the counter at factor */ | |
3322 | __header_always_inline bool | |
3323 | sample_counter(volatile uint32_t *count_p, uint32_t factor) | |
3324 | { | |
3325 | uint32_t old_count, new_count = 0; | |
3326 | if (count_p != NULL) { | |
3327 | os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, { | |
3328 | new_count = old_count + 1; | |
3329 | if (new_count >= factor) { | |
3330 | new_count = 0; | |
3331 | } | |
3332 | }); | |
3333 | } | |
3334 | ||
3335 | return new_count == 0; | |
3336 | } | |
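A minimal usage sketch (the counter and factor below are hypothetical; zalloc_log_or_trace_leaks() further down uses the same pattern with zone->zleak_capture and zleak_sample_factor):

static uint32_t my_counter;     /* hypothetical per-subsystem counter */

static void
my_event(void)
{
	if (sample_counter(&my_counter, 1000)) {
		/* reached once every 1000 calls: do the expensive work here,
		 * e.g. capture a backtrace */
	}
}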
3337 | ||
3338 | #if ZONE_ENABLE_LOGGING | |
3339 | /* Log allocations and frees to help debug a zone element corruption */ | |
3340 | static TUNABLE(bool, corruption_debug_flag, "-zc", false); | |
3341 | ||
3342 | #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */ | |
3343 | ||
3344 | static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING; | |
3345 | static int num_zones_logged = 0; | |
3346 | ||
3347 | /* | |
3348 | * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to | |
3349 | * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this | |
3350 | * is the number of stacks suspected of leaking, we don't need many records. | |
3351 | */ | |
3352 | ||
3353 | #if defined(__LP64__) | |
3354 | #define ZRECORDS_MAX 2560 /* Max records allowed in the log */ | |
3355 | #else | |
3356 | #define ZRECORDS_MAX 1536 /* Max records allowed in the log */ | |
3357 | #endif | |
3358 | #define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */ | |
3359 | ||
3360 | static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT); | |
3361 | ||
3362 | static void | |
3363 | zone_enable_logging(zone_t z) | |
3364 | { | |
3365 | z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, | |
3366 | (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */); | |
3367 | ||
3368 | if (z->zlog_btlog) { | |
3369 | printf("zone: logging started for zone %s%s\n", | |
3370 | zone_heap_name(z), z->z_name); | |
3371 | } else { | |
3372 | printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n"); | |
3373 | z->zone_logging = false; | |
3374 | } | |
3375 | } | |
3376 | ||
3377 | /** | |
3378 | * @function zone_setup_logging | |
3379 | * | |
3380 | * @abstract | |
3381 | * Optionally sets up a zone for logging. | |
3382 | * | |
3383 | * @discussion | |
3384 | * We recognize two boot-args: | |
3385 | * | |
3386 | * zlog=<zone_to_log> | |
3387 | * zrecs=<num_records_in_log> | |
3388 | * | |
3389 | * The zlog arg is used to specify the zone name that should be logged, | |
3390 | * and zrecs is used to control the size of the log. | |
3391 | * | |
3392 | * If zrecs is not specified, a default value is used. | |
3393 | */ | |
3394 | static void | |
3395 | zone_setup_logging(zone_t z) | |
3396 | { | |
3397 | char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */ | |
3398 | char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */ | |
3399 | char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */ | |
3400 | ||
3401 | /* | |
3402 | * Don't allow more than ZRECORDS_MAX records even if the user asked for more. | |
3403 | * | |
3404 | * This prevents accidentally hogging too much kernel memory | |
3405 | * and making the system unusable. | |
3406 | */ | |
3407 | if (log_records > ZRECORDS_MAX) { | |
3408 | log_records = ZRECORDS_MAX; | |
3409 | } | |
3410 | ||
3411 | /* | |
3412 | * Append kalloc heap name to zone name (if zone is used by kalloc) | |
3413 | */ | |
3414 | snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name); | |
3415 | ||
3416 | /* zlog0 isn't allowed. */ | |
3417 | for (int i = 1; i <= max_num_zones_to_log; i++) { | |
3418 | snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i); | |
3419 | ||
3420 | if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) && | |
3421 | track_this_zone(zone_name, zlog_val)) { | |
3422 | z->zone_logging = true; | |
3423 | num_zones_logged++; | |
3424 | break; | |
3425 | } | |
3426 | } | |
3427 | ||
3428 | /* | |
3429 | * Backwards compat. with the old boot-arg used to specify single zone | |
3430 | * logging, i.e. "zlog". This needs to happen after the newer zlogN checks | |
3431 | * because the "zlog" prefix will match all the zlogN | |
3432 | * boot-args. | |
3433 | */ | |
3434 | if (!z->zone_logging && | |
3435 | PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) && | |
3436 | track_this_zone(zone_name, zlog_val)) { | |
3437 | z->zone_logging = true; | |
3438 | num_zones_logged++; | |
3439 | } | |
3440 | ||
3441 | ||
3442 | /* | |
3443 | * If we want to log a zone, see if we need to allocate buffer space for | |
3444 | * the log. | |
3445 | * | |
3446 | * Some vm related zones are zinit'ed before we can do a kmem_alloc, so | |
3447 | * we have to defer allocation in that case. | |
3448 | * | |
3449 | * zone_init() will finish the job. | |
3450 | * | |
3451 | * If we want to log one of the VM related zones that's set up early on, | |
3452 | * we will skip allocation of the log until zinit is called again later | |
3453 | * on some other zone. | |
3454 | */ | |
3455 | if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) { | |
3456 | zone_enable_logging(z); | |
3457 | } | |
3458 | } | |
3459 | ||
3460 | /* | |
3461 | * Each record in the log contains a pointer to the zone element it refers to, | |
3462 | * and a small array to hold the pc's from the stack trace. A | |
3463 | * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging, | |
3464 | * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees. | |
3465 | * If the log fills, old records are replaced as if it were a circular buffer. | |
3466 | */ | |
3467 | ||
3468 | ||
3469 | /* | |
3470 | * Decide if we want to log this zone by doing a string compare between a zone name and the name | |
3471 | * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not | |
3472 | * possible to include spaces in strings passed in via the boot-args, a period in the logname will | |
3473 | * match a space in the zone name. | |
3474 | */ | |
3475 | ||
3476 | /* | |
3477 | * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and | |
3478 | * the buffer for the records has been allocated. | |
3479 | */ | |
3480 | ||
3481 | #define DO_LOGGING(z) (z->zlog_btlog != NULL) | |
3482 | #else /* !ZONE_ENABLE_LOGGING */ | |
3483 | #define DO_LOGGING(z) 0 | |
3484 | #endif /* !ZONE_ENABLE_LOGGING */ | |
3485 | #if CONFIG_ZLEAKS | |
3486 | ||
3487 | /* | |
3488 | * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding | |
3489 | * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a | |
3490 | * backtrace. Every free, we examine the table and determine if the allocation was being tracked, | |
3491 | * and stop tracking it if it was being tracked. | |
3492 | * | |
3493 | * We track the allocations in the zallocations hash table, which stores the address that was returned from | |
3494 | * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which | |
3495 | * stores the backtrace associated with that allocation. This provides uniquing for the relatively large | |
3496 | * backtraces - we don't store them more than once. | |
3497 | * | |
3498 | * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up | |
3499 | * a large amount of virtual space. | |
3500 | */ | |
3501 | #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */ | |
3502 | #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */ | |
3503 | #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */ | |
3504 | #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */ | |
3505 | static uint32_t zleak_state = 0; /* State of collection, as above */ | |
3506 | static unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */ | |
3507 | ||
3508 | bool panic_include_ztrace = FALSE; /* Enable zleak logging on panic */ | |
3509 | vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */ | |
3510 | vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */ | |
3511 | ||
3512 | /* | |
3513 | * Counters for allocation statistics. | |
3514 | */ | |
3515 | ||
3516 | /* Times two active records want to occupy the same spot */ | |
3517 | static unsigned int z_alloc_collisions = 0; | |
3518 | static unsigned int z_trace_collisions = 0; | |
3519 | ||
3520 | /* Times a new record lands on a spot previously occupied by a freed allocation */ | |
3521 | static unsigned int z_alloc_overwrites = 0; | |
3522 | static unsigned int z_trace_overwrites = 0; | |
3523 | ||
3524 | /* Times a new alloc or trace is put into the hash table */ | |
3525 | static unsigned int z_alloc_recorded = 0; | |
3526 | static unsigned int z_trace_recorded = 0; | |
3527 | ||
3528 | /* Times zleak_log returned false due to not being able to acquire the lock */ | |
3529 | static unsigned int z_total_conflicts = 0; | |
3530 | ||
3531 | /* | |
3532 | * Structure for keeping track of an allocation | |
3533 | * An allocation bucket is in use if its element is not NULL | |
3534 | */ | |
3535 | struct zallocation { | |
3536 | uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */ | |
3537 | vm_size_t za_size; /* how much memory did this allocation take up? */ | |
3538 | uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */ | |
3539 | /* TODO: #if this out */ | |
3540 | uint32_t za_hit_count; /* for determining effectiveness of hash function */ | |
3541 | }; | |
3542 | ||
3543 | /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */ | |
3544 | static uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM; | |
3545 | static uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM; | |
3546 | ||
3547 | vm_size_t zleak_max_zonemap_size; | |
3548 | ||
3549 | /* Hashmaps of allocations and their corresponding traces */ | |
3550 | static struct zallocation* zallocations; | |
3551 | static struct ztrace* ztraces; | |
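Tying the two tables together: for an outstanding address, they encode the association sketched below (zleak_log() and zleak_free() further down perform exactly these lookups; struct ztrace is defined elsewhere in this subsystem):

/* Sketch only: addr is some tracked allocation. */
struct zallocation *za = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
struct ztrace      *zt = &ztraces[za->za_trace_index];
/* zt->zt_size accumulates the outstanding bytes charged to that backtrace,
 * which is how top_ztrace (the largest suspected leaker) gets picked. */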
3552 | ||
3553 | /* not static so that panic can see this, see kern/debug.c */ | |
3554 | struct ztrace* top_ztrace; | |
3555 | ||
3556 | /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */ | |
3557 | static LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock"); | |
3558 | static LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp); | |
3559 | ||
3560 | /* | |
3561 | * Initializes the zone leak monitor. Called from zone_init() | |
3562 | */ | |
3563 | __startup_func | |
3564 | static void | |
3565 | zleak_init(vm_size_t max_zonemap_size) | |
3566 | { | |
3567 | char scratch_buf[16]; | |
3568 | boolean_t zleak_enable_flag = FALSE; | |
3569 | ||
3570 | zleak_max_zonemap_size = max_zonemap_size; | |
3571 | zleak_global_tracking_threshold = max_zonemap_size / 2; | |
3572 | zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8; | |
3573 | ||
3574 | #if CONFIG_EMBEDDED | |
3575 | if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) { | |
3576 | zleak_enable_flag = TRUE; | |
3577 | printf("zone leak detection enabled\n"); | |
3578 | } else { | |
3579 | zleak_enable_flag = FALSE; | |
3580 | printf("zone leak detection disabled\n"); | |
3581 | } | |
3582 | #else /* CONFIG_EMBEDDED */ | |
3583 | /* -zleakoff (flag to disable zone leak monitor) */ | |
3584 | if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) { | |
3585 | zleak_enable_flag = FALSE; | |
3586 | printf("zone leak detection disabled\n"); | |
3587 | } else { | |
3588 | zleak_enable_flag = TRUE; | |
3589 | printf("zone leak detection enabled\n"); | |
3590 | } | |
3591 | #endif /* CONFIG_EMBEDDED */ | |
3592 | ||
3593 | /* zfactor=XXXX (override how often to sample the zone allocator) */ | |
3594 | if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) { | |
3595 | printf("Zone leak factor override: %u\n", zleak_sample_factor); | |
3596 | } | |
3597 | ||
3598 | /* zleak-allocs=XXXX (override number of buckets in zallocations) */ | |
3599 | if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) { | |
3600 | printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets); | |
3601 | /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ | |
3602 | if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) { | |
3603 | printf("Override isn't a power of two, bad things might happen!\n"); | |
3604 | } | |
3605 | } | |
3606 | ||
3607 | /* zleak-traces=XXXX (override number of buckets in ztraces) */ | |
3608 | if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) { | |
3609 | printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets); | |
3610 | /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ | |
3611 | if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) { | |
3612 | printf("Override isn't a power of two, bad things might happen!\n"); | |
3613 | } | |
3614 | } | |
3615 | ||
3616 | if (zleak_enable_flag) { | |
3617 | zleak_state = ZLEAK_STATE_ENABLED; | |
3618 | } | |
3619 | } | |
3620 | ||
3621 | /* | |
3622 | * Support for kern.zleak.active sysctl - a simplified | |
3623 | * version of the zleak_state variable. | |
3624 | */ | |
3625 | int | |
3626 | get_zleak_state(void) | |
3627 | { | |
3628 | if (zleak_state & ZLEAK_STATE_FAILED) { | |
3629 | return -1; | |
3630 | } | |
3631 | if (zleak_state & ZLEAK_STATE_ACTIVE) { | |
3632 | return 1; | |
3633 | } | |
3634 | return 0; | |
3635 | } | |
3636 | ||
3637 | kern_return_t | |
3638 | zleak_activate(void) | |
3639 | { | |
3640 | kern_return_t retval; | |
3641 | vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation); | |
3642 | vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace); | |
3643 | void *allocations_ptr = NULL; | |
3644 | void *traces_ptr = NULL; | |
3645 | ||
3646 | /* Only one thread attempts to activate at a time */ | |
3647 | if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { | |
3648 | return KERN_SUCCESS; | |
3649 | } | |
3650 | ||
3651 | /* Indicate that we're doing the setup */ | |
3652 | lck_spin_lock(&zleak_lock); | |
3653 | if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { | |
3654 | lck_spin_unlock(&zleak_lock); | |
3655 | return KERN_SUCCESS; | |
3656 | } | |
3657 | ||
3658 | zleak_state |= ZLEAK_STATE_ACTIVATING; | |
3659 | lck_spin_unlock(&zleak_lock); | |
3660 | ||
3661 | /* Allocate and zero tables */ | |
3662 | retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_DIAG); | |
3663 | if (retval != KERN_SUCCESS) { | |
3664 | goto fail; | |
3665 | } | |
3666 | ||
3667 | retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_DIAG); | |
3668 | if (retval != KERN_SUCCESS) { | |
3669 | goto fail; | |
3670 | } | |
3671 | ||
3672 | bzero(allocations_ptr, z_alloc_size); | |
3673 | bzero(traces_ptr, z_trace_size); | |
3674 | ||
3675 | /* Everything's set. Install tables, mark active. */ | |
3676 | zallocations = allocations_ptr; | |
3677 | ztraces = traces_ptr; | |
3678 | ||
3679 | /* | |
3680 | * Initialize the top_ztrace to the first entry in ztraces, | |
3681 | * so we don't have to check for null in zleak_log | |
3682 | */ | |
3683 | top_ztrace = &ztraces[0]; | |
3684 | ||
3685 | /* | |
3686 | * Note that we do need a barrier between installing | |
3687 | * the tables and setting the active flag, because the zfree() | |
3688 | * path accesses the table without a lock if we're active. | |
3689 | */ | |
3690 | lck_spin_lock(&zleak_lock); | |
3691 | zleak_state |= ZLEAK_STATE_ACTIVE; | |
3692 | zleak_state &= ~ZLEAK_STATE_ACTIVATING; | |
3693 | lck_spin_unlock(&zleak_lock); | |
3694 | ||
3695 | return 0; | |
3696 | ||
3697 | fail: | |
3698 | /* | |
3699 | * If we fail to allocate memory, don't further tax | |
3700 | * the system by trying again. | |
3701 | */ | |
3702 | lck_spin_lock(&zleak_lock); | |
3703 | zleak_state |= ZLEAK_STATE_FAILED; | |
3704 | zleak_state &= ~ZLEAK_STATE_ACTIVATING; | |
3705 | lck_spin_unlock(&zleak_lock); | |
3706 | ||
3707 | if (allocations_ptr != NULL) { | |
3708 | kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size); | |
3709 | } | |
3710 | ||
3711 | if (traces_ptr != NULL) { | |
3712 | kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size); | |
3713 | } | |
3714 | ||
3715 | return retval; | |
3716 | } | |
3717 | ||
3718 | static inline void | |
3719 | zleak_activate_if_needed(void) | |
3720 | { | |
3721 | if (__probable((zleak_state & ZLEAK_STATE_ENABLED) == 0)) { | |
3722 | return; | |
3723 | } | |
3724 | if (zleak_state & ZLEAK_STATE_ACTIVE) { | |
3725 | return; | |
3726 | } | |
3727 | if (zone_submaps_approx_size() < zleak_global_tracking_threshold) { | |
3728 | return; | |
3729 | } | |
3730 | ||
3731 | kern_return_t kr = zleak_activate(); | |
3732 | if (kr != KERN_SUCCESS) { | |
3733 | printf("Failed to activate live zone leak debugging (%d).\n", kr); | |
3734 | } | |
3735 | } | |
3736 | ||
3737 | static inline void | |
3738 | zleak_track_if_needed(zone_t z) | |
3739 | { | |
3740 | if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) { | |
3741 | if (!z->zleak_on && | |
3742 | zone_size_wired(z) >= zleak_per_zone_tracking_threshold) { | |
3743 | z->zleak_on = true; | |
3744 | } | |
3745 | } | |
3746 | } | |
3747 | ||
3748 | /* | |
3749 | * TODO: What about allocations that never get deallocated, | |
3750 | * especially ones with unique backtraces? Should we wait to record | |
3751 | * until after boot has completed? | |
3752 | * (How many persistent zallocs are there?) | |
3753 | */ | |
3754 | ||
3e170ce0 | 3755 | /* |
c3c9b80d A |
3756 | * This function records the allocation in the allocations table, |
3757 | * and stores the associated backtrace in the traces table | |
3758 | * (or just increments the refcount if the trace is already recorded) | |
3759 | * If the allocation slot is in use, the old allocation is replaced with the new allocation, and | |
3760 | * the associated trace's refcount is decremented. | |
3761 | * If the trace slot is in use, it returns. | |
3762 | * The refcount is incremented by the amount of memory the allocation consumes. | |
3763 | * The return value indicates whether to try again next time. | |
3e170ce0 | 3764 | */ |
c3c9b80d A |
3765 | static boolean_t |
3766 | zleak_log(uintptr_t* bt, | |
3767 | uintptr_t addr, | |
3768 | uint32_t depth, | |
3769 | vm_size_t allocation_size) | |
3770 | { | |
3771 | /* Quit if there's someone else modifying the hash tables */ | |
3772 | if (!lck_spin_try_lock(&zleak_lock)) { | |
3773 | z_total_conflicts++; | |
3774 | return FALSE; | |
3775 | } | |
3776 | ||
3777 | struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; | |
3778 | ||
3779 | uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets); | |
3780 | struct ztrace* trace = &ztraces[trace_index]; | |
3781 | ||
3782 | allocation->za_hit_count++; | |
3783 | trace->zt_hit_count++; | |
3784 | ||
3785 | /* | |
3786 | * If the allocation bucket we want to be in is occupied, and if the occupier | |
3787 | * has the same trace as us, just bail. | |
3788 | */ | |
3789 | if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) { | |
3790 | z_alloc_collisions++; | |
3791 | ||
3792 | lck_spin_unlock(&zleak_lock); | |
3793 | return TRUE; | |
3794 | } | |
3795 | ||
3796 | /* STEP 1: Store the backtrace in the traces array. */ | |
3797 | /* A size of zero indicates that the trace bucket is free. */ | |
3798 | ||
3799 | if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) { | |
3800 | /* | |
3801 | * Different unique trace with same hash! | |
3802 | * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated | |
3803 | * and get out of the way for later chances | |
3804 | */ | |
3805 | trace->zt_collisions++; | |
3806 | z_trace_collisions++; | |
3807 | ||
3808 | lck_spin_unlock(&zleak_lock); | |
3809 | return TRUE; | |
3810 | } else if (trace->zt_size > 0) { | |
3811 | /* Same trace, already added, so increment refcount */ | |
3812 | trace->zt_size += allocation_size; | |
3813 | } else { | |
3814 | /* Found an unused trace bucket, record the trace here! */ | |
3815 | if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */ | |
3816 | z_trace_overwrites++; | |
3817 | } | |
3818 | ||
3819 | z_trace_recorded++; | |
3820 | trace->zt_size = allocation_size; | |
3821 | memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t))); | |
3822 | ||
3823 | trace->zt_depth = depth; | |
3824 | trace->zt_collisions = 0; | |
3825 | } | |
3826 | ||
3827 | /* STEP 2: Store the allocation record in the allocations array. */ | |
3828 | ||
3829 | if (allocation->za_element != (uintptr_t) 0) { | |
3830 | /* | |
3831 | * Straight up replace any allocation record that was there. We don't want to do the work | |
3832 | * to preserve the allocation entries that were there, because we only record a subset of the | |
3833 | * allocations anyways. | |
3834 | */ | |
3835 | ||
3836 | z_alloc_collisions++; | |
3837 | ||
3838 | struct ztrace* associated_trace = &ztraces[allocation->za_trace_index]; | |
3839 | /* Knock off old allocation's size, not the new allocation */ | |
3840 | associated_trace->zt_size -= allocation->za_size; | |
3841 | } else if (allocation->za_trace_index != 0) { | |
3842 | /* Slot previously used but not currently in use */ | |
3843 | z_alloc_overwrites++; | |
3844 | } | |
3845 | ||
3846 | allocation->za_element = addr; | |
3847 | allocation->za_trace_index = trace_index; | |
3848 | allocation->za_size = allocation_size; | |
3849 | ||
3850 | z_alloc_recorded++; | |
3851 | ||
3852 | if (top_ztrace->zt_size < trace->zt_size) { | |
3853 | top_ztrace = trace; | |
3854 | } | |
3855 | ||
3856 | lck_spin_unlock(&zleak_lock); | |
3857 | return TRUE; | |
3858 | } | |
3859 | ||
3860 | /* | |
3861 | * Free the allocation record and release the stacktrace. | |
3862 | * This should be as fast as possible because it will be called for every free. | |
3863 | */ | |
3864 | __attribute__((noinline)) | |
3865 | static void | |
3866 | zleak_free(uintptr_t addr, | |
3867 | vm_size_t allocation_size) | |
f427ee49 | 3868 | { |
c3c9b80d A |
3869 | if (addr == (uintptr_t) 0) { |
3870 | return; | |
3871 | } | |
3872 | ||
3873 | struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; | |
3874 | ||
3875 | /* Double-checked locking: check to find out if we're interested, lock, check to make | |
3876 | * sure it hasn't changed, then modify it, and release the lock. | |
3877 | */ | |
3878 | ||
3879 | if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { | |
3880 | /* if the allocation was the one, grab the lock, check again, then delete it */ | |
3881 | lck_spin_lock(&zleak_lock); | |
3882 | ||
3883 | if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { | |
3884 | struct ztrace *trace; | |
3885 | ||
3886 | /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */ | |
3887 | if (allocation->za_size != allocation_size) { | |
3888 | panic("Freeing as size %lu memory that was allocated with size %lu\n", | |
3889 | (uintptr_t)allocation_size, (uintptr_t)allocation->za_size); | |
3890 | } | |
3891 | ||
3892 | trace = &ztraces[allocation->za_trace_index]; | |
3893 | ||
3894 | /* size of 0 indicates trace bucket is unused */ | |
3895 | if (trace->zt_size > 0) { | |
3896 | trace->zt_size -= allocation_size; | |
3897 | } | |
1c79356b | 3898 | |
c3c9b80d A |
3899 | /* A NULL element means the allocation bucket is unused */ |
3900 | allocation->za_element = 0; | |
3901 | } | |
3902 | lck_spin_unlock(&zleak_lock); | |
f427ee49 | 3903 | } |
c3c9b80d | 3904 | } |
2d21ac55 | 3905 | |
c3c9b80d A |
3906 | #else |
3907 | static inline void | |
3908 | zleak_activate_if_needed(void) | |
3909 | { | |
f427ee49 | 3910 | } |
1c79356b | 3911 | |
c3c9b80d A |
3912 | static inline void |
3913 | zleak_track_if_needed(__unused zone_t z) | |
f427ee49 | 3914 | { |
f427ee49 | 3915 | } |
c3c9b80d A |
3916 | #endif /* CONFIG_ZLEAKS */ |
3917 | #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS | |
1c79356b | 3918 | |
c3c9b80d A |
3919 | __attribute__((noinline)) |
3920 | static void | |
3921 | zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr, void *fp) | |
f427ee49 | 3922 | { |
c3c9b80d A |
3923 | uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ |
3924 | unsigned int numsaved = 0; | |
3925 | ||
3926 | #if ZONE_ENABLE_LOGGING | |
3927 | if (DO_LOGGING(zone)) { | |
3928 | numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL); | |
3929 | btlog_add_entry(zone->zlog_btlog, (void *)addr, | |
3930 | ZOP_ALLOC, (void **)zbt, numsaved); | |
3931 | } | |
3932 | #endif /* ZONE_ENABLE_LOGGING */ | |
3933 | ||
3934 | #if CONFIG_ZLEAKS | |
3935 | /* | |
3936 | * Zone leak detection: capture a backtrace every zleak_sample_factor | |
3937 | * allocations in this zone. | |
3938 | */ | |
3939 | if (__improbable(zone->zleak_on)) { | |
3940 | if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) { | |
3941 | /* Avoid backtracing twice if zone logging is on */ | |
3942 | if (numsaved == 0) { | |
3943 | numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL); | |
3944 | } | |
3945 | /* Sampling can fail if another sample is happening at the same time in a different zone. */ | |
3946 | if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) { | |
3947 | /* If it failed, roll back the counter so we sample the next allocation instead. */ | |
3948 | zone->zleak_capture = zleak_sample_factor; | |
f427ee49 A |
3949 | } |
3950 | } | |
c3c9b80d | 3951 | } |
f427ee49 | 3952 | |
c3c9b80d A |
3953 | if (__improbable(zone_leaks_scan_enable && |
3954 | !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) { | |
3955 | unsigned int count, idx; | |
3956 | /* Fill element, from tail, with backtrace in reverse order */ | |
3957 | if (numsaved == 0) { | |
3958 | numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL); | |
f427ee49 | 3959 | } |
c3c9b80d A |
3960 | count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t)); |
3961 | if (count >= numsaved) { | |
3962 | count = numsaved - 1; | |
3963 | } | |
3964 | for (idx = 0; idx < count; idx++) { | |
3965 | ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1]; | |
f427ee49 A |
3966 | } |
3967 | } | |
c3c9b80d | 3968 | #endif /* CONFIG_ZLEAKS */ |
f427ee49 | 3969 | } |
f427ee49 | 3970 | |
c3c9b80d A |
3971 | static inline bool |
3972 | zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size) | |
3973 | { | |
3974 | #if ZONE_ENABLE_LOGGING | |
3975 | if (DO_LOGGING(zone)) { | |
3976 | return true; | |
3977 | } | |
3978 | #endif /* ZONE_ENABLE_LOGGING */ | |
3979 | #if CONFIG_ZLEAKS | |
3980 | /* | |
3981 | * Zone leak detection: capture a backtrace every zleak_sample_factor | |
3982 | * allocations in this zone. | |
3983 | */ | |
3984 | if (zone->zleak_on) { | |
3985 | return true; | |
3986 | } | |
3987 | if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) { | |
3988 | return true; | |
3989 | } | |
3990 | #endif /* CONFIG_ZLEAKS */ | |
3991 | return false; | |
3992 | } | |
39236c6e | 3993 | |
c3c9b80d A |
3994 | #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */ |
3995 | #if ZONE_ENABLE_LOGGING | |
c910b4d9 | 3996 | |
c3c9b80d A |
3997 | __attribute__((noinline)) |
3998 | static void | |
3999 | zfree_log_trace(zone_t zone, vm_offset_t addr, void *fp) | |
f427ee49 | 4000 | { |
c3c9b80d A |
4001 | /* |
4002 | * See if we're doing logging on this zone. | |
4003 | * | |
4004 | * There are two styles of logging used depending on | |
4005 | * whether we're trying to catch a leak or corruption. | |
4006 | */ | |
4007 | if (__improbable(DO_LOGGING(zone))) { | |
4008 | if (corruption_debug_flag) { | |
4009 | uintptr_t zbt[MAX_ZTRACE_DEPTH]; | |
4010 | unsigned int numsaved; | |
4011 | /* | |
4012 | * We're logging to catch a corruption. | |
4013 | * | |
4014 | * Add a record of this zfree operation to log. | |
4015 | */ | |
4016 | numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL); | |
4017 | btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, | |
4018 | (void **)zbt, numsaved); | |
4019 | } else { | |
4020 | /* | |
4021 | * We're logging to catch a leak. | |
4022 | * | |
4023 | * Remove any record we might have for this element | |
4024 | * since it's being freed. Note that we may not find it | |
4025 | * if the buffer overflowed and that's OK. | |
4026 | * | |
4027 | * Since the log is of a limited size, old records get | |
4028 | * overwritten if there are more zallocs than zfrees. | |
4029 | */ | |
4030 | btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr); | |
4031 | } | |
f427ee49 | 4032 | } |
c3c9b80d | 4033 | } |
39037602 | 4034 | |
c3c9b80d A |
4035 | #endif /* ZONE_ENABLE_LOGGING */ |
4036 | ||
4037 | /* These functions live outside of CONFIG_ZLEAKS because they are also used in | |
4038 | * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix. | |
4039 | */ | |
4040 | ||
4041 | /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */ | |
4042 | uintptr_t | |
4043 | hash_mix(uintptr_t x) | |
4044 | { | |
4045 | #ifndef __LP64__ | |
4046 | x += ~(x << 15); | |
4047 | x ^= (x >> 10); | |
4048 | x += (x << 3); | |
4049 | x ^= (x >> 6); | |
4050 | x += ~(x << 11); | |
4051 | x ^= (x >> 16); | |
4052 | #else | |
4053 | x += ~(x << 32); | |
4054 | x ^= (x >> 22); | |
4055 | x += ~(x << 13); | |
4056 | x ^= (x >> 8); | |
4057 | x += (x << 3); | |
4058 | x ^= (x >> 15); | |
4059 | x += ~(x << 27); | |
4060 | x ^= (x >> 31); | |
4061 | #endif | |
4062 | return x; | |
f427ee49 | 4063 | } |
c910b4d9 | 4064 | |
c3c9b80d A |
4065 | uint32_t |
4066 | hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) | |
4067 | { | |
4068 | uintptr_t hash = 0; | |
4069 | uintptr_t mask = max_size - 1; | |
39037602 | 4070 | |
c3c9b80d A |
4071 | while (depth) { |
4072 | hash += bt[--depth]; | |
4073 | } | |
39037602 | 4074 | |
c3c9b80d A |
4075 | hash = hash_mix(hash) & mask; |
4076 | ||
4077 | assert(hash < max_size); | |
4078 | ||
4079 | return (uint32_t) hash; | |
4080 | } | |
39236c6e | 4081 | |
c910b4d9 | 4082 | /* |
c3c9b80d A |
4083 | * TODO: Determine how well distributed this is |
4084 | * max_size must be a power of 2, e.g. 0x10000, because 0x10000-1 is 0x0FFFF which is a great bitmask | |
4085 | */ | |
4086 | uint32_t | |
4087 | hashaddr(uintptr_t pt, uint32_t max_size) | |
4088 | { | |
4089 | uintptr_t hash = 0; | |
4090 | uintptr_t mask = max_size - 1; | |
4091 | ||
4092 | hash = hash_mix(pt) & mask; | |
4093 | ||
4094 | assert(hash < max_size); | |
4095 | ||
4096 | return (uint32_t) hash; | |
4097 | } | |
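A short usage sketch (the table sizes and inputs are made up): because max_size is a power of two, the mask reduction above is equivalent to a modulo, so the result always lands inside the table.

uintptr_t bt[3] = { 0x1010, 0x2020, 0x3030 };       /* fake backtrace          */
uint32_t  t_idx = hashbacktrace(bt, 3, 0x1000);     /* 0 <= t_idx < 4096       */
uint32_t  a_idx = hashaddr((uintptr_t)bt, 0x800);   /* 0 <= a_idx < 2048       */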
4098 | ||
4099 | #endif /* !ZALLOC_TEST */ | |
4100 | #pragma mark zone (re)fill | |
4101 | #if !ZALLOC_TEST | |
4102 | ||
4103 | /*! | |
4104 | * @defgroup Zone Refill | |
4105 | * @{ | |
4106 | * | |
4107 | * @brief | |
4108 | * Functions handling The zone refill machinery. | |
4109 | * | |
4110 | * @discussion | |
4111 | * Zones are refilled based on 3 mechanisms: direct expansion, async expansion, | |
4112 | * VM-specific replenishment. Zones using VM-specific replenishment are marked | |
4113 | * with the @c z_replenishes property set. | |
4114 | * | |
4115 | * @c zalloc_ext() is the codepath that kicks the zone refill when the zone is | |
4116 | * dropping below half of its @c z_elems_rsv (0 for most zones) and will: | |
4117 | * | |
4118 | * - call @c zone_expand_locked() directly if the caller is allowed to block, | |
4119 | * | |
4120 | * - wake up the asynchronous expansion thread call if the caller is not allowed | |
4121 | * to block. | |
4122 | * | |
4123 | * - call @c zone_replenish_locked() to kick the replenish state machine. | |
4124 | * | |
4125 | * | |
4126 | * <h2>Synchronous expansion</h2> | |
4127 | * | |
4128 | * This mechanism is actually the only one that may refill a zone, and all the | |
4129 | * other ones funnel through this one eventually. | |
4130 | * | |
4131 | * @c zone_expand_locked() implements the core of the expansion mechanism, | |
4132 | * and will do so while a caller specified predicate is true. | |
4133 | * | |
4134 | * Zone expansion allows for up to 2 threads to concurrently refill the zone: | |
4135 | * - one VM privileged thread, | |
4136 | * - one regular thread. | |
4137 | * | |
4138 | * Regular threads that refill will put down their identity in @c z_expander, | |
4139 | * so that priority inversion avoidance can be implemented. | |
4140 | * | |
4141 | * However, VM privileged threads are allowed to use VM page reserves, | |
4142 | * which allows for the system to recover from extreme memory pressure | |
4143 | * situations, allowing for the few allocations that @c zone_gc() or | |
4144 | * killing processes require. | |
4145 | * | |
4146 | * When a VM privileged thread is also expanding, the @c z_expander_vm_priv bit | |
4147 | * is set. @c z_expander is not necessarily the identity of this VM privileged | |
4148 | * thread (it is if the VM privileged thread came in first, but wouldn't be, and | |
4149 | * could even be @c THREAD_NULL otherwise). | |
4150 | * | |
4151 | * Note that the pageout-scan daemon might be BG and is VM privileged. To avoid | |
4152 | * spending a whole pointer on priority inheritance for VM privileged threads | |
4153 | * (and other issues related to having two owners), we use the rwlock boost as | |
4154 | * a stop gap to avoid priority inversions. | |
4155 | * | |
4156 | * | |
4157 | * <h2>Chunk wiring policies</h2> | |
4158 | * | |
4159 | * Zones allocate memory in chunks of @c zone_t::z_chunk_pages pages at a time | |
4160 | * to try to minimize fragmentation relative to element sizes not aligning with | |
4161 | * a chunk size well. However, this can grow large and be hard to fulfill on | |
4162 | * a system under a lot of memory pressure (chunks can be as long as 8 pages on | |
4163 | * 4k page systems). | |
4164 | * | |
4165 | * This is why, when under memory pressure the system allows chunks to be | |
4166 | * partially populated. The metadata of the first page in the chunk maintains | |
4167 | * the count of actually populated pages. | |
4168 | * | |
4169 | * The metadata for addresses assigned to a zone are found on 4 queues: | |
4170 | * - @c z_pageq_empty has chunk heads with populated pages and no allocated | |
4171 | * elements (those can be targeted by @c zone_gc()), | |
4172 | * - @c z_pageq_partial has chunk heads with populated pages that are partially | |
4173 | * used, | |
4174 | * - @c z_pageq_full has chunk heads with populated pages with no free elements | |
4175 | * left, | |
4176 | * - @c z_pageq_va has either chunk heads for sequestered VA space assigned to | |
4177 | * the zone forever (if @c z_va_sequester is enabled), or the first secondary | |
4178 | * metadata for a chunk whose corresponding page is not populated in the | |
4179 | * chunk. | |
4180 | * | |
4181 | * When new pages need to be wired/populated, chunks from the @c z_pageq_va | |
4182 | * queues are preferred. | |
4183 | * | |
4184 | * | |
4185 | * <h2>Asynchronous expansion</h2> | |
4186 | * | |
4187 | * This mechanism allows for refilling zones used mostly with non blocking | |
4188 | * callers. It relies on a thread call (@c zone_expand_callout) which will | |
4189 | * iterate all zones and refill the ones marked with @c z_async_refilling. | |
4190 | * | |
4191 | * NOTE: If the calling thread for zalloc_noblock is lower priority than | |
4192 | * the thread_call, then zalloc_noblock to an empty zone may succeed. | |
4193 | * | |
4194 | * | |
4195 | * <h2>Dealing with zone allocations from the mach VM code</h2> | |
4196 | * | |
4197 | * The implementation of the mach VM itself uses the zone allocator | |
4198 | * for things like the vm_map_entry data structure. In order to prevent | |
4199 | * an infinite recursion problem when adding more pages to a zone, @c zalloc | |
4200 | * uses a replenish thread to refill the VM layer's zones before they have | |
4201 | * too few remaining free entries. The reserved remaining free entries | |
4202 | * guarantee that the VM routines can get entries from already mapped pages. | |
4203 | * | |
4204 | * In order for that to work, the amount of allocations in the nested | |
4205 | * case have to be bounded. There are currently 2 replenish zones, and | |
4206 | * if each needs 1 element of each zone to add a new page to itself, that | |
4207 | * gives us a minimum reserve of 2 elements. | |
4208 | * | |
4209 | * There is also a deadlock issue with the zone garbage collection thread, | |
4210 | * or any thread that is trying to free zone pages. While holding | |
4211 | * the kernel's map lock they may need to allocate new VM map entries, hence | |
4212 | * we need enough reserve to allow them to get past the point of holding the | |
4213 | * map lock. After freeing that page, the GC thread will wait in | |
4214 | * @c zone_reclaim() until the replenish threads can finish. | |
4215 | * Since there's only 1 GC thread at a time, that adds a minimum of 1 to the | |
4216 | * reserve size. | |
4217 | * | |
4218 | * Since the minimum amount you can add to a zone is 1 page, | |
4219 | * we'll use 16K (from ARM) as the refill size on all platforms. | |
4220 | * | |
4221 | * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2, | |
4222 | * @c zalloc_ext() will wake the replenish thread. The replenish thread runs | |
4223 | * until at least REFILL_SIZE worth of free elements exist, before sleeping again. | |
4224 | * In the meantime threads may continue to use the reserve until there are only | |
4225 | * REFILL_SIZE / 4 elements left. Below that point only the replenish threads | |
4226 | * themselves and the GC thread may continue to use from the reserve. | |
c910b4d9 | 4227 | */ |
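As a worked example of the reserve sizing described above (the element size is hypothetical; 16K is the refill target mentioned in the last paragraph):

/* Hypothetical 80-byte elements in a replenished zone. */
uint32_t refill_elems = 16384 / 80;         /* ~204 elements kept in reserve        */
uint32_t wake_mark    = refill_elems / 2;   /* replenish thread is woken below this */
uint32_t hard_floor   = refill_elems / 4;   /* only replenish/GC threads go lower   */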
316670eb | 4228 | |
c3c9b80d | 4229 | static thread_call_data_t zone_expand_callout; |
c910b4d9 | 4230 | |
c3c9b80d A |
4231 | static inline kma_flags_t |
4232 | zone_kma_flags(zone_t z, zalloc_flags_t flags) | |
f427ee49 | 4233 | { |
c3c9b80d | 4234 | kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO; |
c910b4d9 | 4235 | |
c3c9b80d A |
4236 | if (z->z_noencrypt) { |
4237 | kmaflags |= KMA_NOENCRYPT; | |
4238 | } | |
4239 | if (flags & Z_NOPAGEWAIT) { | |
4240 | kmaflags |= KMA_NOPAGEWAIT; | |
4241 | } | |
4242 | if (z->z_permanent || (!z->z_destructible && z->z_va_sequester)) { | |
4243 | kmaflags |= KMA_PERMANENT; | |
f427ee49 | 4244 | } |
c3c9b80d A |
4245 | if (z->z_submap_idx == Z_SUBMAP_IDX_GENERAL && |
4246 | z->kalloc_heap != KHEAP_ID_NONE) { | |
4247 | kmaflags |= KMA_KHEAP; | |
4248 | } | |
4249 | ||
4250 | return kmaflags; | |
f427ee49 | 4251 | } |
c910b4d9 | 4252 | |
c3c9b80d A |
4253 | /*! |
4254 | * @function zcram_and_lock() | |
f427ee49 | 4255 | * |
c3c9b80d A |
4256 | * @brief |
4257 | * Prepare some memory to be usable for allocation purposes. | |
f427ee49 A |
4258 | * |
4259 | * @discussion | |
c3c9b80d A |
4260 | * Prepare memory in <code>[addr + ptoa(pg_start), addr + ptoa(pg_end))</code> |
4261 | * to be usable in the zone. | |
f427ee49 | 4262 | * |
c3c9b80d | 4263 | * This function assumes the metadata is already populated for the range. |
f427ee49 | 4264 | * |
c3c9b80d A |
4265 | * Calling this function with @c pg_start being 0 means that the memory |
4266 | * is either a partial chunk, or a full chunk, that isn't published anywhere | |
4267 | * and the initialization can happen without locks held. | |
f427ee49 | 4268 | * |
c3c9b80d A |
4269 | * Calling this function with a non-zero @c pg_start means that we are extending | |
4270 | * an existing chunk: the memory in <code>[addr, addr + ptoa(pg_start))</code>, | |
4271 | * is already usable and published in the zone, so extending it requires holding | |
4272 | * the zone lock. | |
4273 | * | |
4274 | * @param zone The zone to cram new populated pages into | |
4275 | * @param addr The base address for the chunk(s) | |
4276 | * @param pg_va_new The number of virtual pages newly assigned to the zone | |
4277 | * @param pg_start The first newly populated page relative to @a addr. | |
4278 | * @param pg_end The after-last newly populated page relative to @a addr. | |
4279 | * @param kind The kind of memory assigned to the zone. | |
f427ee49 A |
4280 | */ |
4281 | static void | |
c3c9b80d A |
4282 | zcram_and_lock(zone_t zone, vm_offset_t addr, uint32_t pg_va_new, |
4283 | uint32_t pg_start, uint32_t pg_end, zone_addr_kind_t kind) | |
c910b4d9 | 4284 | { |
c3c9b80d A |
4285 | zone_id_t zindex = zone_index(zone); |
4286 | vm_offset_t elem_size = zone_elem_size(zone); | |
4287 | uint32_t free_start = 0, free_end = 0; | |
c910b4d9 | 4288 | |
c3c9b80d A |
4289 | struct zone_page_metadata *meta = zone_meta_from_addr(addr); |
4290 | uint32_t chunk_pages = zone->z_chunk_pages; | |
c910b4d9 | 4291 | |
c3c9b80d | 4292 | assert(pg_start < pg_end && pg_end <= chunk_pages); |
c910b4d9 | 4293 | |
c3c9b80d A |
4294 | if (pg_start == 0) { |
4295 | uint16_t chunk_len = (uint16_t)pg_end; | |
4296 | uint16_t secondary_len = ZM_SECONDARY_PAGE; | |
4297 | bool inline_bitmap = false; | |
f427ee49 | 4298 | |
c3c9b80d A |
4299 | if (zone->z_percpu) { |
4300 | chunk_len = 1; | |
4301 | secondary_len = ZM_SECONDARY_PCPU_PAGE; | |
4302 | assert(pg_end == zpercpu_count()); | |
4303 | } | |
4304 | if (!zone->z_permanent) { | |
4305 | inline_bitmap = zone->z_chunk_elems <= 32 * chunk_pages; | |
4306 | } | |
4307 | ||
4308 | meta[0] = (struct zone_page_metadata){ | |
4309 | .zm_index = zindex, | |
4310 | .zm_inline_bitmap = inline_bitmap, | |
4311 | .zm_chunk_len = chunk_len, | |
4312 | }; | |
4313 | if (kind == ZONE_ADDR_FOREIGN) { | |
4314 | /* Never hit z_pageq_empty */ | |
4315 | meta[0].zm_alloc_size = ZM_ALLOC_SIZE_LOCK; | |
4316 | } | |
4317 | ||
4318 | for (uint16_t i = 1; i < chunk_pages; i++) { | |
4319 | meta[i] = (struct zone_page_metadata){ | |
4320 | .zm_index = zindex, | |
4321 | .zm_inline_bitmap = inline_bitmap, | |
4322 | .zm_chunk_len = secondary_len, | |
4323 | .zm_page_index = i, | |
4324 | }; | |
4325 | } | |
4326 | ||
4327 | free_end = (uint32_t)ptoa(chunk_len) / elem_size; | |
4328 | if (!zone->z_permanent) { | |
4329 | zone_meta_bits_init(meta, free_end, zone->z_chunk_elems); | |
0a7de745 | 4330 | } |
c3c9b80d A |
4331 | } else { |
4332 | assert(!zone->z_percpu && !zone->z_permanent); | |
4333 | ||
4334 | free_end = (uint32_t)ptoa(pg_end) / elem_size; | |
4335 | free_start = (uint32_t)ptoa(pg_start) / elem_size; | |
4336 | } | |
4337 | ||
4338 | #if VM_MAX_TAG_ZONES | |
4339 | if (__improbable(zone->tags)) { | |
4340 | assert(kind == ZONE_ADDR_NATIVE && !zone->z_percpu); | |
4341 | ztMemoryAdd(zone, addr + ptoa(pg_start), | |
4342 | ptoa(pg_end - pg_start)); | |
f427ee49 | 4343 | } |
c3c9b80d | 4344 | #endif /* VM_MAX_TAG_ZONES */ |
c910b4d9 | 4345 | |
f427ee49 | 4346 | /* |
c3c9b80d | 4347 | * Insert the initialized pages / metadatas into the right lists. |
f427ee49 | 4348 | */ |
c3c9b80d A |
4349 | |
4350 | zone_lock(zone); | |
4351 | assert(zone->z_self == zone); | |
4352 | ||
4353 | if (pg_start != 0) { | |
4354 | assert(meta->zm_chunk_len == pg_start); | |
4355 | ||
4356 | zone_meta_bits_merge(meta, free_start, free_end); | |
4357 | meta->zm_chunk_len = (uint16_t)pg_end; | |
4358 | ||
4359 | /* | |
4360 | * consume the zone_meta_lock_in_partial() | |
4361 | * done in zone_expand_locked() | |
4362 | */ | |
4363 | zone_meta_alloc_size_sub(zone, meta, ZM_ALLOC_SIZE_LOCK); | |
4364 | zone_meta_remqueue(zone, meta); | |
c910b4d9 A |
4365 | } |
4366 | ||
c3c9b80d A |
4367 | if (zone->z_permanent || meta->zm_alloc_size) { |
4368 | zone_meta_queue_push(zone, &zone->z_pageq_partial, meta); | |
4369 | } else { | |
4370 | zone_meta_queue_push(zone, &zone->z_pageq_empty, meta); | |
4371 | zone->z_wired_empty += zone->z_percpu ? 1 : pg_end; | |
4372 | } | |
4373 | if (pg_end < chunk_pages) { | |
4374 | /* push any non populated residual VA on z_pageq_va */ | |
4375 | zone_meta_queue_push(zone, &zone->z_pageq_va, meta + pg_end); | |
4376 | } | |
f427ee49 | 4377 | |
c3c9b80d A |
4378 | zone_elems_free_add(zone, free_end - free_start); |
4379 | zone->z_elems_avail += free_end - free_start; | |
4380 | zone->z_wired_cur += zone->z_percpu ? 1 : pg_end - pg_start; | |
4381 | if (pg_va_new) { | |
4382 | zone->z_va_cur += zone->z_percpu ? 1 : pg_va_new; | |
4383 | } | |
4384 | if (zone->z_wired_hwm < zone->z_wired_cur) { | |
4385 | zone->z_wired_hwm = zone->z_wired_cur; | |
f427ee49 | 4386 | } |
c3c9b80d A |
4387 | |
4388 | os_atomic_add(&zones_phys_page_mapped_count, pg_end - pg_start, relaxed); | |
c910b4d9 A |
4389 | } |
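/*
 * Worked example of the free-element range computed above; the page and
 * element sizes are made up for illustration and are not the real
 * PAGE_SIZE / zone_elem_size() of any particular zone.
 */
static void
zcram_free_range_example(void)
{
	const unsigned long page_size = 16384;   /* hypothetical page size */
	const unsigned long elem_size = 512;     /* hypothetical element size */
	const unsigned pg_start = 2, pg_end = 4; /* newly populated pages */

	/* matches free_start/free_end in zcram_and_lock() when pg_start != 0 */
	unsigned long free_start = (pg_start * page_size) / elem_size; /* 64  */
	unsigned long free_end   = (pg_end * page_size) / elem_size;   /* 128 */

	/* elements [64, 128) are merged into the chunk's free bitmap */
	(void)free_start;
	(void)free_end;
}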
4390 | ||
c3c9b80d A |
4391 | static void |
4392 | zcram(zone_t zone, vm_offset_t addr, uint32_t pages, zone_addr_kind_t kind) | |
4393 | { | |
4394 | uint32_t chunk_pages = zone->z_chunk_pages; | |
f427ee49 | 4395 | |
c3c9b80d A |
4396 | assert(pages % chunk_pages == 0); |
4397 | for (; pages > 0; pages -= chunk_pages, addr += ptoa(chunk_pages)) { | |
4398 | zcram_and_lock(zone, addr, chunk_pages, 0, chunk_pages, kind); | |
4399 | zone_unlock(zone); | |
4400 | } | |
4401 | } | |
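/*
 * Hypothetical illustration of the chunking performed by zcram() above:
 * with z_chunk_pages == 4 and pages == 12, zcram_and_lock() runs three
 * times, for the chunks starting at page offsets 0, 4 and 8 of addr.
 */
static unsigned
zcram_chunk_count_example(unsigned pages, unsigned chunk_pages)
{
	/* mirrors the assert(pages % chunk_pages == 0) check in zcram() */
	return pages / chunk_pages;     /* 12 / 4 == 3 */
}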
f427ee49 | 4402 | |
c3c9b80d A |
4403 | void |
4404 | zone_cram_foreign(zone_t zone, vm_offset_t newmem, vm_size_t size) | |
4405 | { | |
4406 | uint32_t pages = (uint32_t)atop(size); | |
c910b4d9 | 4407 | |
c3c9b80d A |
4408 | if (!from_zone_map(newmem, size, ZONE_ADDR_FOREIGN)) { |
4409 | panic("zone_cram_foreign: foreign memory [%p] being crammed is " | |
4410 | "outside of expected range", (void *)newmem); | |
4411 | } | |
4412 | if (!zone->z_allows_foreign) { | |
4413 | panic("zone_cram_foreign: foreign memory [%p] being crammed in " | |
4414 | "zone '%s%s' not expecting it", (void *)newmem, | |
4415 | zone_heap_name(zone), zone_name(zone)); | |
4416 | } | |
4417 | if (size % ptoa(zone->z_chunk_pages)) { | |
4418 | panic("zone_cram_foreign: foreign memory [%p] being crammed has " | |
4419 | "invalid size %zx", (void *)newmem, (size_t)size); | |
4420 | } | |
4421 | if (startup_phase >= STARTUP_SUB_ZALLOC) { | |
4422 | panic("zone_cram_foreign: foreign memory [%p] being crammed " | |
4423 | "after zalloc is initialized", (void *)newmem); | |
4424 | } | |
c910b4d9 | 4425 | |
c3c9b80d A |
4426 | bzero((void *)newmem, size); |
4427 | zcram(zone, newmem, pages, ZONE_ADDR_FOREIGN); | |
4428 | } | |
4429 | ||
4430 | void | |
4431 | zone_fill_initially(zone_t zone, vm_size_t nelems) | |
4432 | { | |
4433 | kma_flags_t kmaflags; | |
4434 | kern_return_t kr; | |
4435 | vm_offset_t addr; | |
4436 | uint32_t pages; | |
4437 | ||
4438 | assert(!zone->z_permanent && !zone->collectable && !zone->z_destructible); | |
4439 | assert(zone->z_elems_avail == 0); | |
4440 | ||
4441 | kmaflags = zone_kma_flags(zone, Z_WAITOK) | KMA_PERMANENT; | |
4442 | pages = zone_alloc_pages_for_nelems(zone, nelems); | |
4443 | kr = kernel_memory_allocate(zone_submap(zone), &addr, ptoa(pages), | |
4444 | 0, kmaflags, VM_KERN_MEMORY_ZONE); | |
4445 | if (kr != KERN_SUCCESS) { | |
4446 | panic("kernel_memory_allocate() of %u pages failed", pages); | |
4447 | } | |
4448 | ||
4449 | zone_meta_populate(addr, ptoa(pages)); | |
4450 | zcram(zone, addr, pages, ZONE_ADDR_NATIVE); | |
4451 | } | |
4452 | ||
4453 | static vm_offset_t | |
4454 | zone_allocate_va(zone_t z, zalloc_flags_t flags) | |
4455 | { | |
4456 | kma_flags_t kmaflags = zone_kma_flags(z, flags) | KMA_VAONLY; | |
4457 | vm_size_t size = ptoa(z->z_chunk_pages); | |
4458 | kern_return_t kr; | |
4459 | vm_offset_t addr; | |
4460 | ||
4461 | kr = kernel_memory_allocate(zone_submap(z), &addr, size, 0, | |
4462 | kmaflags, VM_KERN_MEMORY_ZONE); | |
4463 | ||
4464 | #if !__LP64__ | |
4465 | if (kr == KERN_NO_SPACE && z->z_replenishes) { | |
4466 | /* | |
4467 | * On 32bit the zone submaps do not have as much VA | |
4468 | * available, so use the VA reserved map for this | |
4469 | * purpose. | |
4470 | */ | |
4471 | vm_map_t map = zone_submaps[Z_SUBMAP_IDX_VA_RESERVE]; | |
4472 | kr = kernel_memory_allocate(map, &addr, size, 0, | |
4473 | kmaflags, VM_KERN_MEMORY_ZONE); | |
4474 | } | |
4475 | #endif | |
4476 | ||
4477 | if (kr == KERN_SUCCESS) { | |
4478 | #if ZALLOC_EARLY_GAPS | |
4479 | if (__improbable(zone_caching_disabled < 0)) { | |
4480 | zone_allocate_random_early_gap(z); | |
4481 | } | |
4482 | #endif /* ZALLOC_EARLY_GAPS */ | |
4483 | zone_meta_populate(addr, size); | |
4484 | return addr; | |
4485 | } | |
c910b4d9 | 4486 | |
c3c9b80d | 4487 | panic_include_zprint = TRUE; |
6d2010ae | 4488 | #if CONFIG_ZLEAKS |
c3c9b80d A |
4489 | if ((zleak_state & ZLEAK_STATE_ACTIVE)) { |
4490 | panic_include_ztrace = TRUE; | |
4491 | } | |
4492 | #endif /* CONFIG_ZLEAKS */ | |
4493 | zone_t zone_largest = zone_find_largest(); | |
4494 | panic("zalloc: zone map exhausted while allocating from zone [%s%s], " | |
4495 | "likely due to memory leak in zone [%s%s] " | |
4496 | "(%luM, %d elements allocated)", | |
4497 | zone_heap_name(z), zone_name(z), | |
4498 | zone_heap_name(zone_largest), zone_name(zone_largest), | |
4499 | (unsigned long)zone_size_wired(zone_largest) >> 20, | |
4500 | zone_count_allocated(zone_largest)); | |
4501 | } | |
6d2010ae | 4502 | |
c3c9b80d A |
4503 | static bool |
4504 | zone_expand_pred_nope(__unused zone_t z) | |
4505 | { | |
4506 | return false; | |
4507 | } | |
6d2010ae | 4508 | |
c3c9b80d A |
4509 | static inline void |
4510 | ZONE_TRACE_VM_KERN_REQUEST_START(vm_size_t size) | |
4511 | { | |
4512 | #if DEBUG || DEVELOPMENT | |
4513 | VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, | |
4514 | size, 0, 0, 0); | |
4515 | #else | |
4516 | (void)size; | |
4517 | #endif | |
4518 | } | |
6d2010ae | 4519 | |
c3c9b80d A |
4520 | static inline void |
4521 | ZONE_TRACE_VM_KERN_REQUEST_END(uint32_t pages) | |
4522 | { | |
4523 | #if DEBUG || DEVELOPMENT | |
4524 | task_t task = current_task(); | |
4525 | if (pages && task) { | |
4526 | ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, pages); | |
4527 | } | |
4528 | VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, | |
4529 | pages, 0, 0, 0); | |
4530 | #else | |
4531 | (void)pages; | |
4532 | #endif | |
4533 | } | |
4534 | ||
4535 | static void | |
4536 | zone_expand_locked(zone_t z, zalloc_flags_t flags, bool (*pred)(zone_t)) | |
4537 | { | |
4538 | thread_t self = current_thread(); | |
4539 | bool vm_priv = (self->options & TH_OPT_VMPRIV); | |
4540 | bool clear_vm_priv; | |
4541 | ||
4542 | for (;;) { | |
4543 | if (!pred) { | |
4544 | /* NULL pred means "try just once" */ | |
4545 | pred = zone_expand_pred_nope; | |
4546 | } else if (!pred(z)) { | |
4547 | return; | |
4548 | } | |
4549 | ||
4550 | if (vm_priv && !z->z_expander_vm_priv) { | |
4551 | /* | |
4552 | * Claim the vm priv overcommit slot | |
4553 | * | |
4554 | * We do not track exact ownership for VM privileged | |
4555 | * threads, so use the rwlock boost as a stop-gap | |
4556 | * just in case. | |
4557 | */ | |
4558 | set_thread_rwlock_boost(); | |
4559 | z->z_expander_vm_priv = true; | |
4560 | clear_vm_priv = true; | |
4561 | } else { | |
4562 | clear_vm_priv = false; | |
4563 | } | |
6d2010ae | 4564 | |
c3c9b80d A |
4565 | if (z->z_expander == NULL) { |
4566 | z->z_expander = self; | |
4567 | break; | |
4568 | } | |
4569 | if (clear_vm_priv) { | |
4570 | break; | |
4571 | } | |
6d2010ae | 4572 | |
c3c9b80d A |
4573 | if (flags & Z_NOPAGEWAIT) { |
4574 | return; | |
4575 | } | |
6d2010ae | 4576 | |
c3c9b80d A |
4577 | z->z_expanding_wait = true; |
4578 | lck_spin_sleep_with_inheritor(&z->z_lock, LCK_SLEEP_DEFAULT, | |
4579 | &z->z_expander, z->z_expander, | |
4580 | TH_UNINT, TIMEOUT_WAIT_FOREVER); | |
4581 | } | |
6d2010ae | 4582 | |
c3c9b80d A |
4583 | do { |
4584 | struct zone_page_metadata *meta = NULL; | |
4585 | uint32_t new_va = 0, cur_pages = 0, min_pages = 0, pages = 0; | |
4586 | vm_page_t page_list = NULL; | |
4587 | vm_offset_t addr = 0; | |
4588 | int waited = 0; | |
6d2010ae | 4589 | |
c3c9b80d A |
4590 | /* |
4591 | * While we hold the zone lock, check whether there is VA we can: |
4592 | * - complete from partial pages, | |
4593 | * - reuse from the sequester list. | |
4594 | * | |
4595 | * When the page is being populated we pretend we allocated | |
4596 | * an extra element so that zone_gc() can't attempt to free | |
4597 | * the chunk (as it could become empty while we wait for pages). | |
4598 | */ | |
4599 | if (!zone_pva_is_null(z->z_pageq_va)) { | |
4600 | meta = zone_meta_queue_pop_native(z, | |
4601 | &z->z_pageq_va, &addr); | |
4602 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { | |
4603 | cur_pages = meta->zm_page_index; | |
4604 | meta -= cur_pages; | |
4605 | addr -= ptoa(cur_pages); | |
4606 | zone_meta_lock_in_partial(z, meta, cur_pages); | |
4607 | } | |
4608 | } | |
4609 | zone_unlock(z); | |
6d2010ae | 4610 | |
c3c9b80d A |
4611 | /* |
4612 | * Do the zone leak activation here because zleak_activate() | |
4613 | * may block, and can't be done on the way out. | |
4614 | * | |
4615 | * Trigger jetsams via the vm_pageout_garbage_collect thread if | |
4616 | * we're running out of zone memory | |
4617 | */ | |
4618 | zleak_activate_if_needed(); | |
4619 | if (zone_map_nearing_exhaustion()) { | |
4620 | thread_wakeup((event_t)&vm_pageout_garbage_collect); | |
4621 | } | |
6d2010ae | 4622 | |
c3c9b80d A |
4623 | /* |
4624 | * And now allocate pages to populate our VA. | |
4625 | */ | |
4626 | if (z->z_percpu) { | |
4627 | min_pages = z->z_chunk_pages; | |
4628 | } else { | |
4629 | min_pages = (uint32_t)atop(round_page(zone_elem_size(z))); | |
4630 | } | |
6d2010ae | 4631 | |
c3c9b80d | 4632 | ZONE_TRACE_VM_KERN_REQUEST_START(ptoa(z->z_chunk_pages - cur_pages)); |
6d2010ae | 4633 | |
c3c9b80d A |
4634 | while (pages < z->z_chunk_pages - cur_pages) { |
4635 | vm_page_t m = vm_page_grab(); | |
6d2010ae | 4636 | |
c3c9b80d A |
4637 | if (m) { |
4638 | pages++; | |
4639 | m->vmp_snext = page_list; | |
4640 | page_list = m; | |
4641 | vm_page_zero_fill(m); | |
4642 | continue; | |
4643 | } | |
6d2010ae | 4644 | |
c3c9b80d A |
4645 | if (pages >= min_pages && (vm_pool_low() || waited)) { |
4646 | break; | |
4647 | } | |
6d2010ae | 4648 | |
c3c9b80d A |
4649 | if ((flags & Z_NOPAGEWAIT) == 0) { |
4650 | waited++; | |
4651 | VM_PAGE_WAIT(); | |
4652 | continue; | |
4653 | } | |
6d2010ae | 4654 | |
c3c9b80d A |
4655 | /* |
4656 | * Undo everything and bail out: | |
4657 | * | |
4658 | * - free pages | |
4659 | * - undo the fake allocation if any | |
4660 | * - put the VA back on the VA page queue. | |
4661 | */ | |
4662 | vm_page_free_list(page_list, FALSE); | |
4663 | ZONE_TRACE_VM_KERN_REQUEST_END(pages); | |
0a7de745 | 4664 | |
c3c9b80d | 4665 | zone_lock(z); |
316670eb | 4666 | |
c3c9b80d A |
4667 | if (cur_pages) { |
4668 | zone_meta_unlock_from_partial(z, meta, cur_pages); | |
4669 | } | |
4670 | if (meta) { | |
4671 | zone_meta_queue_push(z, &z->z_pageq_va, | |
4672 | meta + cur_pages); | |
4673 | } | |
4674 | goto page_shortage; | |
6d2010ae | 4675 | } |
0a7de745 | 4676 | |
c3c9b80d A |
4677 | /* |
4678 | * If we didn't find pre-allocated VA, then allocate a chunk | |
4679 | * of VA here. | |
4680 | */ | |
4681 | if (addr == 0) { | |
4682 | addr = zone_allocate_va(z, flags); | |
4683 | meta = zone_meta_from_addr(addr); | |
4684 | new_va = z->z_chunk_pages; | |
6d2010ae | 4685 | } |
0a7de745 | 4686 | |
c3c9b80d A |
4687 | kernel_memory_populate_with_pages(zone_submap(z), |
4688 | addr + ptoa(cur_pages), ptoa(pages), page_list, | |
4689 | zone_kma_flags(z, flags), VM_KERN_MEMORY_ZONE); | |
6d2010ae | 4690 | |
c3c9b80d A |
4691 | ZONE_TRACE_VM_KERN_REQUEST_END(pages); |
4692 | ||
4693 | zcram_and_lock(z, addr, new_va, cur_pages, cur_pages + pages, | |
4694 | ZONE_ADDR_NATIVE); | |
4695 | } while (pred(z)); | |
4696 | ||
4697 | page_shortage: | |
4698 | zleak_track_if_needed(z); | |
4699 | ||
4700 | if (clear_vm_priv) { | |
4701 | z->z_expander_vm_priv = false; | |
4702 | clear_thread_rwlock_boost(); | |
0a7de745 | 4703 | } |
c3c9b80d A |
4704 | if (z->z_expander == self) { |
4705 | z->z_expander = THREAD_NULL; | |
4706 | } | |
4707 | if (z->z_expanding_wait) { | |
4708 | z->z_expanding_wait = false; | |
4709 | wakeup_all_with_inheritor(&z->z_expander, THREAD_AWAKENED); | |
0a7de745 | 4710 | } |
6d2010ae A |
4711 | } |
4712 | ||
c3c9b80d A |
4713 | static bool |
4714 | zalloc_needs_refill(zone_t zone) | |
6d2010ae | 4715 | { |
c3c9b80d A |
4716 | if (zone->z_elems_free > zone->z_elems_rsv) { |
4717 | return false; | |
6d2010ae | 4718 | } |
c3c9b80d A |
4719 | if (zone->z_wired_cur < zone->z_wired_max) { |
4720 | return true; | |
6d2010ae | 4721 | } |
c3c9b80d A |
4722 | if (zone->exhaustible) { |
4723 | return false; | |
6d2010ae | 4724 | } |
c3c9b80d A |
4725 | if (zone->expandable) { |
4726 | /* | |
4727 | * If we're expandable, just don't go through this again. | |
4728 | */ | |
4729 | zone->z_wired_max = ~0u; | |
4730 | return true; | |
6d2010ae | 4731 | } |
c3c9b80d | 4732 | zone_unlock(zone); |
6d2010ae | 4733 | |
c3c9b80d A |
4734 | panic_include_zprint = true; |
4735 | #if CONFIG_ZLEAKS | |
4736 | if (zleak_state & ZLEAK_STATE_ACTIVE) { | |
4737 | panic_include_ztrace = true; | |
4738 | } | |
4739 | #endif /* CONFIG_ZLEAKS */ | |
4740 | panic("zone '%s%s' exhausted", zone_heap_name(zone), zone_name(zone)); | |
4741 | } | |
6d2010ae | 4742 | |
c3c9b80d A |
4743 | static void |
4744 | zone_expand_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1) | |
4745 | { | |
4746 | zone_foreach(z) { | |
4747 | if (z->no_callout) { | |
4748 | /* z_async_refilling will never be set */ | |
4749 | continue; | |
4750 | } | |
0a7de745 | 4751 | |
c3c9b80d A |
4752 | if (z->z_replenishes) { |
4753 | /* those use the zone_replenish_thread */ | |
4754 | continue; | |
4755 | } | |
6d2010ae | 4756 | |
c3c9b80d A |
4757 | zone_lock(z); |
4758 | if (z->z_self && z->z_async_refilling) { | |
4759 | z->z_async_refilling = false; | |
4760 | zone_expand_locked(z, Z_WAITOK, zalloc_needs_refill); | |
4761 | } | |
4762 | zone_unlock(z); | |
4763 | } | |
4764 | } | |
6d2010ae | 4765 | |
c3c9b80d A |
4766 | static inline void |
4767 | zone_expand_async_schedule_if_needed(zone_t zone) | |
4768 | { | |
4769 | if (zone->z_elems_free > zone->z_elems_rsv || zone->z_async_refilling || | |
4770 | zone->no_callout) { | |
4771 | return; | |
6d2010ae A |
4772 | } |
4773 | ||
c3c9b80d A |
4774 | if (!zone->expandable && zone->z_wired_cur >= zone->z_wired_max) { |
4775 | return; | |
6d2010ae A |
4776 | } |
4777 | ||
c3c9b80d A |
4778 | if (zone->z_elems_free == 0 || !vm_pool_low()) { |
4779 | zone->z_async_refilling = true; | |
4780 | thread_call_enter(&zone_expand_callout); | |
4781 | } | |
6d2010ae A |
4782 | } |
4783 | ||
c3c9b80d A |
4784 | #endif /* !ZALLOC_TEST */ |
4785 | #pragma mark zone replenishing (VM allocations) | |
4786 | #if !ZALLOC_TEST | |
6d2010ae A |
4787 | |
4788 | /* | |
c3c9b80d A |
4789 | * Tracks how many zone_replenish threads are active, because zone_gc() waits |
4790 | * for them to finish before it proceeds. |
4791 | * | |
4792 | * This counts how many replenish threads are active in | |
4793 | * ZONE_REPLENISH_ACTIVE_INC increments, | |
4794 | * and uses the low bit to track if there are any waiters. | |
6d2010ae | 4795 | */ |
c3c9b80d A |
4796 | #define ZONE_REPLENISH_ACTIVE_NONE 0u |
4797 | #define ZONE_REPLENISH_ACTIVE_WAITER_BIT 1u | |
4798 | #define ZONE_REPLENISH_ACTIVE_INC 2u | |
4799 | #define ZONE_REPLENISH_ACTIVE_MASK (~ZONE_REPLENISH_ACTIVE_WAITER_BIT) | |
4800 | static unsigned _Atomic zone_replenish_active; | |
4801 | static unsigned zone_replenish_wakeups; | |
4802 | static unsigned zone_replenish_wakeups_initiated; | |
4803 | static unsigned zone_replenish_throttle_count; | |
4804 | ||
4805 | #define ZONE_REPLENISH_TARGET (16 * 1024) | |
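/*
 * Hypothetical decoding helpers for the packed counter above; they are
 * illustrative only and not used by the allocator. The count advances
 * in ZONE_REPLENISH_ACTIVE_INC steps while bit 0 records whether
 * zone_replenish_wait_if_needed() parked a waiter that must be woken.
 */
static inline unsigned
zone_replenish_active_count_example(unsigned value)
{
	return (value & ZONE_REPLENISH_ACTIVE_MASK) / ZONE_REPLENISH_ACTIVE_INC;
}

static inline bool
zone_replenish_active_has_waiter_example(unsigned value)
{
	return (value & ZONE_REPLENISH_ACTIVE_WAITER_BIT) != 0;
}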
4806 | ||
4807 | static void | |
4808 | zone_replenish_wait_if_needed(void) | |
6d2010ae | 4809 | { |
c3c9b80d A |
4810 | /* |
4811 | * This check can be racy; the reserves ought to be enough |
4812 | * to compensate for a little race. |
4813 | */ | |
4814 | while (os_atomic_load(&zone_replenish_active, relaxed) != | |
4815 | ZONE_REPLENISH_ACTIVE_NONE) { | |
4816 | unsigned o_active, n_active; | |
4817 | ||
4818 | assert_wait(&zone_replenish_active, THREAD_UNINT); | |
4819 | ||
4820 | os_atomic_rmw_loop(&zone_replenish_active, o_active, n_active, relaxed, { | |
4821 | if (o_active == ZONE_REPLENISH_ACTIVE_NONE) { | |
4822 | os_atomic_rmw_loop_give_up({ | |
4823 | clear_wait(current_thread(), THREAD_AWAKENED); | |
4824 | return; | |
4825 | }); | |
4826 | } | |
4827 | if (o_active & ZONE_REPLENISH_ACTIVE_WAITER_BIT) { | |
4828 | os_atomic_rmw_loop_give_up(break); | |
4829 | } | |
4830 | n_active = o_active | ZONE_REPLENISH_ACTIVE_WAITER_BIT; | |
4831 | }); | |
4832 | thread_block(THREAD_CONTINUE_NULL); | |
6d2010ae | 4833 | } |
c3c9b80d | 4834 | } |
0a7de745 | 4835 | |
c3c9b80d A |
4836 | __attribute__((noinline)) |
4837 | static void | |
4838 | zone_replenish_locked(zone_t zone) | |
4839 | { | |
4840 | thread_t thr = current_thread(); | |
4841 | uint32_t min_free; | |
0a7de745 | 4842 | |
c3c9b80d | 4843 | zone_replenish_wakeups++; |
0a7de745 A |
4844 | |
4845 | /* | |
c3c9b80d A |
4846 | * We'll let threads continue to allocate under the reserve: |
4847 | * - until it is depleted to 50% for regular threads, |
4848 | * - until it is depleted to 25% for VM_PRIV threads. |
4849 | * | |
4850 | * After that only TH_OPT_ZONE_PRIV threads may continue. | |
6d2010ae | 4851 | */ |
c3c9b80d A |
4852 | if (thr->options & TH_OPT_VMPRIV) { |
4853 | min_free = zone->z_elems_rsv / 4; | |
4854 | } else { | |
4855 | min_free = zone->z_elems_rsv / 2; | |
6d2010ae | 4856 | } |
0a7de745 | 4857 | |
c3c9b80d | 4858 | while (zone->z_elems_free <= zone->z_elems_rsv) { |
0a7de745 | 4859 | /* |
c3c9b80d | 4860 | * Wakeup the replenish thread if not running. |
6d2010ae | 4861 | */ |
c3c9b80d A |
4862 | if (!zone->z_async_refilling) { |
4863 | os_atomic_add(&zone_replenish_active, | |
4864 | ZONE_REPLENISH_ACTIVE_INC, relaxed); | |
4865 | zone->z_async_refilling = true; | |
4866 | zone_replenish_wakeups_initiated++; | |
4867 | thread_wakeup(&zone->z_elems_rsv); | |
0a7de745 A |
4868 | } |
4869 | ||
c3c9b80d A |
4870 | if (zone->z_elems_free > min_free) { |
4871 | break; | |
4872 | } | |
0a7de745 | 4873 | |
0a7de745 | 4874 | /* |
c3c9b80d A |
4875 | * TH_OPT_ZONE_PRIV threads are the GC thread and a replenish |
4876 | * thread itself. | |
4877 | * | |
4878 | * Replenish threads *need* to use the reserve. GC threads need | |
4879 | * to get through the current allocation, but then will wait at | |
4880 | * a higher level after they've dropped any locks which would | |
4881 | * deadlock the replenish thread. | |
4882 | * | |
4883 | * The value of (refill_level / 2) in the previous bit of code | |
4884 | * should have given us headroom even though this thread didn't | |
4885 | * wait. | |
6d2010ae | 4886 | */ |
c3c9b80d A |
4887 | if (thr->options & TH_OPT_ZONE_PRIV) { |
4888 | assert(zone->z_elems_free != 0); | |
4889 | break; | |
4890 | } | |
0a7de745 | 4891 | |
c3c9b80d A |
4892 | if (startup_phase < STARTUP_SUB_MACH_IPC) { |
4893 | panic("vm_map_steal_memory didn't steal enough memory: " | |
4894 | "trying to grow [%s%s] before the scheduler has started", | |
4895 | zone_heap_name(zone), zone_name(zone)); | |
4896 | } | |
0a7de745 | 4897 | |
c3c9b80d A |
4898 | /* |
4899 | * Wait for the replenish threads to add more elements | |
4900 | * for us to allocate from. | |
4901 | */ | |
4902 | zone_replenish_throttle_count++; | |
4903 | zone->z_replenish_wait = true; | |
4904 | assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC); | |
4905 | zone_unlock(zone); | |
4906 | thread_block(THREAD_CONTINUE_NULL); | |
4907 | zone_lock(zone); | |
4908 | zone->z_replenish_wait = false; | |
0a7de745 | 4909 | |
c3c9b80d | 4910 | assert(zone->z_self == zone); |
0a7de745 | 4911 | } |
c3c9b80d | 4912 | } |
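/*
 * Hypothetical restatement of the throttle thresholds applied above;
 * the reserve size is made up. With z_elems_rsv == 200, a regular
 * thread starts waiting once z_elems_free drops to 100 or below, a
 * TH_OPT_VMPRIV thread once it drops to 50 or below, and
 * TH_OPT_ZONE_PRIV threads (GC and the replenish thread itself) never
 * wait here.
 */
static unsigned
zone_replenish_throttle_threshold_example(bool vm_priv, bool zone_priv)
{
	const unsigned elems_rsv = 200;         /* hypothetical z_elems_rsv */

	if (zone_priv) {
		return 0;                       /* never throttled */
	}
	return vm_priv ? elems_rsv / 4 : elems_rsv / 2;   /* 50 : 100 */
}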
0a7de745 | 4913 | |
c3c9b80d A |
4914 | static bool |
4915 | zone_replenish_needed(zone_t z) | |
4916 | { | |
4917 | return z->z_elems_free <= z->z_elems_rsv; | |
6d2010ae A |
4918 | } |
4919 | ||
4920 | /* | |
c3c9b80d A |
4921 | * High priority VM privileged thread used to asynchronously refill a given zone. |
4922 | * These are needed for data structures used by the lower level VM itself. The | |
4923 | * replenish thread maintains a reserve of elements, so that the VM will never | |
4924 | * block in the zone allocator. | |
6d2010ae | 4925 | */ |
c3c9b80d | 4926 | __dead2 |
6d2010ae | 4927 | static void |
c3c9b80d | 4928 | zone_replenish_thread(void *_z, wait_result_t __unused wr) |
6d2010ae | 4929 | { |
c3c9b80d A |
4930 | unsigned o_active, n_active; |
4931 | zone_t z = _z; | |
4932 | ||
4933 | zone_lock(z); | |
4934 | assert(z->z_self == z); | |
4935 | assert(z->z_async_refilling && z->z_replenishes); | |
4936 | ||
4937 | zone_expand_locked(z, Z_WAITOK, zone_replenish_needed); | |
4938 | ||
4939 | if (z->z_replenish_wait) { | |
4940 | /* Wakeup any potentially throttled allocations */ | |
4941 | z->z_replenish_wait = false; | |
4942 | thread_wakeup(z); | |
0a7de745 A |
4943 | } |
4944 | ||
c3c9b80d A |
4945 | /* wakeup zone_reclaim() callers that were possibly waiting */ |
4946 | os_atomic_rmw_loop(&zone_replenish_active, o_active, n_active, relaxed, { | |
4947 | if (os_sub_overflow(o_active, ZONE_REPLENISH_ACTIVE_INC, &n_active)) { | |
4948 | panic("zone_replenish_active corrupt: %d", o_active); | |
4949 | } | |
4950 | if ((n_active & ZONE_REPLENISH_ACTIVE_MASK) == 0) { | |
4951 | n_active = ZONE_REPLENISH_ACTIVE_NONE; | |
4952 | } | |
4953 | }); | |
0a7de745 | 4954 | |
c3c9b80d A |
4955 | if (n_active == ZONE_REPLENISH_ACTIVE_NONE && |
4956 | (o_active & ZONE_REPLENISH_ACTIVE_WAITER_BIT)) { | |
4957 | thread_wakeup(&zone_replenish_active); | |
4958 | } | |
0a7de745 | 4959 | |
c3c9b80d A |
4960 | z->z_async_refilling = false; |
4961 | assert_wait(&z->z_elems_rsv, THREAD_UNINT); | |
0a7de745 | 4962 | |
c3c9b80d | 4963 | zone_unlock(z); |
6d2010ae | 4964 | |
c3c9b80d A |
4965 | thread_block_parameter(zone_replenish_thread, z); |
4966 | __builtin_unreachable(); | |
4967 | } | |
0a7de745 | 4968 | |
c3c9b80d A |
4969 | void |
4970 | zone_replenish_configure(zone_t z) | |
4971 | { | |
4972 | thread_t th; | |
4973 | kern_return_t kr; | |
4974 | char name[MAXTHREADNAMESIZE]; | |
0a7de745 | 4975 | |
c3c9b80d A |
4976 | zone_lock(z); |
4977 | assert(!z->z_replenishes && !z->z_destructible); | |
4978 | z->z_elems_rsv = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z)); | |
4979 | z->z_replenishes = true; | |
4980 | os_atomic_add(&zone_replenish_active, ZONE_REPLENISH_ACTIVE_INC, relaxed); | |
4981 | z->z_async_refilling = true; | |
4982 | zone_unlock(z); | |
0a7de745 | 4983 | |
c3c9b80d A |
4984 | kr = kernel_thread_create(zone_replenish_thread, z, MAXPRI_KERNEL, &th); |
4985 | if (kr != KERN_SUCCESS) { | |
4986 | panic("zone_replenish_configure, thread create: 0x%x", kr); | |
6d2010ae | 4987 | } |
c3c9b80d A |
4988 | /* make sure this thread can't lose its stack */ |
4989 | assert(th->reserved_stack == th->kernel_stack); | |
4990 | ||
4991 | snprintf(name, sizeof(name), "z_replenish(%s)", zone_name(z)); | |
4992 | thread_set_thread_name(th, name); | |
4993 | ||
4994 | thread_mtx_lock(th); | |
4995 | th->options |= TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV; | |
4996 | thread_start(th); | |
4997 | thread_mtx_unlock(th); | |
4998 | ||
4999 | thread_deallocate(th); | |
6d2010ae A |
5000 | } |
5001 | ||
c3c9b80d A |
5002 | /*! @} */ |
5003 | #endif /* !ZALLOC_TEST */ | |
5004 | #pragma mark zone jetsam integration | |
5005 | #if !ZALLOC_TEST | |
6d2010ae | 5006 | |
c3c9b80d A |
5007 | /* |
5008 | * We're being very conservative here and picking a value of 95%. We might need to lower this if | |
5009 | * we find that we're not catching the problem and are still hitting zone map exhaustion panics. | |
6d2010ae | 5010 | */ |
c3c9b80d | 5011 | #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95 |
6d2010ae | 5012 | |
c3c9b80d A |
5013 | /* |
5014 | * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit. | |
5015 | * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default. | |
5016 | */ | |
5017 | TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit", | |
5018 | ZONE_MAP_JETSAM_LIMIT_DEFAULT); | |
5019 | ||
5020 | void | |
5021 | get_zone_map_size(uint64_t *current_size, uint64_t *capacity) | |
6d2010ae | 5022 | { |
c3c9b80d A |
5023 | vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed); |
5024 | *current_size = ptoa_64(phys_pages); | |
5025 | *capacity = ptoa_64(zone_phys_mapped_max_pages); | |
6d2010ae A |
5026 | } |
5027 | ||
c3c9b80d A |
5028 | void |
5029 | get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size) | |
6d2010ae | 5030 | { |
c3c9b80d | 5031 | zone_t largest_zone = zone_find_largest(); |
6d2010ae | 5032 | |
c3c9b80d A |
5033 | /* |
5034 | * Append kalloc heap name to zone name (if zone is used by kalloc) | |
5035 | */ | |
5036 | snprintf(zone_name, zone_name_len, "%s%s", | |
5037 | zone_heap_name(largest_zone), largest_zone->z_name); | |
5038 | ||
5039 | *zone_size = zone_size_wired(largest_zone); | |
5040 | } | |
5041 | ||
5042 | bool | |
5043 | zone_map_nearing_exhaustion(void) | |
5044 | { | |
5045 | uint64_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed); | |
5046 | return phys_pages * 100 > zone_phys_mapped_max_pages * zone_map_jetsam_limit; | |
5047 | } | |
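/*
 * Worked example of the check above, with made-up numbers: if the zone
 * map may wire at most 100000 pages and zone_map_jetsam_limit is the
 * default 95, the map counts as nearing exhaustion once more than
 * 95000 pages are mapped, since 95001 * 100 > 100000 * 95.
 */
static bool
zone_map_nearing_exhaustion_example(void)
{
	const uint64_t mapped_pages = 95001;    /* hypothetical */
	const uint64_t max_pages = 100000;      /* hypothetical */
	const unsigned limit = 95;              /* ZONE_MAP_JETSAM_LIMIT_DEFAULT */

	return mapped_pages * 100 > max_pages * limit;
}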
5048 | ||
5049 | ||
5050 | #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98 | |
5051 | ||
5052 | /* | |
5053 | * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread | |
5054 | * to walk through the jetsam priority bands and kill processes. | |
5055 | */ | |
5056 | static void | |
5057 | kill_process_in_largest_zone(void) | |
5058 | { | |
5059 | pid_t pid = -1; | |
5060 | zone_t largest_zone = zone_find_largest(); | |
5061 | ||
5062 | printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, capacity %lld [jetsam limit %d%%]\n", | |
5063 | ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), | |
5064 | ptoa_64(zone_phys_mapped_max_pages), | |
5065 | (uint64_t)zone_submaps_approx_size(), | |
5066 | (uint64_t)(zone_foreign_size() + zone_native_size()), | |
5067 | zone_map_jetsam_limit); | |
5068 | printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone), | |
5069 | largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone)); | |
5070 | ||
5071 | /* | |
5072 | * We want to make sure we don't call this function from userspace. | |
5073 | * Or we could end up trying to synchronously kill the process | |
5074 | * whose context we're in, causing the system to hang. | |
5075 | */ | |
5076 | assert(current_task() == kernel_task); | |
5077 | ||
5078 | /* | |
5079 | * If vm_object_zone is the largest, check to see if the number of | |
5080 | * elements in vm_map_entry_zone is comparable. | |
5081 | * | |
5082 | * If so, consider vm_map_entry_zone as the largest. This lets us target | |
5083 | * a specific process to jetsam to quickly recover from the zone map | |
5084 | * bloat. | |
5085 | */ | |
5086 | if (largest_zone == vm_object_zone) { | |
5087 | unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone); | |
5088 | unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone); | |
5089 | /* Is the VM map entries zone count >= 98% of the VM objects zone count? */ | |
5090 | if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) { | |
5091 | largest_zone = vm_map_entry_zone; | |
5092 | printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n", | |
5093 | (uintptr_t)zone_size_wired(largest_zone)); | |
5094 | } | |
5095 | } | |
5096 | ||
5097 | /* TODO: Extend this to check for the largest process in other zones as well. */ | |
5098 | if (largest_zone == vm_map_entry_zone) { | |
5099 | pid = find_largest_process_vm_map_entries(); | |
5100 | } else { | |
5101 | printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. " | |
5102 | "Waking up memorystatus thread.\n", zone_heap_name(largest_zone), | |
5103 | largest_zone->z_name); | |
6d2010ae | 5104 | } |
c3c9b80d A |
5105 | if (!memorystatus_kill_on_zone_map_exhaustion(pid)) { |
5106 | printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid); | |
5107 | } | |
5108 | } | |
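/*
 * Hypothetical numbers for the VMENTRY_TO_VMOBJECT_COMPARISON_RATIO
 * check above: with 1000000 allocated vm_object elements, the VM map
 * entries zone is picked as the target once it holds at least 980000
 * elements (98% of the vm_object count).
 */
static bool
vm_map_entry_zone_comparable_example(void)
{
	const unsigned vm_object_count = 1000000;    /* hypothetical */
	const unsigned vm_map_entry_count = 980000;  /* hypothetical */

	return vm_map_entry_count >=
	    (vm_object_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100;
}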
6d2010ae | 5109 | |
c3c9b80d A |
5110 | #endif /* !ZALLOC_TEST */ |
5111 | #pragma mark zfree | |
5112 | #if !ZALLOC_TEST | |
5113 | #if KASAN_ZALLOC | |
6d2010ae | 5114 | |
c3c9b80d A |
5115 | /*! |
5116 | * @defgroup zfree | |
5117 | * @{ | |
5118 | * | |
5119 | * @brief | |
5120 | * The codepath for zone frees. | |
5121 | * | |
5122 | * @discussion | |
5123 | * There are 4 major ways to free memory back to the zone allocator: |
5124 | * - @c zfree() | |
5125 | * - @c zfree_percpu() | |
5126 | * - @c kfree*() | |
5127 | * - @c zfree_permanent() | |
5128 | * | |
5129 | * While permanent zones have their own allocation scheme, all other codepaths | |
5130 | * will eventually go through the @c zfree_ext() choking point. | |
5131 | * | |
5132 | * Ignoring the @c gzalloc_free() codepath, the decision tree looks like this: | |
5133 | * <code> | |
5134 | * zfree_ext() | |
5135 | * ├───> zfree_cached() ────────────────╮ | |
5136 | * │ │ │ | |
5137 | * │ │ │ | |
5138 | * │ ├───> zfree_cached_slow() ───┤ | |
5139 | * │ │ │ │ | |
5140 | * │ │ v │ | |
5141 | * ╰───────┴───> zfree_item() ──────────┴───> | |
5142 | * </code> | |
5143 | * | |
5144 | * @c zfree_ext() takes care of all the generic work to perform on an element | |
5145 | * before it is freed (zeroing, logging, tagging, ...) then will hand it off to: | |
5146 | * - @c zfree_item() if zone caching is off | |
5147 | * - @c zfree_cached() if zone caching is on. | |
5148 | * | |
5149 | * @c zfree_cached can take a number of decisions: | |
5150 | * - a fast path if the (f) or (a) magazines have space (preemption disabled), | |
5151 | * - using the cpu local or recirculation depot calling @c zfree_cached_slow(), | |
5152 | * - falling back to @c zfree_item() when CPU caching has been disabled. | |
5153 | */ | |
6d2010ae A |
5154 | |
5155 | /* | |
c3c9b80d A |
5156 | * Called from zfree() to add the element being freed to the KASan quarantine. |
5157 | * | |
5158 | * Returns true if the newly-freed element made it into the quarantine without | |
5159 | * displacing another, false otherwise. In the latter case, addrp points to the | |
5160 | * address of the displaced element, which will be freed by the zone. | |
6d2010ae | 5161 | */ |
c3c9b80d A |
5162 | static bool |
5163 | kasan_quarantine_freed_element( | |
5164 | zone_t *zonep, /* the zone the element is being freed to */ | |
5165 | void **addrp) /* address of the element being freed */ | |
6d2010ae | 5166 | { |
c3c9b80d A |
5167 | zone_t zone = *zonep; |
5168 | void *addr = *addrp; | |
6d2010ae | 5169 | |
c3c9b80d A |
5170 | /* |
5171 | * Resize back to the real allocation size and hand off to the KASan | |
5172 | * quarantine. `addr` may then point to a different allocation, if the | |
5173 | * current element replaced another in the quarantine. The zone then | |
5174 | * takes ownership of the swapped out free element. | |
5175 | */ | |
5176 | vm_size_t usersz = zone_elem_size(zone) - 2 * zone->z_kasan_redzone; | |
5177 | vm_size_t sz = usersz; | |
6d2010ae | 5178 | |
c3c9b80d A |
5179 | if (addr && zone->z_kasan_redzone) { |
5180 | kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC); | |
5181 | addr = (void *)kasan_dealloc((vm_address_t)addr, &sz); | |
5182 | assert(sz == zone_elem_size(zone)); | |
5183 | } | |
5184 | if (addr && !zone->kasan_noquarantine) { | |
5185 | kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true); | |
5186 | if (!addr) { | |
5187 | return TRUE; | |
5188 | } | |
5189 | } | |
5190 | if (addr && zone->kasan_noquarantine) { | |
5191 | kasan_unpoison(addr, zone_elem_size(zone)); | |
5192 | } | |
5193 | *addrp = addr; | |
5194 | return FALSE; | |
6d2010ae A |
5195 | } |
5196 | ||
c3c9b80d | 5197 | #endif /* KASAN_ZALLOC */ |
39037602 | 5198 | |
c3c9b80d A |
5199 | __header_always_inline void |
5200 | zfree_drop(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze, | |
5201 | bool recirc) | |
5ba3f43e | 5202 | { |
c3c9b80d | 5203 | vm_offset_t esize = zone_elem_size(zone); |
5ba3f43e | 5204 | |
c3c9b80d A |
5205 | if (zone_meta_mark_free(meta, ze) == recirc) { |
5206 | zone_meta_double_free_panic(zone, ze, __func__); | |
5207 | } | |
5ba3f43e | 5208 | |
c3c9b80d A |
5209 | vm_offset_t old_size = meta->zm_alloc_size; |
5210 | vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK; | |
5211 | vm_offset_t new_size = zone_meta_alloc_size_sub(zone, meta, esize); | |
5ba3f43e | 5212 | |
c3c9b80d A |
5213 | if (new_size == 0) { |
5214 | /* whether the page was on the intermediate or all_used queue, move it to free */ |
5215 | zone_meta_requeue(zone, &zone->z_pageq_empty, meta); | |
5216 | zone->z_wired_empty += meta->zm_chunk_len; | |
5217 | } else if (old_size + esize > max_size) { | |
5218 | /* first free element on page, move from all_used */ | |
5219 | zone_meta_requeue(zone, &zone->z_pageq_partial, meta); | |
5220 | } | |
f427ee49 | 5221 | } |
d9a64523 | 5222 | |
d9a64523 | 5223 | static void |
c3c9b80d | 5224 | zfree_item(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze) |
f427ee49 | 5225 | { |
c3c9b80d A |
5226 | /* transfer preemption count to lock */ |
5227 | zone_lock_nopreempt_check_contention(zone, NULL); | |
d9a64523 | 5228 | |
c3c9b80d A |
5229 | zfree_drop(zone, meta, ze, false); |
5230 | zone_elems_free_add(zone, 1); | |
5231 | ||
5232 | zone_unlock(zone); | |
d9a64523 A |
5233 | } |
5234 | ||
c3c9b80d A |
5235 | __attribute__((noinline)) |
5236 | static void | |
5237 | zfree_cached_slow(zone_t zone, struct zone_page_metadata *meta, | |
5238 | zone_element_t ze, zone_cache_t cache) | |
d9a64523 | 5239 | { |
c3c9b80d A |
5240 | struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags); |
5241 | zone_magazine_t mag = NULL; | |
5242 | uint16_t n = 0; | |
5243 | ||
5244 | if (zone_meta_is_free(meta, ze)) { | |
5245 | zone_meta_double_free_panic(zone, ze, __func__); | |
d9a64523 | 5246 | } |
d9a64523 | 5247 | |
c3c9b80d A |
5248 | if (zone == zc_magazine_zone) { |
5249 | mag = (zone_magazine_t)zone_element_addr(ze, | |
5250 | zone_elem_size(zone)); | |
5251 | #if KASAN_ZALLOC | |
5252 | kasan_poison_range((vm_offset_t)mag, zone_elem_size(zone), | |
5253 | ASAN_VALID); | |
5254 | #endif | |
5255 | } else { | |
5256 | mag = zone_magazine_alloc(Z_NOWAIT); | |
5257 | if (__improbable(mag == NULL)) { | |
5258 | return zfree_item(zone, meta, ze); | |
5259 | } | |
5260 | mag->zm_cur = 1; | |
5261 | mag->zm_elems[0] = ze; | |
f427ee49 | 5262 | } |
d9a64523 | 5263 | |
c3c9b80d A |
5264 | mag = zone_magazine_replace(&cache->zc_free_cur, |
5265 | &cache->zc_free_elems, mag); | |
7ddcb079 | 5266 | |
c3c9b80d A |
5267 | z_debug_assert(cache->zc_free_cur <= 1); |
5268 | z_debug_assert(mag->zm_cur == zc_mag_size()); | |
5ba3f43e | 5269 | |
c3c9b80d A |
5270 | STAILQ_INSERT_HEAD(&mags, mag, zm_link); |
5271 | n = 1; | |
5ba3f43e | 5272 | |
c3c9b80d | 5273 | if (cache->zc_depot_max >= 2 * zc_mag_size()) { |
5ba3f43e | 5274 | /* |
c3c9b80d A |
5275 | * If we can use the local depot (zc_depot_max allows for |
5276 | * 2 magazines worth of elements) then: | |
5277 | * | |
5278 | * 1. if we have space for an extra depot locally, | |
5279 | * push it, and leave. | |
5280 | * | |
5281 | * 2. if we overflow, then take (1 / zc_recirc_denom) | |
5282 | * of the depot out, in order to migrate it to the | |
5283 | * recirculation depot. | |
5ba3f43e | 5284 | */ |
c3c9b80d | 5285 | zone_depot_lock_nopreempt(cache); |
5ba3f43e | 5286 | |
c3c9b80d A |
5287 | if ((cache->zc_depot_cur + 2) * zc_mag_size() <= |
5288 | cache->zc_depot_max) { | |
5289 | cache->zc_depot_cur++; | |
5290 | STAILQ_INSERT_TAIL(&cache->zc_depot, mag, zm_link); | |
5291 | return zone_depot_unlock(cache); | |
f427ee49 | 5292 | } |
c3c9b80d A |
5293 | |
5294 | while (zc_recirc_denom * cache->zc_depot_cur * zc_mag_size() >= | |
5295 | (zc_recirc_denom - 1) * cache->zc_depot_max) { | |
5296 | mag = STAILQ_FIRST(&cache->zc_depot); | |
5297 | STAILQ_REMOVE_HEAD(&cache->zc_depot, zm_link); | |
5298 | STAILQ_INSERT_TAIL(&mags, mag, zm_link); | |
5299 | cache->zc_depot_cur--; | |
5300 | n++; | |
f427ee49 | 5301 | } |
c3c9b80d A |
5302 | |
5303 | zone_depot_unlock(cache); | |
f427ee49 | 5304 | } else { |
c3c9b80d A |
5305 | enable_preemption(); |
5306 | } | |
5ba3f43e | 5307 | |
c3c9b80d A |
5308 | /* |
5309 | * Preflight validity of all the elements before we touch the zone | |
5310 | * metadata, and then insert them into the recirculation depot. | |
5311 | */ | |
5312 | STAILQ_FOREACH(mag, &mags, zm_link) { | |
5313 | for (uint16_t i = 0; i < zc_mag_size(); i++) { | |
5314 | zone_element_validate(zone, mag->zm_elems[i]); | |
f427ee49 | 5315 | } |
c3c9b80d | 5316 | } |
1c79356b | 5317 | |
c3c9b80d | 5318 | zone_lock_check_contention(zone, cache); |
39236c6e | 5319 | |
c3c9b80d A |
5320 | STAILQ_FOREACH(mag, &mags, zm_link) { |
5321 | for (uint16_t i = 0; i < zc_mag_size(); i++) { | |
5322 | zone_element_t e = mag->zm_elems[i]; | |
5323 | ||
5324 | if (!zone_meta_mark_free(zone_meta_from_element(e), e)) { | |
5325 | zone_meta_double_free_panic(zone, e, __func__); | |
5326 | } | |
5327 | } | |
f427ee49 | 5328 | } |
c3c9b80d A |
5329 | STAILQ_CONCAT(&zone->z_recirc, &mags); |
5330 | zone->z_recirc_cur += n; | |
5ba3f43e | 5331 | |
c3c9b80d | 5332 | zone_elems_free_add(zone, n * zc_mag_size()); |
39236c6e | 5333 | |
c3c9b80d | 5334 | zone_unlock(zone); |
f427ee49 | 5335 | } |
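/*
 * Hypothetical walk-through of the depot overflow policy above, using
 * made-up tuning values (zc_mag_size() == 8 elements per magazine,
 * zc_depot_max == 64 elements, zc_recirc_denom == 4). Starting with 7
 * magazines in the local depot, the freshly filled magazine cannot be
 * kept locally, and magazines are drained until the depot holds 5, so
 * 3 magazines in total are pushed on the global z_recirc list.
 */
static unsigned
zfree_depot_migration_example(void)
{
	const unsigned mag_size = 8, depot_max = 64, recirc_denom = 4;
	unsigned depot_cur = 7;       /* magazines already in the local depot */
	unsigned migrated = 1;        /* the freshly filled magazine */

	if ((depot_cur + 2) * mag_size > depot_max) {
		while (recirc_denom * depot_cur * mag_size >=
		    (recirc_denom - 1) * depot_max) {
			depot_cur--;
			migrated++;
		}
	}
	return migrated;              /* 3 */
}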
5ba3f43e | 5336 | |
f427ee49 | 5337 | static void |
c3c9b80d | 5338 | zfree_cached(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze) |
f427ee49 | 5339 | { |
c3c9b80d A |
5340 | zone_cache_t cache = zpercpu_get(zone->z_pcpu_cache); |
5341 | ||
5342 | if (cache->zc_free_cur >= zc_mag_size()) { | |
5343 | if (cache->zc_alloc_cur >= zc_mag_size()) { | |
5344 | return zfree_cached_slow(zone, meta, ze, cache); | |
5345 | } | |
5346 | zone_cache_swap_magazines(cache); | |
5347 | } | |
5348 | ||
5349 | if (__improbable(cache->zc_alloc_elems == NULL)) { | |
5350 | return zfree_item(zone, meta, ze); | |
5351 | } | |
5352 | ||
5353 | if (zone_meta_is_free(meta, ze)) { | |
5354 | zone_meta_double_free_panic(zone, ze, __func__); | |
5355 | } | |
5356 | ||
5357 | uint16_t idx = cache->zc_free_cur++; | |
5358 | if (idx >= zc_mag_size()) { | |
5359 | zone_accounting_panic(zone, "zc_free_cur overflow"); | |
1c79356b | 5360 | } |
c3c9b80d A |
5361 | cache->zc_free_elems[idx] = ze; |
5362 | ||
5363 | enable_preemption(); | |
5364 | } | |
5ba3f43e | 5365 | |
f427ee49 | 5366 | /* |
c3c9b80d A |
5367 | * The function is noinline when zlog can be used so that backtracing can |
5368 | * reliably skip the uninteresting zfree_ext() and zfree_log_trace() |
5369 | * frames. |
f427ee49 | 5370 | */ |
c3c9b80d A |
5371 | #if ZONE_ENABLE_LOGGING |
5372 | __attribute__((noinline)) | |
5373 | #endif /* ZONE_ENABLE_LOGGING */ | |
5374 | void | |
5375 | zfree_ext(zone_t zone, zone_stats_t zstats, void *addr) | |
f427ee49 | 5376 | { |
c3c9b80d A |
5377 | struct zone_page_metadata *page_meta; |
5378 | vm_offset_t elem = (vm_offset_t)addr; | |
5379 | vm_size_t elem_size = zone_elem_size(zone); | |
5380 | zone_element_t ze; | |
5381 | ||
5382 | DTRACE_VM2(zfree, zone_t, zone, void*, addr); | |
5383 | TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem); | |
5384 | #if VM_MAX_TAG_ZONES | |
5385 | if (__improbable(zone->tags)) { | |
5386 | vm_tag_t tag = *ztSlot(zone, elem) >> 1; | |
5387 | // set the tag with b0 clear so the block remains inuse | |
5388 | *ztSlot(zone, elem) = 0xFFFE; | |
5389 | vm_tag_update_zone_size(tag, zone->tag_zone_index, | |
5390 | -(long)elem_size); | |
5391 | } | |
5392 | #endif /* VM_MAX_TAG_ZONES */ | |
5393 | ||
5394 | #if KASAN_ZALLOC | |
5395 | if (kasan_quarantine_freed_element(&zone, &addr)) { | |
5396 | return; | |
5397 | } | |
5398 | /* | |
5399 | * kasan_quarantine_freed_element() might return a different | |
5400 | * {zone, addr} than the one being freed for kalloc heaps. | |
5401 | * | |
5402 | * Make sure we reload everything. | |
5403 | */ | |
5404 | elem = (vm_offset_t)addr; | |
5405 | elem_size = zone_elem_size(zone); | |
5406 | #endif | |
5407 | #if CONFIG_ZLEAKS | |
5ba3f43e | 5408 | /* |
c3c9b80d | 5409 | * Zone leak detection: un-track the allocation |
5ba3f43e | 5410 | */ |
c3c9b80d A |
5411 | if (__improbable(zone->zleak_on)) { |
5412 | zleak_free(elem, elem_size); | |
5413 | } | |
5414 | #endif /* CONFIG_ZLEAKS */ | |
5415 | #if ZONE_ENABLE_LOGGING | |
5416 | if (__improbable(DO_LOGGING(zone))) { | |
5417 | zfree_log_trace(zone, elem, __builtin_frame_address(0)); | |
5418 | } | |
5419 | #endif /* ZONE_ENABLE_LOGGING */ | |
5420 | #if CONFIG_GZALLOC | |
5421 | if (__improbable(zone->gzalloc_tracked)) { | |
5422 | return gzalloc_free(zone, zstats, addr); | |
5ba3f43e | 5423 | } |
c3c9b80d | 5424 | #endif /* CONFIG_GZALLOC */ |
1c79356b | 5425 | |
c3c9b80d A |
5426 | page_meta = zone_element_resolve(zone, elem, elem_size, &ze); |
5427 | ze.ze_value |= zfree_clear_or_poison(zone, elem, elem_size); | |
f427ee49 | 5428 | #if KASAN_ZALLOC |
c3c9b80d A |
5429 | if (zone->z_percpu) { |
5430 | zpercpu_foreach_cpu(i) { | |
5431 | kasan_poison_range(elem + ptoa(i), elem_size, | |
5432 | ASAN_HEAP_FREED); | |
39037602 | 5433 | } |
f427ee49 | 5434 | } else { |
c3c9b80d | 5435 | kasan_poison_range(elem, elem_size, ASAN_HEAP_FREED); |
f427ee49 A |
5436 | } |
5437 | #endif | |
c3c9b80d A |
5438 | |
5439 | disable_preemption(); | |
5440 | zpercpu_get(zstats)->zs_mem_freed += elem_size; | |
5441 | ||
5442 | if (zone->z_pcpu_cache) { | |
5443 | return zfree_cached(zone, page_meta, ze); | |
5444 | } | |
5445 | ||
5446 | return zfree_item(zone, page_meta, ze); | |
f427ee49 A |
5447 | } |
5448 | ||
c3c9b80d A |
5449 | void |
5450 | (zfree)(union zone_or_view zov, void *addr) | |
f427ee49 | 5451 | { |
c3c9b80d A |
5452 | zone_t zone = zov.zov_view->zv_zone; |
5453 | zone_stats_t zstats = zov.zov_view->zv_stats; | |
5454 | assert(!zone->z_percpu); | |
5455 | zfree_ext(zone, zstats, addr); | |
f427ee49 | 5456 | } |
39037602 | 5457 | |
c3c9b80d A |
5458 | void |
5459 | zfree_percpu(union zone_or_view zov, void *addr) | |
f427ee49 | 5460 | { |
c3c9b80d A |
5461 | zone_t zone = zov.zov_view->zv_zone; |
5462 | zone_stats_t zstats = zov.zov_view->zv_stats; | |
5463 | assert(zone->z_percpu); | |
5464 | zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr)); | |
f427ee49 | 5465 | } |
39037602 | 5466 | |
c3c9b80d A |
5467 | /*! @} */ |
5468 | #endif /* !ZALLOC_TEST */ | |
5469 | #pragma mark zalloc | |
5470 | #if !ZALLOC_TEST | |
5471 | ||
5472 | /*! | |
5473 | * @defgroup zalloc | |
5474 | * @{ | |
5475 | * | |
5476 | * @brief | |
5477 | * The codepath for zone allocations. | |
5478 | * | |
5479 | * @discussion | |
5480 | * There are 4 major ways to allocate memory that end up in the zone allocator: | |
5481 | * - @c zalloc(), @c zalloc_flags(), ... | |
5482 | * - @c zalloc_percpu() | |
5483 | * - @c kalloc*() | |
5484 | * - @c zalloc_permanent() | |
5485 | * | |
5486 | * While permanent zones have their own allocation scheme, all other codepaths | |
5487 | * will eventually go through the @c zalloc_ext() choking point. | |
5488 | * | |
5489 | * Ignoring the @c zalloc_gz() codepath, the decision tree looks like this: | |
5490 | * <code> | |
5491 | * zalloc_ext() | |
5492 | * │ | |
5493 | * ├───> zalloc_cached() ──────> zalloc_cached_fast() ───╮ | |
5494 | * │ │ ^ │ | |
5495 | * │ │ │ │ | |
5496 | * │ ╰───> zalloc_cached_slow() ───╯ │ | |
5497 | * │ │ │ | |
5498 | * │<─────────────────╮ ├─────────────╮ │ | |
5499 | * │ │ │ │ │ | |
5500 | * │ │ v │ │ | |
5501 | * │<───────╮ ╰──> zalloc_item_slow() ────┤ │ |
5502 | * │ │ │ │ │ | |
5503 | * │ │ │ v │ | |
5504 | * ╰───> zalloc_item() ──────────> zalloc_item_fast() ───┤ | |
5505 | * │ | |
5506 | * v | |
5507 | * zalloc_return() | |
5508 | * </code> | |
5509 | * | |
5510 | * | |
5511 | * The @c zalloc_item() track is used when zone caching is off: | |
5512 | * - @c zalloc_item_fast() is used when there are enough elements available, | |
5513 | * - @c zalloc_item_slow() is used when a refill is needed, which can cause | |
5514 | * the zone to grow. This is the only codepath that refills. | |
5515 | * | |
5516 | * This track uses the zone lock for serialization: | |
5517 | * - taken in @c zalloc_item(), | |
5518 | * - maintained during @c zalloc_item_slow() (possibly dropped and re-taken), | |
5519 | * - dropped in @c zalloc_item_fast(). | |
5520 | * | |
5521 | * | |
5522 | * The @c zalloc_cached() track is used when zone caching is on: | |
5523 | * - @c zalloc_cached_fast() is taken when the cache has elements, | |
5524 | * - @c zalloc_cached_slow() is taken if a cache refill is needed. | |
5525 | * It can choose among many strategies: |
5526 | * ~ @c zalloc_cached_from_depot() to try to reuse cpu stashed magazines, | |
5527 | * ~ using the global recirculation depot @c z_recirc, | |
5528 | * ~ using zalloc_import() if the zone has enough elements, | |
5529 | * ~ falling back to the @c zalloc_item() track if zone caching is disabled | |
5530 | * due to VM pressure or the zone has no available elements. | |
5531 | * | |
5532 | * This track disables preemption for serialization: | |
5533 | * - preemption is disabled in @c zalloc_cached(), | |
5534 | * - kept disabled during @c zalloc_cached_slow(), converted into a zone lock | |
5535 | * if switching to @c zalloc_item_slow(), | |
5536 | * - preemption is reenabled in @c zalloc_cached_fast(). | |
5537 | * | |
5538 | * @c zalloc_cached_from_depot() also takes depot locks (taken by the caller, | |
5539 | * released by @c zalloc_cached_from_depot()). |
5540 | * | |
5541 | * In general the @c zalloc_*_slow() codepaths deal with refilling and will | |
5542 | * tail call into the @c zalloc_*_fast() code to perform the actual allocation. | |
5543 | * | |
5544 | * @c zalloc_return() is the final function everyone tail calls into, | |
5545 | * which prepares the element for consumption by the caller and deals with | |
5546 | * common treatment (zone logging, tags, kasan, validation, ...). | |
5547 | */ | |
5548 | ||
5549 | /*! | |
5550 | * @function zalloc_import | |
5551 | * | |
5552 | * @brief | |
5553 | * Import @c n elements in the specified array, opposite of @c zfree_drop(). | |
5554 | * | |
5555 | * @param zone The zone to import elements from | |
5556 | * @param elems The array to import into | |
5557 | * @param n The number of elements to import. Must be non zero, | |
5558 | * and smaller than @c zone->z_elems_free. | |
5559 | */ | |
5560 | __header_always_inline void | |
5561 | zalloc_import(zone_t zone, zone_element_t *elems, uint32_t n) | |
f427ee49 | 5562 | { |
c3c9b80d A |
5563 | vm_size_t esize = zone_elem_size(zone); |
5564 | uint32_t i = 0; | |
f427ee49 | 5565 | |
c3c9b80d A |
5566 | assertf(STAILQ_EMPTY(&zone->z_recirc), |
5567 | "Trying to import from zone %p [%s%s] with non empty recirc", | |
5568 | zone, zone_heap_name(zone), zone_name(zone)); | |
f427ee49 | 5569 | |
c3c9b80d A |
5570 | do { |
5571 | vm_offset_t page, eidx, size = 0; | |
5572 | struct zone_page_metadata *meta; | |
5573 | ||
5574 | if (!zone_pva_is_null(zone->z_pageq_partial)) { | |
5575 | meta = zone_pva_to_meta(zone->z_pageq_partial); | |
5576 | page = zone_pva_to_addr(zone->z_pageq_partial); | |
5577 | } else if (!zone_pva_is_null(zone->z_pageq_empty)) { | |
5578 | meta = zone_pva_to_meta(zone->z_pageq_empty); | |
5579 | page = zone_pva_to_addr(zone->z_pageq_empty); | |
5580 | zone_counter_sub(zone, z_wired_empty, meta->zm_chunk_len); | |
5581 | } else { | |
5582 | zone_accounting_panic(zone, "z_elems_free corruption"); | |
5583 | } | |
f427ee49 | 5584 | |
c3c9b80d A |
5585 | if (!zone_has_index(zone, meta->zm_index)) { |
5586 | zone_page_metadata_index_confusion_panic(zone, page, meta); | |
5587 | } | |
f427ee49 | 5588 | |
c3c9b80d A |
5589 | vm_offset_t old_size = meta->zm_alloc_size; |
5590 | vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK; | |
5591 | ||
5592 | do { | |
5593 | eidx = zone_meta_find_and_clear_bit(zone, meta); | |
5594 | elems[i++] = zone_element_encode(page, eidx, ZPM_AUTO); | |
5595 | size += esize; | |
5596 | } while (i < n && old_size + size + esize <= max_size); | |
5597 | ||
5598 | vm_offset_t new_size = zone_meta_alloc_size_add(zone, meta, size); | |
5599 | ||
5600 | if (new_size + esize > max_size) { | |
5601 | zone_meta_requeue(zone, &zone->z_pageq_full, meta); | |
5602 | } else if (old_size == 0) { | |
5603 | /* remove from free, move to intermediate */ | |
5604 | zone_meta_requeue(zone, &zone->z_pageq_partial, meta); | |
5605 | } | |
5606 | } while (i < n); | |
5607 | } | |
5608 | ||
5609 | /*! | |
5610 | * @function zalloc_return | |
5611 | * | |
5612 | * @brief | |
5613 | * Performs the tail-end of the work required on allocations before the caller | |
5614 | * uses them. | |
5615 | * | |
5616 | * @discussion | |
5617 | * This function is called without any zone lock held, |
5618 | * and with preemption restored to the state it had when @c zalloc_ext() was called. |
5619 | * | |
5620 | * @param zone The zone we're allocating from. | |
5621 | * @param ze The encoded element we just allocated. | |
5622 | * @param flags The flags passed to @c zalloc_ext() (for Z_ZERO). | |
5623 | * @param elem_size The element size for this zone. | |
5624 | * @param freemag An optional magazine that needs to be freed. | |
5625 | */ | |
5626 | __attribute__((noinline)) | |
5627 | static void * | |
5628 | zalloc_return(zone_t zone, zone_element_t ze, zalloc_flags_t flags, | |
5629 | vm_offset_t elem_size, zone_magazine_t freemag) | |
5630 | { | |
5631 | vm_offset_t addr = zone_element_addr(ze, elem_size); | |
5632 | ||
5633 | #if KASAN_ZALLOC | |
5634 | if (zone->z_percpu) { | |
5635 | zpercpu_foreach_cpu(i) { | |
5636 | kasan_poison_range(addr + ptoa(i), elem_size, | |
5637 | ASAN_VALID); | |
5638 | } | |
f427ee49 | 5639 | } else { |
c3c9b80d A |
5640 | kasan_poison_range(addr, elem_size, ASAN_VALID); |
5641 | } | |
5642 | #endif | |
5643 | #if ZALLOC_ENABLE_POISONING | |
5644 | zalloc_validate_element(zone, addr, elem_size, zone_element_prot(ze)); | |
5645 | #endif /* ZALLOC_ENABLE_POISONING */ | |
5646 | #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS | |
5647 | if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) { | |
5648 | zalloc_log_or_trace_leaks(zone, addr, __builtin_frame_address(0)); | |
5649 | } | |
5650 | #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */ | |
5651 | #if KASAN_ZALLOC | |
5652 | if (zone->z_kasan_redzone) { | |
5653 | addr = kasan_alloc(addr, elem_size, | |
5654 | elem_size - 2 * zone->z_kasan_redzone, | |
5655 | zone->z_kasan_redzone); | |
5656 | elem_size -= 2 * zone->z_kasan_redzone; | |
f427ee49 A |
5657 | } |
5658 | /* | |
c3c9b80d A |
5659 | * Initialize buffer with unique pattern only if memory |
5660 | * wasn't expected to be zeroed. | |
f427ee49 | 5661 | */ |
c3c9b80d A |
5662 | if (!zone->z_free_zeroes && !(flags & Z_ZERO)) { |
5663 | kasan_leak_init(addr, elem_size); | |
5664 | } | |
5665 | #endif /* KASAN_ZALLOC */ | |
5666 | if ((flags & Z_ZERO) && !zone->z_free_zeroes) { | |
5667 | bzero((void *)addr, elem_size); | |
f427ee49 A |
5668 | } |
5669 | ||
c3c9b80d A |
5670 | #if VM_MAX_TAG_ZONES |
5671 | if (__improbable(zone->tags)) { | |
5672 | vm_tag_t tag = zalloc_flags_get_tag(flags); | |
5673 | if (tag == VM_KERN_MEMORY_NONE) { | |
5674 | tag = VM_KERN_MEMORY_KALLOC; | |
5675 | } | |
5676 | // set the tag with b0 clear so the block remains inuse | |
5677 | *ztSlot(zone, addr) = (vm_tag_t)(tag << 1); | |
5678 | vm_tag_update_zone_size(tag, zone->tag_zone_index, | |
5679 | (long)elem_size); | |
5680 | } | |
5681 | #endif /* VM_MAX_TAG_ZONES */ | |
39037602 | 5682 | |
c3c9b80d A |
5683 | TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr); |
5684 | DTRACE_VM2(zalloc, zone_t, zone, void*, addr); | |
5685 | if (freemag) { | |
5686 | zone_magazine_free(freemag); | |
f427ee49 | 5687 | } |
c3c9b80d A |
5688 | return (void *)addr; |
5689 | } | |
39037602 | 5690 | |
c3c9b80d A |
5691 | #if CONFIG_GZALLOC |
5692 | /*! | |
5693 | * @function zalloc_gz | |
5694 | * | |
5695 | * @brief | |
5696 | * Performs allocations for zones using gzalloc. | |
5697 | * | |
5698 | * @discussion | |
5699 | * This function is noinline so that it doesn't affect the codegen | |
5700 | * of the fastpath. | |
5701 | */ | |
5702 | __attribute__((noinline)) | |
5703 | static void * | |
5704 | zalloc_gz(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) | |
5705 | { | |
5706 | vm_offset_t addr = gzalloc_alloc(zone, zstats, flags); | |
5707 | return zalloc_return(zone, zone_element_encode(addr, 0, ZPM_AUTO), | |
5708 | flags, zone_elem_size(zone), NULL); | |
5709 | } | |
5710 | #endif /* CONFIG_GZALLOC */ | |
5711 | ||
5712 | static void * | |
5713 | zalloc_item_fast(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) | |
5714 | { | |
5715 | vm_size_t esize = zone_elem_size(zone); | |
5716 | zone_element_t ze; | |
5717 | ||
5718 | zalloc_import(zone, &ze, 1); | |
5719 | zone_elems_free_sub(zone, 1); | |
5720 | zpercpu_get(zstats)->zs_mem_allocated += esize; | |
5721 | zone_unlock(zone); | |
5722 | ||
5723 | return zalloc_return(zone, ze, flags, esize, NULL); | |
5724 | } | |
5725 | ||
5726 | /*! | |
5727 | * @function zalloc_item_slow | |
5728 | * | |
5729 | * @brief | |
5730 | * Performs allocations when the zone is out of elements. | |
5731 | * | |
5732 | * @discussion | |
5733 | * This function might drop the lock and reenable preemption, | |
5734 | * which means the per-CPU caching layer or recirculation depot | |
5735 | * might have received elements. | |
5736 | */ | |
5737 | __attribute__((noinline)) | |
5738 | static void * | |
5739 | zalloc_item_slow(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) | |
5740 | { | |
5741 | if (zone->z_replenishes) { | |
5742 | zone_replenish_locked(zone); | |
5743 | } else { | |
5744 | if ((flags & Z_NOWAIT) == 0) { | |
5745 | zone_expand_locked(zone, flags, zalloc_needs_refill); | |
5746 | } | |
5747 | if (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) { | |
5748 | zone_expand_async_schedule_if_needed(zone); | |
5749 | } | |
5750 | if (__improbable(zone->z_elems_free == 0)) { | |
5751 | zone_unlock(zone); | |
5752 | if (__improbable(flags & Z_NOFAIL)) { | |
5753 | zone_nofail_panic(zone); | |
5754 | } | |
5755 | DTRACE_VM2(zalloc, zone_t, zone, void*, NULL); | |
5756 | return NULL; | |
5757 | } | |
f427ee49 | 5758 | } |
39037602 | 5759 | |
f427ee49 | 5760 | /* |
c3c9b80d A |
5761 | * We might have changed core or got preempted/blocked while expanding |
5762 | * the zone. Allocating from the zone when the recirculation depot | |
5763 | * is not empty is not allowed. | |
5764 | * | |
5765 | * It will be rare but possible for the depot to refill while we were | |
5766 | * waiting for pages. If that happens we need to start over. | |
f427ee49 | 5767 | */ |
c3c9b80d A |
5768 | if (!STAILQ_EMPTY(&zone->z_recirc)) { |
5769 | zone_unlock(zone); | |
5770 | return zalloc_ext(zone, zstats, flags); | |
f427ee49 | 5771 | } |
39037602 | 5772 | |
c3c9b80d A |
5773 | return zalloc_item_fast(zone, zstats, flags); |
5774 | } | |
5775 | ||
5776 | /*! | |
5777 | * @function zalloc_item | |
5778 | * | |
5779 | * @brief | |
5780 | * Performs allocations when zone caching is off. | |
5781 | * | |
5782 | * @discussion | |
5783 | * This function calls @c zalloc_item_slow() when refilling the zone | |
5784 | * is needed, or @c zalloc_item_fast() if the zone has enough free elements. | |
5785 | */ | |
5786 | static void * | |
5787 | zalloc_item(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) | |
5788 | { | |
5789 | zone_lock_check_contention(zone, NULL); | |
5ba3f43e | 5790 | |
f427ee49 | 5791 | /* |
c3c9b80d A |
5792 | * When we committed to the zalloc_item() path, |
5793 | * zone caching might have been flipped/enabled. | |
5794 | * | |
5795 | * If we got preempted for long enough, the recirculation layer | |
5796 | * may have been populated, and allocating from the zone would be |
5797 | * incorrect. | |
5798 | * | |
5799 | * So double check for this extremely rare race here. | |
f427ee49 | 5800 | */ |
c3c9b80d A |
5801 | if (__improbable(!STAILQ_EMPTY(&zone->z_recirc))) { |
5802 | zone_unlock(zone); | |
5803 | return zalloc_ext(zone, zstats, flags); | |
f427ee49 | 5804 | } |
c3c9b80d A |
5805 | |
5806 | if (__improbable(zone->z_elems_free <= zone->z_elems_rsv)) { | |
5807 | return zalloc_item_slow(zone, zstats, flags); | |
f427ee49 | 5808 | } |
c3c9b80d A |
5809 | |
5810 | return zalloc_item_fast(zone, zstats, flags); | |
5811 | } | |
5812 | ||
5813 | static void * | |
5814 | zalloc_cached_fast(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags, | |
5815 | zone_cache_t cache, zone_magazine_t freemag) | |
5816 | { | |
5817 | vm_offset_t esize = zone_elem_size(zone); | |
5818 | zone_element_t ze; | |
5819 | uint32_t index; | |
5820 | ||
5821 | index = --cache->zc_alloc_cur; | |
5822 | if (index >= zc_mag_size()) { | |
5823 | zone_accounting_panic(zone, "zc_alloc_cur wrap around"); | |
f427ee49 | 5824 | } |
c3c9b80d A |
5825 | ze = cache->zc_alloc_elems[index]; |
5826 | cache->zc_alloc_elems[index].ze_value = 0; | |
f427ee49 | 5827 | |
c3c9b80d A |
5828 | zpercpu_get(zstats)->zs_mem_allocated += esize; |
5829 | enable_preemption(); | |
5830 | ||
5831 | if (zone_meta_is_free(zone_meta_from_element(ze), ze)) { | |
5832 | zone_meta_double_free_panic(zone, ze, __func__); | |
39037602 A |
5833 | } |
5834 | ||
c3c9b80d A |
5835 | return zalloc_return(zone, ze, flags, esize, freemag); |
5836 | } | |
5837 | ||
5838 | static void * | |
5839 | zalloc_cached_from_depot(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags, | |
5840 | zone_cache_t cache, zone_cache_t depot, zone_magazine_t mag) | |
5841 | { | |
5842 | STAILQ_REMOVE_HEAD(&depot->zc_depot, zm_link); | |
5843 | if (depot->zc_depot_cur-- == 0) { | |
5844 | zone_accounting_panic(zone, "zc_depot_cur wrap-around"); | |
f427ee49 | 5845 | } |
c3c9b80d A |
5846 | zone_depot_unlock_nopreempt(depot); |
5847 | ||
5848 | mag = zone_magazine_replace(&cache->zc_alloc_cur, | |
5849 | &cache->zc_alloc_elems, mag); | |
5850 | ||
5851 | z_debug_assert(cache->zc_alloc_cur == zc_mag_size()); | |
5852 | z_debug_assert(mag->zm_cur == 0); | |
5853 | ||
5854 | if (zone == zc_magazine_zone) { | |
5855 | enable_preemption(); | |
5856 | bzero(mag, zone_elem_size(zone)); | |
5857 | return mag; | |
f427ee49 | 5858 | } |
c3c9b80d A |
5859 | |
5860 | return zalloc_cached_fast(zone, zstats, flags, cache, mag); | |
5861 | } | |
5862 | ||
5863 | __attribute__((noinline)) | |
5864 | static void * | |
5865 | zalloc_cached_slow(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags, | |
5866 | zone_cache_t cache) | |
5867 | { | |
5868 | zone_magazine_t mag = NULL; | |
5869 | struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags); | |
5ba3f43e | 5870 | |
f427ee49 | 5871 | /* |
c3c9b80d | 5872 | * Try to allocate from our local depot, if there's one. |
f427ee49 | 5873 | */ |
c3c9b80d A |
5874 | if (STAILQ_FIRST(&cache->zc_depot)) { |
5875 | zone_depot_lock_nopreempt(cache); | |
5876 | ||
5877 | if ((mag = STAILQ_FIRST(&cache->zc_depot)) != NULL) { | |
5878 | return zalloc_cached_from_depot(zone, zstats, flags, | |
5879 | cache, cache, mag); | |
5880 | } | |
5881 | ||
5882 | zone_depot_unlock_nopreempt(cache); | |
d9a64523 | 5883 | } |
c3c9b80d A |
5884 | |
5885 | zone_lock_nopreempt_check_contention(zone, cache); | |
5886 | ||
f427ee49 | 5887 | /* |
c3c9b80d A |
5888 | * If the recirculation depot is empty, we'll need to import. |
5889 | * The system is tuned for this to be extremely rare. | |
f427ee49 | 5890 | */ |
c3c9b80d A |
5891 | if (__improbable(STAILQ_EMPTY(&zone->z_recirc))) { |
5892 | uint16_t n_elems = zc_mag_size(); | |
f427ee49 | 5893 | |
c3c9b80d A |
5894 | if (zone->z_elems_free < n_elems + zone->z_elems_rsv / 2 && |
5895 | os_sub_overflow(zone->z_elems_free, | |
5896 | zone->z_elems_rsv / 2, &n_elems)) { | |
5897 | n_elems = 0; | |
5898 | } | |
f427ee49 | 5899 | |
c3c9b80d A |
5900 | z_debug_assert(n_elems <= zc_mag_size()); |
5901 | ||
5902 | if (__improbable(n_elems == 0)) { | |
5903 | /* | |
5904 | * If importing elements would deplete the zone, | |
5905 | * call zalloc_item_slow() | |
5906 | */ | |
5907 | return zalloc_item_slow(zone, zstats, flags); | |
f427ee49 | 5908 | } |
c3c9b80d A |
5909 | |
5910 | if (__improbable(zone_caching_disabled)) { | |
5911 | if (__improbable(zone_caching_disabled < 0)) { | |
5912 | /* | |
5913 | * In the first 10s after boot, mess with | |
5914 | * the scan position in order to make early | |
5915 | * allocation patterns less predictable. | |
5916 | */ | |
5917 | zone_early_scramble_rr(zone, zstats); | |
5918 | } | |
5919 | return zalloc_item_fast(zone, zstats, flags); | |
5920 | } | |
5921 | ||
5922 | zalloc_import(zone, cache->zc_alloc_elems, n_elems); | |
5923 | ||
5924 | cache->zc_alloc_cur = n_elems; | |
5925 | zone_elems_free_sub(zone, n_elems); | |
5926 | ||
5927 | zone_unlock_nopreempt(zone); | |
5928 | ||
5929 | return zalloc_cached_fast(zone, zstats, flags, cache, NULL); | |
f427ee49 | 5930 | } |
f427ee49 | 5931 | |
c3c9b80d | 5932 | uint16_t n_mags = 0; |
f427ee49 | 5933 | |
c3c9b80d A |
5934 | /* |
5935 | * If the recirculation depot has elements, then try to fill | |
5936 | * the local per-cpu depot up to (zc_depot_max / zc_recirc_denom) elements. | |
5937 | */ | |
5938 | do { | |
5939 | mag = STAILQ_FIRST(&zone->z_recirc); | |
5940 | STAILQ_REMOVE_HEAD(&zone->z_recirc, zm_link); | |
5941 | STAILQ_INSERT_TAIL(&mags, mag, zm_link); | |
5942 | n_mags++; | |
5943 | ||
5944 | for (uint16_t i = 0; i < zc_mag_size(); i++) { | |
5945 | zone_element_t e = mag->zm_elems[i]; | |
5946 | ||
5947 | if (!zone_meta_mark_used(zone_meta_from_element(e), e)) { | |
5948 | zone_meta_double_free_panic(zone, e, __func__); | |
5949 | } | |
5950 | } | |
5951 | } while (!STAILQ_EMPTY(&zone->z_recirc) && | |
5952 | zc_recirc_denom * n_mags * zc_mag_size() <= cache->zc_depot_max); | |
5953 | ||
5954 | zone_elems_free_sub(zone, n_mags * zc_mag_size()); | |
5955 | zone_counter_sub(zone, z_recirc_cur, n_mags); | |
5956 | ||
5957 | zone_unlock_nopreempt(zone); | |
5958 | ||
5959 | /* | |
5960 | * And then incorporate everything into our per-cpu layer. | |
5961 | */ | |
5962 | mag = STAILQ_FIRST(&mags); | |
5963 | STAILQ_REMOVE_HEAD(&mags, zm_link); | |
5964 | mag = zone_magazine_replace(&cache->zc_alloc_cur, | |
5965 | &cache->zc_alloc_elems, mag); | |
5966 | z_debug_assert(cache->zc_alloc_cur == zc_mag_size()); | |
5967 | z_debug_assert(mag->zm_cur == 0); | |
5968 | ||
5969 | if (--n_mags > 0) { | |
5970 | zone_depot_lock_nopreempt(cache); | |
5971 | cache->zc_depot_cur += n_mags; | |
5972 | STAILQ_CONCAT(&cache->zc_depot, &mags); | |
5973 | zone_depot_unlock_nopreempt(cache); | |
5974 | } | |
5975 | ||
5976 | return zalloc_cached_fast(zone, zstats, flags, cache, mag); | |
f427ee49 A |
5977 | } |
5978 | ||
c3c9b80d A |
5979 | /*! |
5980 | * @function zalloc_cached | |
5981 | * | |
5982 | * @brief | |
5983 | * Performs allocations when zone caching is on. | |
5984 | * | |
5985 | * @discussion | |
5986 | * This function calls @c zalloc_cached_fast() when the caches have elements | |
5987 | * ready. | |
5988 | * | |
5989 | * Otherwise it calls @c zalloc_cached_slow() to refill the cache, which | |
5990 | * might switch to the @c zalloc_item_slow() path when the backing zone | |
5991 | * itself needs to be refilled. | |
5992 | */ | |
5993 | static void * | |
5994 | zalloc_cached(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) | |
f427ee49 | 5995 | { |
c3c9b80d A |
5996 | zone_cache_t cache; |
5997 | ||
5998 | disable_preemption(); | |
5999 | cache = zpercpu_get(zone->z_pcpu_cache); | |
6000 | ||
6001 | if (cache->zc_alloc_cur == 0) { | |
6002 | if (__improbable(cache->zc_free_cur == 0)) { | |
6003 | return zalloc_cached_slow(zone, zstats, flags, cache); | |
6004 | } | |
6005 | zone_cache_swap_magazines(cache); | |
6006 | } | |
6007 | ||
6008 | return zalloc_cached_fast(zone, zstats, flags, cache, NULL); | |
f427ee49 A |
6009 | } |
6010 | ||
c3c9b80d A |
6011 | /*! |
6012 | * @function zalloc_ext | |
6013 | * | |
6014 | * @brief | |
6015 | * The core implementation of @c zalloc(), @c zalloc_flags(), @c zalloc_percpu(). | |
f427ee49 | 6016 | */ |
c3c9b80d A |
6017 | void * |
6018 | zalloc_ext(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) | |
f427ee49 | 6019 | { |
c3c9b80d A |
6020 | /* |
6021 | * KASan uses zalloc() for fakestack, which can be called anywhere. | |
6022 | * However, we make sure these calls can never block. | |
6023 | */ | |
6024 | assert(zone->kasan_fakestacks || | |
6025 | ml_get_interrupts_enabled() || | |
6026 | ml_is_quiescing() || | |
6027 | debug_mode_active() || | |
6028 | startup_phase < STARTUP_SUB_EARLY_BOOT); | |
f427ee49 | 6029 | |
c3c9b80d A |
6030 | /* |
6031 | * Make sure Z_NOFAIL was not obviously misused | |
6032 | */ | |
6033 | if (zone->z_replenishes) { | |
6034 | assert((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0); | |
6035 | } else if (flags & Z_NOFAIL) { | |
6036 | assert(!zone->exhaustible && | |
6037 | (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0); | |
f427ee49 A |
6038 | } |
6039 | ||
c3c9b80d A |
6040 | #if CONFIG_GZALLOC |
6041 | if (__improbable(zone->gzalloc_tracked)) { | |
6042 | return zalloc_gz(zone, zstats, flags); | |
f427ee49 | 6043 | } |
c3c9b80d | 6044 | #endif /* CONFIG_GZALLOC */ |
f427ee49 | 6045 | |
c3c9b80d A |
6046 | if (zone->z_pcpu_cache) { |
6047 | return zalloc_cached(zone, zstats, flags); | |
f427ee49 | 6048 | } |
c3c9b80d A |
6049 | |
6050 | return zalloc_item(zone, zstats, flags); | |
f427ee49 A |
6051 | } |
6052 | ||
c3c9b80d A |
6053 | void * |
6054 | zalloc(union zone_or_view zov) | |
f427ee49 | 6055 | { |
c3c9b80d | 6056 | return zalloc_flags(zov, Z_WAITOK); |
f427ee49 | 6057 | } |
d9a64523 | 6058 | |
c3c9b80d A |
6059 | void * |
6060 | zalloc_noblock(union zone_or_view zov) | |
6061 | { | |
6062 | return zalloc_flags(zov, Z_NOWAIT); | |
6063 | } | |
6064 | ||
6065 | void * | |
6066 | zalloc_flags(union zone_or_view zov, zalloc_flags_t flags) | |
f427ee49 | 6067 | { |
c3c9b80d A |
6068 | zone_t zone = zov.zov_view->zv_zone; |
6069 | zone_stats_t zstats = zov.zov_view->zv_stats; | |
6070 | assert(!zone->z_percpu); | |
6071 | return zalloc_ext(zone, zstats, flags); | |
39037602 | 6072 | } |
eb6b6ca3 | 6073 | |
c3c9b80d A |
6074 | void * |
6075 | zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags) | |
f427ee49 | 6076 | { |
c3c9b80d A |
6077 | zone_t zone = zov.zov_view->zv_zone; |
6078 | zone_stats_t zstats = zov.zov_view->zv_stats; | |
6079 | assert(zone->z_percpu); | |
6080 | return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags)); | |
6081 | } | |
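/*
 * Illustrative usage sketch (not part of this file): how a kernel
 * subsystem typically pairs these entry points with zone_create()
 * and zfree().  The "widgets" zone, its element type and the exact
 * ZC_/Z_ flag spellings are assumptions taken from zalloc.h, not
 * something defined here.
 *
 *	struct widget {
 *		uint64_t w_id;
 *		void    *w_data;
 *	};
 *
 *	static zone_t widget_zone;
 *
 *	static void
 *	widget_zone_init(void)
 *	{
 *		widget_zone = zone_create("widgets", sizeof(struct widget),
 *		    ZC_NONE);
 *	}
 *
 *	static struct widget *
 *	widget_alloc(void)
 *	{
 *		// Z_WAITOK may block for memory; Z_ZERO returns zeroed memory.
 *		return zalloc_flags(widget_zone, Z_WAITOK | Z_ZERO);
 *	}
 *
 *	static void
 *	widget_free(struct widget *w)
 *	{
 *		zfree(widget_zone, w);
 *	}
 */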
f427ee49 | 6082 | |
c3c9b80d A |
6083 | static void * |
6084 | _zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask) | |
6085 | { | |
6086 | struct zone_page_metadata *page_meta; | |
6087 | vm_offset_t offs, addr; | |
6088 | zone_pva_t pva; | |
f427ee49 | 6089 | |
c3c9b80d A |
6090 | assert(ml_get_interrupts_enabled() || |
6091 | ml_is_quiescing() || | |
6092 | debug_mode_active() || | |
6093 | startup_phase < STARTUP_SUB_EARLY_BOOT); | |
f427ee49 | 6094 | |
c3c9b80d A |
6095 | size = (size + mask) & ~mask; |
6096 | assert(size <= PAGE_SIZE); | |
f427ee49 | 6097 | |
c3c9b80d A |
6098 | zone_lock(zone); |
6099 | assert(zone->z_self == zone); | |
f427ee49 | 6100 | |
c3c9b80d A |
6101 | for (;;) { |
6102 | pva = zone->z_pageq_partial; | |
6103 | while (!zone_pva_is_null(pva)) { | |
6104 | page_meta = zone_pva_to_meta(pva); | |
6105 | if (page_meta->zm_bump + size <= PAGE_SIZE) { | |
6106 | goto found; | |
6107 | } | |
6108 | pva = page_meta->zm_page_next; | |
6109 | } | |
f427ee49 | 6110 | |
c3c9b80d | 6111 | zone_expand_locked(zone, Z_WAITOK, NULL); |
f427ee49 | 6112 | } |
f427ee49 | 6113 | |
c3c9b80d A |
6114 | found: |
6115 | offs = (uint16_t)((page_meta->zm_bump + mask) & ~mask); | |
6116 | page_meta->zm_bump = (uint16_t)(offs + size); | |
6117 | page_meta->zm_alloc_size += size; | |
6118 | zone->z_elems_free -= size; | |
6119 | zpercpu_get(zone->z_stats)->zs_mem_allocated += size; | |
f427ee49 | 6120 | |
c3c9b80d A |
6121 | if (page_meta->zm_alloc_size >= PAGE_SIZE - sizeof(vm_offset_t)) { |
6122 | zone_meta_requeue(zone, &zone->z_pageq_full, page_meta); | |
f427ee49 A |
6123 | } |
6124 | ||
c3c9b80d | 6125 | zone_unlock(zone); |
f427ee49 | 6126 | |
c3c9b80d | 6127 | addr = offs + zone_pva_to_addr(pva); |
f427ee49 | 6128 | |
c3c9b80d A |
6129 | DTRACE_VM2(zalloc, zone_t, zone, void*, addr); |
6130 | return (void *)addr; | |
f427ee49 A |
6131 | } |
6132 | ||
c3c9b80d A |
6133 | static void * |
6134 | _zalloc_permanent_large(size_t size, vm_offset_t mask) | |
6135 | { | |
6136 | kern_return_t kr; | |
6137 | vm_offset_t addr; | |
39037602 | 6138 | |
c3c9b80d A |
6139 | kr = kernel_memory_allocate(kernel_map, &addr, size, mask, |
6140 | KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO, | |
6141 | VM_KERN_MEMORY_KALLOC); | |
6142 | if (kr != 0) { | |
6143 | panic("zalloc_permanent: unable to allocate %zd bytes (%d)", | |
6144 | size, kr); | |
6145 | } | |
6146 | return (void *)addr; | |
6147 | } | |
39037602 | 6148 | |
c3c9b80d A |
6149 | void * |
6150 | zalloc_permanent(vm_size_t size, vm_offset_t mask) | |
39037602 | 6151 | { |
c3c9b80d A |
6152 | if (size <= PAGE_SIZE) { |
6153 | zone_t zone = &zone_array[ZONE_ID_PERMANENT]; | |
6154 | return _zalloc_permanent(zone, size, mask); | |
f427ee49 | 6155 | } |
c3c9b80d | 6156 | return _zalloc_permanent_large(size, mask); |
f427ee49 A |
6157 | } |
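/*
 * Illustrative sketch (not part of this file): zalloc_permanent() is
 * intended for allocations that live for the lifetime of the kernel
 * and are never freed, such as boot-time tables sized from a tunable.
 * The table type and the init routine below are invented for the
 * example; the mask argument is "alignment - 1", as consumed by the
 * code above.
 *
 *	struct boot_entry {
 *		uint64_t be_key;
 *		uint64_t be_value;
 *	};
 *	static struct boot_entry *boot_table;
 *
 *	static void
 *	boot_table_init(uint32_t nentries)
 *	{
 *		boot_table = zalloc_permanent(
 *		    nentries * sizeof(struct boot_entry),
 *		    _Alignof(struct boot_entry) - 1);
 *		// The large path above passes KMA_ZERO; permanent memory is
 *		// expected to come back zeroed, so no explicit bzero() here.
 *	}
 */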
6158 | ||
c3c9b80d A |
6159 | void * |
6160 | zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask) | |
f427ee49 | 6161 | { |
c3c9b80d A |
6162 | zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT]; |
6163 | return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask)); | |
6164 | } | |
f427ee49 | 6165 | |
c3c9b80d A |
6166 | /*! @} */ |
6167 | #endif /* !ZALLOC_TEST */ | |
6168 | #pragma mark zone GC / trimming | |
6169 | #if !ZALLOC_TEST | |
39037602 | 6170 | |
c3c9b80d | 6171 | static thread_call_data_t zone_defrag_callout; |
7ddcb079 | 6172 | |
c3c9b80d A |
6173 | static void |
6174 | zone_reclaim_chunk(zone_t z, struct zone_page_metadata *meta, uint32_t free_count) | |
6175 | { | |
6176 | vm_address_t page_addr; | |
6177 | vm_size_t size_to_free; | |
6178 | uint32_t bitmap_ref; | |
6179 | uint32_t page_count; | |
6180 | bool sequester = z->z_va_sequester && !z->z_destroyed; | |
0a7de745 | 6181 | |
c3c9b80d | 6182 | zone_meta_queue_pop_native(z, &z->z_pageq_empty, &page_addr); |
0a7de745 | 6183 | |
c3c9b80d | 6184 | page_count = meta->zm_chunk_len; |
0a7de745 | 6185 | |
c3c9b80d A |
6186 | if (meta->zm_alloc_size) { |
6187 | zone_metadata_corruption(z, meta, "alloc_size"); | |
6188 | } | |
6189 | if (z->z_percpu) { | |
6190 | if (page_count != 1) { | |
6191 | zone_metadata_corruption(z, meta, "page_count"); | |
f427ee49 | 6192 | } |
c3c9b80d A |
6193 | size_to_free = ptoa(z->z_chunk_pages); |
6194 | os_atomic_sub(&zones_phys_page_mapped_count, | |
6195 | z->z_chunk_pages, relaxed); | |
6196 | } else { | |
6197 | if (page_count > z->z_chunk_pages) { | |
6198 | zone_metadata_corruption(z, meta, "page_count"); | |
f427ee49 | 6199 | } |
c3c9b80d A |
6200 | if (page_count < z->z_chunk_pages) { |
6201 | /* Dequeue non populated VA from z_pageq_va */ | |
6202 | zone_meta_remqueue(z, meta + page_count); | |
f427ee49 | 6203 | } |
c3c9b80d A |
6204 | size_to_free = ptoa(page_count); |
6205 | os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed); | |
6206 | } | |
f427ee49 | 6207 | |
c3c9b80d A |
6208 | zone_counter_sub(z, z_elems_free, free_count); |
6209 | zone_counter_sub(z, z_elems_avail, free_count); | |
6210 | zone_counter_sub(z, z_wired_empty, page_count); | |
6211 | zone_counter_sub(z, z_wired_cur, page_count); | |
6212 | if (z->z_elems_free_min < free_count) { | |
6213 | z->z_elems_free_min = 0; | |
6214 | } else { | |
6215 | z->z_elems_free_min -= free_count; | |
6216 | } | |
6217 | if (z->z_elems_free_max < free_count) { | |
6218 | z->z_elems_free_max = 0; | |
6219 | } else { | |
6220 | z->z_elems_free_max -= free_count; | |
6221 | } | |
7ddcb079 | 6222 | |
c3c9b80d A |
6223 | bitmap_ref = 0; |
6224 | if (sequester) { | |
6225 | if (meta->zm_inline_bitmap) { | |
6226 | for (int i = 0; i < meta->zm_chunk_len; i++) { | |
6227 | meta[i].zm_bitmap = 0; | |
6228 | } | |
6229 | } else { | |
6230 | bitmap_ref = meta->zm_bitmap; | |
6231 | meta->zm_bitmap = 0; | |
7ddcb079 | 6232 | } |
c3c9b80d A |
6233 | meta->zm_chunk_len = 0; |
6234 | } else { | |
6235 | if (!meta->zm_inline_bitmap) { | |
6236 | bitmap_ref = meta->zm_bitmap; | |
f427ee49 | 6237 | } |
c3c9b80d A |
6238 | zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages); |
6239 | bzero(meta, sizeof(*meta) * z->z_chunk_pages); | |
f427ee49 | 6240 | } |
eb6b6ca3 | 6241 | |
c3c9b80d | 6242 | zone_unlock(z); |
f427ee49 | 6243 | |
c3c9b80d A |
6244 | if (bitmap_ref) { |
6245 | zone_bits_free(bitmap_ref); | |
f427ee49 | 6246 | } |
f427ee49 | 6247 | |
c3c9b80d A |
6248 | /* Free the pages for metadata and account for them */ |
6249 | #if KASAN_ZALLOC | |
6250 | kasan_poison_range(page_addr, size_to_free, ASAN_VALID); | |
6251 | #endif | |
6252 | #if VM_MAX_TAG_ZONES | |
6253 | if (z->tags) { | |
6254 | ztMemoryRemove(z, page_addr, size_to_free); | |
6255 | } | |
6256 | #endif /* VM_MAX_TAG_ZONES */ | |
f427ee49 | 6257 | |
c3c9b80d A |
6258 | if (sequester) { |
6259 | kernel_memory_depopulate(zone_submap(z), page_addr, | |
6260 | size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE); | |
6261 | } else { | |
6262 | kmem_free(zone_submap(z), page_addr, ptoa(z->z_chunk_pages)); | |
6263 | } | |
f427ee49 | 6264 | |
c3c9b80d A |
6265 | /* |
6266 | * Freeing memory sometimes requires allocating memory itself (for | |
6267 | * example, vm map entries to represent holes). | |
6268 | * | |
6269 | * If there are any active replenish threads, we need to let them work | |
6270 | * while we hold no locks. Only do so right after we have just freed | |
6271 | * memory, to give them even more chances to find fresh pages. | |
6272 | */ | |
6273 | zone_replenish_wait_if_needed(); | |
f427ee49 | 6274 | |
c3c9b80d | 6275 | thread_yield_to_preemption(); |
f427ee49 | 6276 | |
c3c9b80d | 6277 | zone_lock(z); |
f427ee49 | 6278 | |
c3c9b80d A |
6279 | if (sequester) { |
6280 | zone_meta_queue_push(z, &z->z_pageq_va, meta); | |
6281 | } | |
6282 | } | |
f427ee49 | 6283 | |
c3c9b80d A |
6284 | static uint16_t |
6285 | zone_reclaim_elements(zone_t z, uint16_t *count, zone_element_t *elems) | |
6286 | { | |
6287 | uint16_t n = *count; | |
eb6b6ca3 | 6288 | |
c3c9b80d | 6289 | z_debug_assert(n <= zc_mag_size()); |
f427ee49 | 6290 | |
c3c9b80d A |
6291 | for (uint16_t i = 0; i < n; i++) { |
6292 | zone_element_t ze = elems[i]; | |
6293 | elems[i].ze_value = 0; | |
6294 | zfree_drop(z, zone_element_validate(z, ze), ze, false); | |
7ddcb079 | 6295 | } |
c3c9b80d A |
6296 | |
6297 | *count = 0; | |
6298 | return n; | |
7ddcb079 A |
6299 | } |
6300 | ||
c3c9b80d A |
6301 | static uint16_t |
6302 | zone_reclaim_recirc_magazine(zone_t z, struct zone_depot *mags) | |
0a7de745 | 6303 | { |
c3c9b80d | 6304 | zone_magazine_t mag = STAILQ_FIRST(&z->z_recirc); |
f427ee49 | 6305 | |
c3c9b80d A |
6306 | STAILQ_REMOVE_HEAD(&z->z_recirc, zm_link); |
6307 | STAILQ_INSERT_TAIL(mags, mag, zm_link); | |
6308 | zone_counter_sub(z, z_recirc_cur, 1); | |
7ddcb079 | 6309 | |
c3c9b80d | 6310 | z_debug_assert(mag->zm_cur == zc_mag_size()); |
7ddcb079 | 6311 | |
c3c9b80d A |
6312 | for (uint16_t i = 0; i < zc_mag_size(); i++) { |
6313 | zone_element_t ze = mag->zm_elems[i]; | |
6314 | mag->zm_elems[i].ze_value = 0; | |
6315 | zfree_drop(z, zone_element_validate(z, ze), ze, true); | |
7ddcb079 A |
6316 | } |
6317 | ||
c3c9b80d A |
6318 | mag->zm_cur = 0; |
6319 | ||
6320 | return zc_mag_size(); | |
5ba3f43e A |
6321 | } |
6322 | ||
f427ee49 | 6323 | static void |
c3c9b80d A |
6324 | zone_depot_trim(zone_cache_t zc, struct zone_depot *head) |
6325 | { | |
6326 | zone_magazine_t mag; | |
6327 | ||
6328 | if (zc->zc_depot_cur == 0 || | |
6329 | 2 * (zc->zc_depot_cur + 1) * zc_mag_size() <= zc->zc_depot_max) { | |
6330 | return; | |
6331 | } | |
5ba3f43e | 6332 | |
c3c9b80d | 6333 | zone_depot_lock(zc); |
5ba3f43e | 6334 | |
c3c9b80d A |
6335 | while (zc->zc_depot_cur && |
6336 | 2 * (zc->zc_depot_cur + 1) * zc_mag_size() > zc->zc_depot_max) { | |
6337 | mag = STAILQ_FIRST(&zc->zc_depot); | |
6338 | STAILQ_REMOVE_HEAD(&zc->zc_depot, zm_link); | |
6339 | STAILQ_INSERT_TAIL(head, mag, zm_link); | |
6340 | zc->zc_depot_cur--; | |
39037602 | 6341 | } |
39037602 | 6342 | |
c3c9b80d | 6343 | zone_depot_unlock(zc); |
4bd07ac2 A |
6344 | } |
6345 | ||
c3c9b80d A |
6346 | __enum_decl(zone_reclaim_mode_t, uint32_t, { |
6347 | ZONE_RECLAIM_TRIM, | |
6348 | ZONE_RECLAIM_DRAIN, | |
6349 | ZONE_RECLAIM_DESTROY, | |
6350 | }); | |
6351 | ||
6352 | /*! | |
6353 | * @function zone_reclaim | |
6354 | * | |
6355 | * @brief | |
6356 | * Drains or trims the zone. | |
6357 | * | |
6358 | * @discussion | |
6359 | * Draining the zone frees all of its elements. | |
6360 | * | |
6361 | * Trimming the zone tries to respect the working set size, and avoids draining | |
6362 | * the depot when it's not necessary. | |
6363 | * | |
6364 | * @param z The zone to reclaim from | |
6365 | * @param mode The purpose of this reclaim. | |
1c79356b | 6366 | */ |
f427ee49 | 6367 | static void |
c3c9b80d | 6368 | zone_reclaim(zone_t z, zone_reclaim_mode_t mode) |
1c79356b | 6369 | { |
c3c9b80d A |
6370 | struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags); |
6371 | zone_magazine_t mag, tmp; | |
7ddcb079 | 6372 | |
c3c9b80d | 6373 | zone_lock(z); |
39037602 | 6374 | |
c3c9b80d A |
6375 | if (mode == ZONE_RECLAIM_DESTROY) { |
6376 | if (!z->z_destructible || z->z_pcpu_cache || | |
6377 | z->z_elems_rsv || z->z_allows_foreign) { | |
6378 | panic("zdestroy: Zone %s%s isn't destructible", | |
6379 | zone_heap_name(z), z->z_name); | |
6380 | } | |
39236c6e | 6381 | |
c3c9b80d A |
6382 | if (!z->z_self || z->z_expander || z->z_expander_vm_priv || |
6383 | z->z_async_refilling || z->z_expanding_wait) { | |
6384 | panic("zdestroy: Zone %s%s in an invalid state for destruction", | |
6385 | zone_heap_name(z), z->z_name); | |
6386 | } | |
39236c6e | 6387 | |
c3c9b80d | 6388 | #if !KASAN_ZALLOC |
f427ee49 | 6389 | /* |
c3c9b80d A |
6390 | * Unset the valid bit. We'll hit an assert failure on further |
6391 | * operations on this zone, until zinit() is called again. | |
f427ee49 | 6392 | * |
c3c9b80d A |
6393 | * Leave the zone valid for KASan as we will see zfree's on |
6394 | * quarantined free elements even after the zone is destroyed. | |
f427ee49 | 6395 | */ |
c3c9b80d A |
6396 | z->z_self = NULL; |
6397 | #endif | |
6398 | z->z_destroyed = true; | |
6399 | } else if (z->z_destroyed) { | |
6400 | return zone_unlock(z); | |
6401 | } else if (z->z_replenishes && z->z_async_refilling) { | |
f427ee49 | 6402 | /* |
c3c9b80d | 6403 | * If the zone is replenishing, leave it alone. |
f427ee49 | 6404 | */ |
c3c9b80d A |
6405 | return zone_unlock(z); |
6406 | } | |
f427ee49 | 6407 | |
c3c9b80d A |
6408 | if (z->z_pcpu_cache) { |
6409 | if (mode != ZONE_RECLAIM_TRIM) { | |
6410 | zpercpu_foreach(zc, z->z_pcpu_cache) { | |
6411 | zc->zc_depot_max /= 2; | |
6412 | } | |
6413 | } else { | |
6414 | zpercpu_foreach(zc, z->z_pcpu_cache) { | |
6415 | if (zc->zc_depot_max > 0) { | |
6416 | zc->zc_depot_max--; | |
6417 | } | |
6418 | } | |
f427ee49 A |
6419 | } |
6420 | ||
c3c9b80d | 6421 | zone_unlock(z); |
f427ee49 | 6422 | |
c3c9b80d A |
6423 | if (mode == ZONE_RECLAIM_TRIM) { |
6424 | zpercpu_foreach(zc, z->z_pcpu_cache) { | |
6425 | zone_depot_trim(zc, &mags); | |
6426 | } | |
6427 | } else { | |
6428 | zpercpu_foreach(zc, z->z_pcpu_cache) { | |
6429 | zone_depot_lock(zc); | |
6430 | STAILQ_CONCAT(&mags, &zc->zc_depot); | |
6431 | zc->zc_depot_cur = 0; | |
6432 | zone_depot_unlock(zc); | |
6433 | } | |
f427ee49 | 6434 | } |
39236c6e | 6435 | |
c3c9b80d A |
6436 | zone_lock(z); |
6437 | ||
6438 | uint32_t freed = 0; | |
6439 | ||
6440 | STAILQ_FOREACH(mag, &mags, zm_link) { | |
6441 | freed += zone_reclaim_elements(z, | |
6442 | &mag->zm_cur, mag->zm_elems); | |
6443 | ||
6444 | if (freed >= zc_free_batch_size) { | |
6445 | z->z_elems_free_min += freed; | |
6446 | z->z_elems_free_max += freed; | |
6447 | z->z_elems_free += freed; | |
6448 | zone_unlock(z); | |
6449 | thread_yield_to_preemption(); | |
6450 | zone_lock(z); | |
6451 | freed = 0; | |
6452 | } | |
f427ee49 | 6453 | } |
c3c9b80d A |
6454 | |
6455 | if (mode == ZONE_RECLAIM_DESTROY) { | |
6456 | zpercpu_foreach(zc, z->z_pcpu_cache) { | |
6457 | freed += zone_reclaim_elements(z, | |
6458 | &zc->zc_alloc_cur, zc->zc_alloc_elems); | |
6459 | freed += zone_reclaim_elements(z, | |
6460 | &zc->zc_free_cur, zc->zc_free_elems); | |
6461 | } | |
6462 | ||
6463 | z->z_elems_free_wss = 0; | |
6464 | z->z_elems_free_min = 0; | |
6465 | z->z_elems_free_max = 0; | |
6466 | z->z_contention_cur = 0; | |
6467 | z->z_contention_wma = 0; | |
6468 | } else { | |
6469 | z->z_elems_free_min += freed; | |
6470 | z->z_elems_free_max += freed; | |
6471 | } | |
6472 | z->z_elems_free += freed; | |
6473 | } | |
6474 | ||
6475 | for (;;) { | |
6476 | struct zone_page_metadata *meta; | |
6477 | uint32_t count, goal, freed = 0; | |
6478 | ||
6479 | goal = z->z_elems_rsv; | |
6480 | if (mode == ZONE_RECLAIM_TRIM) { | |
6481 | /* | |
6482 | * When trimming, only free elements in excess | |
6483 | * of the working set estimate. | |
6484 | * | |
6485 | * However if we are in a situation where the working | |
6486 | * set estimate is clearly growing, ignore the estimate | |
6487 | * as the next working set update will grow it and | |
6488 | * we want to avoid churn. | |
6489 | */ | |
6490 | goal = MAX(goal, MAX(z->z_elems_free_wss, | |
6491 | z->z_elems_free - z->z_elems_free_min)); | |
6492 | ||
6493 | /* | |
6494 | * Add some slop to account for "the last partial chunk in flight" | |
6495 | * so that we do not deplete the recirculation depot too harshly. | |
6496 | */ | |
6497 | goal += z->z_chunk_elems / 2; | |
6498 | } | |
6499 | ||
6500 | if (z->z_elems_free <= goal) { | |
6501 | break; | |
f427ee49 | 6502 | } |
39236c6e | 6503 | |
f427ee49 | 6504 | /* |
c3c9b80d A |
6505 | * If we're above target, but we have no free page, then drain |
6506 | * the recirculation depot until we get a free chunk or exhaust | |
6507 | * the depot. | |
f427ee49 | 6508 | * |
c3c9b80d A |
6509 | * This is rather abrupt, but it also tends to reduce |
6510 | * fragmentation, and the zone will import fresh chunks |
6511 | * over time as needed. |
f427ee49 | 6512 | */ |
c3c9b80d A |
6513 | while (z->z_recirc_cur) { |
6514 | if (z->z_recirc_cur * zc_mag_size() <= goal && | |
6515 | !zone_pva_is_null(z->z_pageq_empty)) { | |
6516 | break; | |
6517 | } | |
6518 | if (freed >= zc_free_batch_size) { | |
6519 | zone_unlock(z); | |
6520 | thread_yield_to_preemption(); | |
6521 | zone_lock(z); | |
6522 | freed = 0; | |
6523 | /* we dropped the lock, need to reassess */ | |
6524 | continue; | |
6525 | } | |
6526 | freed += zone_reclaim_recirc_magazine(z, &mags); | |
6527 | } | |
f427ee49 | 6528 | |
c3c9b80d A |
6529 | if (zone_pva_is_null(z->z_pageq_empty)) { |
6530 | break; | |
6531 | } | |
f427ee49 | 6532 | |
c3c9b80d A |
6533 | meta = zone_pva_to_meta(z->z_pageq_empty); |
6534 | count = (uint32_t)ptoa(meta->zm_chunk_len) / zone_elem_size(z); | |
6535 | ||
6536 | if (z->z_elems_free - count < goal) { | |
6537 | break; | |
f427ee49 | 6538 | } |
c3c9b80d A |
6539 | |
6540 | zone_reclaim_chunk(z, meta, count); | |
f427ee49 | 6541 | } |
39236c6e | 6542 | |
c3c9b80d A |
6543 | zone_unlock(z); |
6544 | ||
6545 | STAILQ_FOREACH_SAFE(mag, &mags, zm_link, tmp) { | |
6546 | zone_magazine_free(mag); | |
0a7de745 | 6547 | } |
c3c9b80d | 6548 | } |
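/*
 * Worked example of the ZONE_RECLAIM_TRIM goal computed above (all
 * numbers invented for illustration): with z_elems_rsv = 0,
 * z_elems_free = 1000, z_elems_free_min = 200 (the freelist grew by
 * 800 during the current period), z_elems_free_wss = 500 and
 * z_chunk_elems = 64:
 *
 *	goal = MAX(0, MAX(500, 1000 - 200)) + 64 / 2
 *	     = 800 + 32
 *	     = 832
 *
 * so at most 1000 - 832 = 168 elements worth of empty chunks are
 * eligible to be returned, and a chunk is only freed when doing so
 * keeps z_elems_free at or above that goal.
 */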
5ba3f43e | 6549 | |
c3c9b80d A |
6550 | static void |
6551 | zone_reclam_all(zone_reclaim_mode_t mode) | |
6552 | { | |
f427ee49 | 6553 | /* |
c3c9b80d A |
6554 | * Start with zones with VA sequester since depopulating |
6555 | * pages will not need to allocate vm map entries for holes, | |
6556 | * which will give memory back to the system faster. | |
f427ee49 | 6557 | */ |
c3c9b80d A |
6558 | zone_foreach(z) { |
6559 | if (z == zc_magazine_zone) { | |
6560 | continue; | |
6561 | } | |
6562 | if (z->z_va_sequester && z->collectable) { | |
6563 | zone_reclaim(z, mode); | |
6564 | } | |
f427ee49 | 6565 | } |
39037602 | 6566 | |
c3c9b80d A |
6567 | zone_foreach(z) { |
6568 | if (z == zc_magazine_zone) { | |
6569 | continue; | |
39236c6e | 6570 | } |
c3c9b80d A |
6571 | if (!z->z_va_sequester && z->collectable) { |
6572 | zone_reclaim(z, mode); | |
f427ee49 | 6573 | } |
1c79356b | 6574 | } |
4bd07ac2 | 6575 | |
c3c9b80d | 6576 | zone_reclaim(zc_magazine_zone, mode); |
f427ee49 A |
6577 | } |
6578 | ||
6579 | void | |
c3c9b80d | 6580 | zone_gc(zone_gc_level_t level) |
1c79356b | 6581 | { |
c3c9b80d | 6582 | zone_reclaim_mode_t mode; |
5ba3f43e | 6583 | |
c3c9b80d A |
6584 | switch (level) { |
6585 | case ZONE_GC_TRIM: | |
6586 | mode = ZONE_RECLAIM_TRIM; | |
6587 | break; | |
6588 | case ZONE_GC_DRAIN: | |
6589 | mode = ZONE_RECLAIM_DRAIN; | |
6590 | break; | |
6591 | case ZONE_GC_JETSAM: | |
6592 | kill_process_in_largest_zone(); | |
6593 | mode = ZONE_RECLAIM_TRIM; | |
6594 | break; | |
5ba3f43e A |
6595 | } |
6596 | ||
c3c9b80d A |
6597 | current_thread()->options |= TH_OPT_ZONE_PRIV; |
6598 | lck_mtx_lock(&zone_gc_lock); | |
1c79356b | 6599 | |
c3c9b80d | 6600 | zone_reclam_all(mode); |
eb6b6ca3 | 6601 | |
c3c9b80d A |
6602 | if (level == ZONE_GC_JETSAM && zone_map_nearing_exhaustion()) { |
6603 | /* | |
6604 | * If we possibly killed a process, but we're still critical, | |
6605 | * we need to drain harder. | |
6606 | */ | |
6607 | zone_reclam_all(ZONE_RECLAIM_DRAIN); | |
d9a64523 A |
6608 | } |
6609 | ||
c3c9b80d A |
6610 | lck_mtx_unlock(&zone_gc_lock); |
6611 | current_thread()->options &= ~TH_OPT_ZONE_PRIV; | |
1c79356b A |
6612 | } |
6613 | ||
0a7de745 | 6614 | void |
c3c9b80d | 6615 | zone_gc_trim(void) |
5ba3f43e | 6616 | { |
c3c9b80d | 6617 | zone_gc(ZONE_GC_TRIM); |
5ba3f43e A |
6618 | } |
6619 | ||
0a7de745 | 6620 | void |
c3c9b80d | 6621 | zone_gc_drain(void) |
5ba3f43e | 6622 | { |
c3c9b80d | 6623 | zone_gc(ZONE_GC_DRAIN); |
5ba3f43e A |
6624 | } |
6625 | ||
c3c9b80d A |
6626 | static bool |
6627 | zone_defrag_needed(zone_t z) | |
5ba3f43e | 6628 | { |
c3c9b80d | 6629 | uint32_t recirc_size = z->z_recirc_cur * zc_mag_size(); |
5ba3f43e | 6630 | |
c3c9b80d A |
6631 | if (recirc_size <= z->z_chunk_elems / 2) { |
6632 | return false; | |
6633 | } | |
6634 | return recirc_size * zc_defrag_ratio > z->z_elems_free_wss * 100; | |
6635 | } | |
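/*
 * Worked example (numbers invented, and assuming zc_mag_size() == 8
 * and a zc_defrag_ratio of 100, i.e. "defrag once the recirculation
 * depot exceeds the working set estimate"): with z_recirc_cur = 50
 * magazines, z_chunk_elems = 64 and z_elems_free_wss = 50:
 *
 *	recirc_size = 50 * 8 = 400   (> 64 / 2, so not trivially small)
 *	400 * 100 > 50 * 100         -> defrag is needed
 *
 * zone_defrag_async() below would then shrink the depot towards
 * roughly wss + z_chunk_elems / 2, freeing the excess magazines.
 */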
5ba3f43e | 6636 | |
c3c9b80d A |
6637 | /*! |
6638 | * @function zone_defrag_async | |
6639 | * | |
6640 | * @brief | |
6641 | * Resize the recirculation depot to match the working set size. | |
6642 | * | |
6643 | * @discussion | |
6644 | * When zones grow very large due to a spike in usage, and then some of those | |
6645 | * elements get freed, the elements in magazines in the recirculation depot | |
6646 | * are in no particular order. | |
6647 | * | |
6648 | * In order to control fragmentation, we need to detect "empty" pages so that | |
6649 | * they get onto the @c z_pageq_empty freelist, so that allocations re-pack | |
6650 | * naturally. | |
6651 | * | |
6652 | * This is done very gently: the depot is never shrunk below the working set estimate plus some slop. | |
5ba3f43e | 6653 | */ |
0a7de745 | 6654 | static void |
c3c9b80d | 6655 | zone_defrag_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1) |
5ba3f43e | 6656 | { |
c3c9b80d A |
6657 | zone_foreach(z) { |
6658 | struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags); | |
6659 | zone_magazine_t mag, tmp; | |
6660 | uint32_t freed = 0, goal = 0; | |
5ba3f43e | 6661 | |
c3c9b80d A |
6662 | if (!z->collectable || !zone_defrag_needed(z)) { |
6663 | continue; | |
6664 | } | |
5ba3f43e | 6665 | |
c3c9b80d | 6666 | zone_lock(z); |
5ba3f43e | 6667 | |
c3c9b80d A |
6668 | goal = z->z_elems_free_wss + z->z_chunk_elems / 2 + |
6669 | zc_mag_size() - 1; | |
6670 | ||
6671 | while (z->z_recirc_cur * zc_mag_size() > goal) { | |
6672 | if (freed >= zc_free_batch_size) { | |
6673 | zone_unlock(z); | |
6674 | thread_yield_to_preemption(); | |
6675 | zone_lock(z); | |
6676 | freed = 0; | |
6677 | /* we dropped the lock, need to reassess */ | |
6678 | continue; | |
6679 | } | |
6680 | freed += zone_reclaim_recirc_magazine(z, &mags); | |
5ba3f43e | 6681 | } |
5ba3f43e | 6682 | |
c3c9b80d A |
6683 | zone_unlock(z); |
6684 | ||
6685 | STAILQ_FOREACH_SAFE(mag, &mags, zm_link, tmp) { | |
6686 | zone_magazine_free(mag); | |
6687 | } | |
5ba3f43e A |
6688 | } |
6689 | } | |
6690 | ||
1c79356b | 6691 | void |
c3c9b80d | 6692 | compute_zone_working_set_size(__unused void *param) |
f427ee49 | 6693 | { |
c3c9b80d A |
6694 | uint32_t zc_auto = zc_auto_threshold; |
6695 | bool kick_defrag = false; | |
5ba3f43e | 6696 | |
c3c9b80d A |
6697 | /* |
6698 | * Keep zone caching disabled until the first proc is made. | |
6699 | */ | |
6700 | if (__improbable(zone_caching_disabled < 0)) { | |
6701 | return; | |
0a7de745 | 6702 | } |
1c79356b | 6703 | |
c3c9b80d A |
6704 | zone_caching_disabled = vm_pool_low(); |
6705 | #if ZALLOC_EARLY_GAPS | |
6706 | zone_cleanup_early_gaps_if_needed(); | |
6707 | #endif | |
0a7de745 | 6708 | |
c3c9b80d A |
6709 | if (os_mul_overflow(zc_auto, Z_CONTENTION_WMA_UNIT, &zc_auto)) { |
6710 | zc_auto = 0; | |
6711 | } | |
d9a64523 | 6712 | |
c3c9b80d A |
6713 | zone_foreach(z) { |
6714 | uint32_t wma; | |
6715 | bool needs_caching = false; | |
7ddcb079 | 6716 | |
c3c9b80d A |
6717 | if (z->z_self != z) { |
6718 | continue; | |
6719 | } | |
39037602 | 6720 | |
c3c9b80d | 6721 | zone_lock(z); |
fe8ab488 | 6722 | |
c3c9b80d A |
6723 | wma = z->z_elems_free_max - z->z_elems_free_min; |
6724 | wma = (3 * wma + z->z_elems_free_wss) / 4; | |
6725 | z->z_elems_free_max = z->z_elems_free_min = z->z_elems_free; | |
6726 | z->z_elems_free_wss = wma; | |
fe8ab488 | 6727 | |
c3c9b80d A |
6728 | if (!kick_defrag && zone_defrag_needed(z)) { |
6729 | kick_defrag = true; | |
6730 | } | |
0a7de745 | 6731 | |
c3c9b80d A |
6732 | /* fixed point decimal of contentions per second */ |
6733 | wma = z->z_contention_cur * Z_CONTENTION_WMA_UNIT / | |
6734 | ZONE_WSS_UPDATE_PERIOD; | |
6735 | z->z_contention_cur = 0; | |
6736 | z->z_contention_wma = (3 * wma + z->z_contention_wma) / 4; | |
5ba3f43e | 6737 | |
c3c9b80d A |
6738 | /* |
6739 | * If the zone seems to be very quiet, | |
6740 | * gently lower its cpu-local depot size. | |
6741 | */ | |
6742 | if (z->z_pcpu_cache && wma < Z_CONTENTION_WMA_UNIT / 2 && | |
6743 | z->z_contention_wma < Z_CONTENTION_WMA_UNIT / 2) { | |
6744 | zpercpu_foreach(zc, z->z_pcpu_cache) { | |
6745 | if (zc->zc_depot_max > zc_mag_size()) { | |
6746 | zc->zc_depot_max--; | |
6747 | } | |
6748 | } | |
6749 | } | |
5ba3f43e | 6750 | |
c3c9b80d A |
6751 | /* |
6752 | * If the zone has been contending like crazy for two periods, | |
6753 | * and is eligible, maybe it's time to enable caching. | |
6754 | */ | |
6755 | if (!z->z_nocaching && !z->z_pcpu_cache && !z->exhaustible && | |
6756 | zc_auto && z->z_contention_wma >= zc_auto && wma >= zc_auto) { | |
6757 | needs_caching = true; | |
6758 | } | |
1c79356b | 6759 | |
c3c9b80d | 6760 | zone_unlock(z); |
6d2010ae | 6761 | |
c3c9b80d A |
6762 | if (needs_caching) { |
6763 | zone_enable_caching(z); | |
6764 | } | |
f427ee49 | 6765 | } |
5ba3f43e | 6766 | |
c3c9b80d A |
6767 | if (kick_defrag) { |
6768 | thread_call_enter(&zone_defrag_callout); | |
d9a64523 | 6769 | } |
f427ee49 A |
6770 | } |
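/*
 * Worked example of the working-set update above (numbers invented):
 * if the freelist oscillated between z_elems_free_min = 200 and
 * z_elems_free_max = 600 during the period, and the previous estimate
 * was z_elems_free_wss = 100, then:
 *
 *	delta = 600 - 200 = 400
 *	new wss = (3 * 400 + 100) / 4 = 325
 *
 * i.e. the estimate is a moving average weighted 3:1 towards the most
 * recent period.  The contention average is smoothed the same way.
 */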
6771 | ||
c3c9b80d A |
6772 | #endif /* !ZALLOC_TEST */ |
6773 | #pragma mark vm integration, MIG routines | |
6774 | #if !ZALLOC_TEST | |
6775 | ||
6776 | /* | |
6777 | * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls | |
6778 | * requesting zone information. | |
6779 | * Frees unused pages towards the end of the region, and zeroes out unused | |
6780 | * space on the last page. | |
6781 | */ | |
6782 | static vm_map_copy_t | |
6783 | create_vm_map_copy( | |
6784 | vm_offset_t start_addr, | |
6785 | vm_size_t total_size, | |
6786 | vm_size_t used_size) | |
f427ee49 | 6787 | { |
c3c9b80d A |
6788 | kern_return_t kr; |
6789 | vm_offset_t end_addr; | |
6790 | vm_size_t free_size; | |
6791 | vm_map_copy_t copy; | |
d9a64523 | 6792 | |
c3c9b80d A |
6793 | if (used_size != total_size) { |
6794 | end_addr = start_addr + used_size; | |
6795 | free_size = total_size - (round_page(end_addr) - start_addr); | |
d9a64523 | 6796 | |
c3c9b80d A |
6797 | if (free_size >= PAGE_SIZE) { |
6798 | kmem_free(ipc_kernel_map, | |
6799 | round_page(end_addr), free_size); | |
6800 | } | |
6801 | bzero((char *) end_addr, round_page(end_addr) - end_addr); | |
f427ee49 A |
6802 | } |
6803 | ||
c3c9b80d A |
6804 | kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr, |
6805 | (vm_map_size_t)used_size, TRUE, ©); | |
6806 | assert(kr == KERN_SUCCESS); | |
f427ee49 | 6807 | |
c3c9b80d | 6808 | return copy; |
d9a64523 A |
6809 | } |
6810 | ||
c3c9b80d A |
6811 | static boolean_t |
6812 | get_zone_info( | |
6813 | zone_t z, | |
6814 | mach_zone_name_t *zn, | |
6815 | mach_zone_info_t *zi) | |
f427ee49 | 6816 | { |
c3c9b80d A |
6817 | struct zone zcopy; |
6818 | vm_size_t cached = 0; | |
f427ee49 | 6819 | |
c3c9b80d A |
6820 | assert(z != ZONE_NULL); |
6821 | zone_lock(z); | |
6822 | if (!z->z_self) { | |
6823 | zone_unlock(z); | |
6824 | return FALSE; | |
6825 | } | |
6826 | zcopy = *z; | |
6827 | if (z->z_pcpu_cache) { | |
6828 | zpercpu_foreach(zc, z->z_pcpu_cache) { | |
6829 | cached += zc->zc_alloc_cur + zc->zc_free_cur; | |
6830 | cached += zc->zc_depot_cur * zc_mag_size(); | |
f427ee49 | 6831 | } |
f427ee49 | 6832 | } |
c3c9b80d | 6833 | zone_unlock(z); |
f427ee49 | 6834 | |
c3c9b80d A |
6835 | if (zn != NULL) { |
6836 | /* | |
6837 | * Append kalloc heap name to zone name (if zone is used by kalloc) | |
6838 | */ | |
6839 | char temp_zone_name[MAX_ZONE_NAME] = ""; | |
6840 | snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", | |
6841 | zone_heap_name(z), z->z_name); | |
f427ee49 | 6842 | |
c3c9b80d A |
6843 | /* assuming here the name data is static */ |
6844 | (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name, | |
6845 | strlen(temp_zone_name) + 1); | |
6846 | } | |
f427ee49 | 6847 | |
c3c9b80d A |
6848 | if (zi != NULL) { |
6849 | *zi = (mach_zone_info_t) { | |
6850 | .mzi_count = zone_count_allocated(&zcopy) - cached, | |
6851 | .mzi_cur_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_cur)), | |
6852 | // max_size for zprint is now high-watermark of pages used | |
6853 | .mzi_max_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_hwm)), | |
6854 | .mzi_elem_size = zone_scale_for_percpu(&zcopy, zcopy.z_elem_size), | |
6855 | .mzi_alloc_size = ptoa_64(zcopy.z_chunk_pages), | |
6856 | .mzi_exhaustible = (uint64_t)zcopy.exhaustible, | |
6857 | }; | |
6858 | zpercpu_foreach(zs, zcopy.z_stats) { | |
6859 | zi->mzi_sum_size += zs->zs_mem_allocated; | |
6860 | } | |
6861 | if (zcopy.collectable) { | |
6862 | SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable, | |
6863 | ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_empty))); | |
6864 | SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE); | |
6865 | } | |
6866 | } | |
f427ee49 | 6867 | |
c3c9b80d | 6868 | return TRUE; |
f427ee49 A |
6869 | } |
6870 | ||
c3c9b80d A |
6871 | kern_return_t |
6872 | task_zone_info( | |
6873 | __unused task_t task, | |
6874 | __unused mach_zone_name_array_t *namesp, | |
6875 | __unused mach_msg_type_number_t *namesCntp, | |
6876 | __unused task_zone_info_array_t *infop, | |
6877 | __unused mach_msg_type_number_t *infoCntp) | |
1c79356b | 6878 | { |
c3c9b80d A |
6879 | return KERN_FAILURE; |
6880 | } | |
f427ee49 | 6881 | |
c3c9b80d A |
6882 | kern_return_t |
6883 | mach_zone_info( | |
6884 | host_priv_t host, | |
6885 | mach_zone_name_array_t *namesp, | |
6886 | mach_msg_type_number_t *namesCntp, | |
6887 | mach_zone_info_array_t *infop, | |
6888 | mach_msg_type_number_t *infoCntp) | |
6889 | { | |
6890 | return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL); | |
6891 | } | |
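/*
 * Illustrative user-space consumer sketch, modeled on zprint(1).
 * The exact prototypes come from the mach_debug MIG headers and are
 * assumed here, not copied; on some configurations the call is
 * rejected unless the debugger check below passes.
 *
 *	#include <mach/mach.h>
 *	#include <mach_debug/mach_debug.h>
 *	#include <stdio.h>
 *
 *	static void
 *	dump_zones(void)
 *	{
 *		mach_zone_name_t *names;
 *		mach_zone_info_t *info;
 *		mach_msg_type_number_t name_cnt, info_cnt;
 *
 *		kern_return_t kr = mach_zone_info(mach_host_self(),
 *		    &names, &name_cnt, &info, &info_cnt);
 *		if (kr != KERN_SUCCESS) {
 *			return;
 *		}
 *		for (unsigned i = 0; i < info_cnt; i++) {
 *			printf("%-40s cur_size: %llu\n",
 *			    names[i].mzn_name, info[i].mzi_cur_size);
 *		}
 *		// Both arrays are returned out-of-line; unmap when done.
 *		vm_deallocate(mach_task_self(), (vm_address_t)names,
 *		    name_cnt * sizeof(*names));
 *		vm_deallocate(mach_task_self(), (vm_address_t)info,
 *		    info_cnt * sizeof(*info));
 *	}
 */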
1c79356b | 6892 | |
5ba3f43e | 6893 | |
c3c9b80d A |
6894 | kern_return_t |
6895 | mach_memory_info( | |
6896 | host_priv_t host, | |
6897 | mach_zone_name_array_t *namesp, | |
6898 | mach_msg_type_number_t *namesCntp, | |
6899 | mach_zone_info_array_t *infop, | |
6900 | mach_msg_type_number_t *infoCntp, | |
6901 | mach_memory_info_array_t *memoryInfop, | |
6902 | mach_msg_type_number_t *memoryInfoCntp) | |
6903 | { | |
6904 | mach_zone_name_t *names; | |
6905 | vm_offset_t names_addr; | |
6906 | vm_size_t names_size; | |
316670eb | 6907 | |
c3c9b80d A |
6908 | mach_zone_info_t *info; |
6909 | vm_offset_t info_addr; | |
6910 | vm_size_t info_size; | |
f427ee49 | 6911 | |
c3c9b80d A |
6912 | mach_memory_info_t *memory_info; |
6913 | vm_offset_t memory_info_addr; | |
6914 | vm_size_t memory_info_size; | |
6915 | vm_size_t memory_info_vmsize; | |
6916 | unsigned int num_info; | |
f427ee49 | 6917 | |
c3c9b80d A |
6918 | unsigned int max_zones, used_zones, i; |
6919 | mach_zone_name_t *zn; | |
6920 | mach_zone_info_t *zi; | |
6921 | kern_return_t kr; | |
39236c6e | 6922 | |
c3c9b80d | 6923 | uint64_t zones_collectable_bytes = 0; |
f427ee49 | 6924 | |
c3c9b80d A |
6925 | if (host == HOST_NULL) { |
6926 | return KERN_INVALID_HOST; | |
6927 | } | |
6928 | #if CONFIG_DEBUGGER_FOR_ZONE_INFO | |
6929 | if (!PE_i_can_has_debugger(NULL)) { | |
6930 | return KERN_INVALID_HOST; | |
6931 | } | |
6932 | #endif | |
f427ee49 | 6933 | |
fe8ab488 | 6934 | /* |
c3c9b80d A |
6935 | * We assume that zones aren't freed once allocated. |
6936 | * We won't pick up any zones that are allocated later. | |
6d2010ae | 6937 | */ |
1c79356b | 6938 | |
c3c9b80d | 6939 | max_zones = os_atomic_load(&num_zones, relaxed); |
f427ee49 | 6940 | |
c3c9b80d A |
6941 | names_size = round_page(max_zones * sizeof *names); |
6942 | kr = kmem_alloc_pageable(ipc_kernel_map, | |
6943 | &names_addr, names_size, VM_KERN_MEMORY_IPC); | |
6944 | if (kr != KERN_SUCCESS) { | |
6945 | return kr; | |
6946 | } | |
6947 | names = (mach_zone_name_t *) names_addr; | |
f427ee49 | 6948 | |
c3c9b80d A |
6949 | info_size = round_page(max_zones * sizeof *info); |
6950 | kr = kmem_alloc_pageable(ipc_kernel_map, | |
6951 | &info_addr, info_size, VM_KERN_MEMORY_IPC); | |
6952 | if (kr != KERN_SUCCESS) { | |
6953 | kmem_free(ipc_kernel_map, | |
6954 | names_addr, names_size); | |
6955 | return kr; | |
0a7de745 | 6956 | } |
c3c9b80d | 6957 | info = (mach_zone_info_t *) info_addr; |
5ba3f43e | 6958 | |
c3c9b80d A |
6959 | zn = &names[0]; |
6960 | zi = &info[0]; | |
f427ee49 | 6961 | |
c3c9b80d A |
6962 | used_zones = max_zones; |
6963 | for (i = 0; i < max_zones; i++) { | |
6964 | if (!get_zone_info(&(zone_array[i]), zn, zi)) { | |
6965 | used_zones--; | |
6966 | continue; | |
6967 | } | |
6968 | zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable); | |
6969 | zn++; | |
6970 | zi++; | |
6971 | } | |
f427ee49 | 6972 | |
c3c9b80d A |
6973 | *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names); |
6974 | *namesCntp = used_zones; | |
f427ee49 | 6975 | |
c3c9b80d A |
6976 | *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info); |
6977 | *infoCntp = used_zones; | |
f427ee49 | 6978 | |
c3c9b80d A |
6979 | num_info = 0; |
6980 | memory_info_addr = 0; | |
f427ee49 | 6981 | |
c3c9b80d A |
6982 | if (memoryInfop && memoryInfoCntp) { |
6983 | vm_map_copy_t copy; | |
6984 | num_info = vm_page_diagnose_estimate(); | |
6985 | memory_info_size = num_info * sizeof(*memory_info); | |
6986 | memory_info_vmsize = round_page(memory_info_size); | |
6987 | kr = kmem_alloc_pageable(ipc_kernel_map, | |
6988 | &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC); | |
6989 | if (kr != KERN_SUCCESS) { | |
6990 | return kr; | |
6991 | } | |
f427ee49 | 6992 | |
c3c9b80d A |
6993 | kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, |
6994 | VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE); | |
6995 | assert(kr == KERN_SUCCESS); | |
f427ee49 | 6996 | |
c3c9b80d A |
6997 | memory_info = (mach_memory_info_t *) memory_info_addr; |
6998 | vm_page_diagnose(memory_info, num_info, zones_collectable_bytes); | |
fe8ab488 | 6999 | |
c3c9b80d A |
7000 | kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE); |
7001 | assert(kr == KERN_SUCCESS); | |
5ba3f43e | 7002 | |
c3c9b80d A |
7003 | kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr, |
7004 | (vm_map_size_t)memory_info_size, TRUE, ©); | |
7005 | assert(kr == KERN_SUCCESS); | |
cb323159 | 7006 | |
c3c9b80d A |
7007 | *memoryInfop = (mach_memory_info_t *) copy; |
7008 | *memoryInfoCntp = num_info; | |
7009 | } | |
cb323159 | 7010 | |
c3c9b80d | 7011 | return KERN_SUCCESS; |
f427ee49 | 7012 | } |
eb6b6ca3 | 7013 | |
c3c9b80d A |
7014 | kern_return_t |
7015 | mach_zone_info_for_zone( | |
7016 | host_priv_t host, | |
7017 | mach_zone_name_t name, | |
7018 | mach_zone_info_t *infop) | |
f427ee49 | 7019 | { |
c3c9b80d | 7020 | zone_t zone_ptr; |
f427ee49 | 7021 | |
c3c9b80d A |
7022 | if (host == HOST_NULL) { |
7023 | return KERN_INVALID_HOST; | |
0a7de745 | 7024 | } |
c3c9b80d A |
7025 | #if CONFIG_DEBUGGER_FOR_ZONE_INFO |
7026 | if (!PE_i_can_has_debugger(NULL)) { | |
7027 | return KERN_INVALID_HOST; | |
0a7de745 | 7028 | } |
c3c9b80d | 7029 | #endif |
f427ee49 | 7030 | |
c3c9b80d A |
7031 | if (infop == NULL) { |
7032 | return KERN_INVALID_ARGUMENT; | |
0a7de745 | 7033 | } |
a3d08fcd | 7034 | |
c3c9b80d A |
7035 | zone_ptr = ZONE_NULL; |
7036 | zone_foreach(z) { | |
1c79356b | 7037 | /* |
c3c9b80d | 7038 | * Append kalloc heap name to zone name (if zone is used by kalloc) |
1c79356b | 7039 | */ |
c3c9b80d A |
7040 | char temp_zone_name[MAX_ZONE_NAME] = ""; |
7041 | snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", | |
7042 | zone_heap_name(z), z->z_name); | |
f427ee49 | 7043 | |
c3c9b80d A |
7044 | /* Find the requested zone by name */ |
7045 | if (track_this_zone(temp_zone_name, name.mzn_name)) { | |
7046 | zone_ptr = z; | |
7047 | break; | |
f427ee49 | 7048 | } |
c3c9b80d | 7049 | } |
f427ee49 | 7050 | |
c3c9b80d A |
7051 | /* No zones found with the requested zone name */ |
7052 | if (zone_ptr == ZONE_NULL) { | |
7053 | return KERN_INVALID_ARGUMENT; | |
7054 | } | |
f427ee49 | 7055 | |
c3c9b80d A |
7056 | if (get_zone_info(zone_ptr, NULL, infop)) { |
7057 | return KERN_SUCCESS; | |
7058 | } | |
7059 | return KERN_FAILURE; | |
7060 | } | |
f427ee49 | 7061 | |
c3c9b80d A |
7062 | kern_return_t |
7063 | mach_zone_info_for_largest_zone( | |
7064 | host_priv_t host, | |
7065 | mach_zone_name_t *namep, | |
7066 | mach_zone_info_t *infop) | |
7067 | { | |
7068 | if (host == HOST_NULL) { | |
7069 | return KERN_INVALID_HOST; | |
7070 | } | |
7071 | #if CONFIG_DEBUGGER_FOR_ZONE_INFO | |
7072 | if (!PE_i_can_has_debugger(NULL)) { | |
7073 | return KERN_INVALID_HOST; | |
7074 | } | |
7075 | #endif | |
f427ee49 | 7076 | |
c3c9b80d A |
7077 | if (namep == NULL || infop == NULL) { |
7078 | return KERN_INVALID_ARGUMENT; | |
7079 | } | |
f427ee49 | 7080 | |
c3c9b80d A |
7081 | if (get_zone_info(zone_find_largest(), namep, infop)) { |
7082 | return KERN_SUCCESS; | |
7083 | } | |
7084 | return KERN_FAILURE; | |
7085 | } | |
7086 | ||
7087 | uint64_t | |
7088 | get_zones_collectable_bytes(void) | |
7089 | { | |
7090 | uint64_t zones_collectable_bytes = 0; | |
7091 | mach_zone_info_t zi; | |
f427ee49 | 7092 | |
c3c9b80d A |
7093 | zone_foreach(z) { |
7094 | if (get_zone_info(z, NULL, &zi)) { | |
7095 | zones_collectable_bytes += | |
7096 | GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable); | |
f427ee49 A |
7097 | } |
7098 | } | |
7099 | ||
c3c9b80d | 7100 | return zones_collectable_bytes; |
f427ee49 A |
7101 | } |
7102 | ||
c3c9b80d A |
7103 | kern_return_t |
7104 | mach_zone_get_zlog_zones( | |
7105 | host_priv_t host, | |
7106 | mach_zone_name_array_t *namesp, | |
7107 | mach_msg_type_number_t *namesCntp) | |
f427ee49 | 7108 | { |
c3c9b80d A |
7109 | #if ZONE_ENABLE_LOGGING |
7110 | unsigned int max_zones, logged_zones, i; | |
7111 | kern_return_t kr; | |
7112 | zone_t zone_ptr; | |
7113 | mach_zone_name_t *names; | |
7114 | vm_offset_t names_addr; | |
7115 | vm_size_t names_size; | |
f427ee49 | 7116 | |
c3c9b80d A |
7117 | if (host == HOST_NULL) { |
7118 | return KERN_INVALID_HOST; | |
7119 | } | |
f427ee49 | 7120 | |
c3c9b80d A |
7121 | if (namesp == NULL || namesCntp == NULL) { |
7122 | return KERN_INVALID_ARGUMENT; | |
7123 | } | |
f427ee49 | 7124 | |
c3c9b80d | 7125 | max_zones = os_atomic_load(&num_zones, relaxed); |
f427ee49 | 7126 | |
c3c9b80d A |
7127 | names_size = round_page(max_zones * sizeof *names); |
7128 | kr = kmem_alloc_pageable(ipc_kernel_map, | |
7129 | &names_addr, names_size, VM_KERN_MEMORY_IPC); | |
7130 | if (kr != KERN_SUCCESS) { | |
7131 | return kr; | |
f427ee49 | 7132 | } |
c3c9b80d | 7133 | names = (mach_zone_name_t *) names_addr; |
f427ee49 | 7134 | |
c3c9b80d A |
7135 | zone_ptr = ZONE_NULL; |
7136 | logged_zones = 0; | |
7137 | for (i = 0; i < max_zones; i++) { | |
7138 | zone_t z = &(zone_array[i]); | |
7139 | assert(z != ZONE_NULL); | |
7140 | ||
7141 | /* Copy out the zone name if zone logging is enabled */ | |
7142 | if (z->zlog_btlog) { | |
7143 | get_zone_info(z, &names[logged_zones], NULL); | |
7144 | logged_zones++; | |
7145 | } | |
f427ee49 | 7146 | } |
c3c9b80d A |
7147 | |
7148 | *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names); | |
7149 | *namesCntp = logged_zones; | |
7150 | ||
7151 | return KERN_SUCCESS; | |
7152 | ||
7153 | #else /* ZONE_ENABLE_LOGGING */ | |
7154 | #pragma unused(host, namesp, namesCntp) | |
7155 | return KERN_FAILURE; | |
7156 | #endif /* ZONE_ENABLE_LOGGING */ | |
f427ee49 A |
7157 | } |
7158 | ||
c3c9b80d A |
7159 | kern_return_t |
7160 | mach_zone_get_btlog_records( | |
7161 | host_priv_t host, | |
7162 | mach_zone_name_t name, | |
7163 | zone_btrecord_array_t *recsp, | |
7164 | mach_msg_type_number_t *recsCntp) | |
f427ee49 | 7165 | { |
c3c9b80d A |
7166 | #if DEBUG || DEVELOPMENT |
7167 | unsigned int numrecs = 0; | |
7168 | zone_btrecord_t *recs; | |
7169 | kern_return_t kr; | |
7170 | zone_t zone_ptr; | |
7171 | vm_offset_t recs_addr; | |
7172 | vm_size_t recs_size; | |
f427ee49 | 7173 | |
c3c9b80d A |
7174 | if (host == HOST_NULL) { |
7175 | return KERN_INVALID_HOST; | |
f427ee49 | 7176 | } |
f427ee49 | 7177 | |
c3c9b80d A |
7178 | if (recsp == NULL || recsCntp == NULL) { |
7179 | return KERN_INVALID_ARGUMENT; | |
f427ee49 A |
7180 | } |
7181 | ||
c3c9b80d A |
7182 | zone_ptr = ZONE_NULL; |
7183 | zone_foreach(z) { | |
7184 | /* | |
7185 | * Append kalloc heap name to zone name (if zone is used by kalloc) | |
7186 | */ | |
7187 | char temp_zone_name[MAX_ZONE_NAME] = ""; | |
7188 | snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", | |
7189 | zone_heap_name(z), z->z_name); | |
7190 | ||
7191 | /* Find the requested zone by name */ | |
7192 | if (track_this_zone(temp_zone_name, name.mzn_name)) { | |
7193 | zone_ptr = z; | |
7194 | break; | |
f427ee49 A |
7195 | } |
7196 | } | |
f427ee49 | 7197 | |
c3c9b80d A |
7198 | /* No zones found with the requested zone name */ |
7199 | if (zone_ptr == ZONE_NULL) { | |
7200 | return KERN_INVALID_ARGUMENT; | |
f427ee49 | 7201 | } |
c3c9b80d A |
7202 | |
7203 | /* Logging not turned on for the requested zone */ | |
7204 | if (!DO_LOGGING(zone_ptr)) { | |
7205 | return KERN_FAILURE; | |
f427ee49 | 7206 | } |
c3c9b80d A |
7207 | |
7208 | /* Allocate memory for btlog records */ | |
7209 | numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog)); | |
7210 | recs_size = round_page(numrecs * sizeof *recs); | |
7211 | ||
7212 | kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC); | |
7213 | if (kr != KERN_SUCCESS) { | |
7214 | return kr; | |
f427ee49 | 7215 | } |
f427ee49 | 7216 | |
f427ee49 | 7217 | /* |
c3c9b80d A |
7218 | * We will call get_btlog_records() below which populates this region while holding a spinlock |
7219 | * (the btlog lock). So these pages need to be wired. | |
f427ee49 | 7220 | */ |
c3c9b80d A |
7221 | kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size, |
7222 | VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE); | |
7223 | assert(kr == KERN_SUCCESS); | |
0a7de745 | 7224 | |
c3c9b80d A |
7225 | recs = (zone_btrecord_t *)recs_addr; |
7226 | get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs); | |
fe8ab488 | 7227 | |
c3c9b80d A |
7228 | kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE); |
7229 | assert(kr == KERN_SUCCESS); | |
0a7de745 | 7230 | |
c3c9b80d A |
7231 | *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs); |
7232 | *recsCntp = numrecs; | |
0a7de745 | 7233 | |
c3c9b80d | 7234 | return KERN_SUCCESS; |
cb323159 | 7235 | |
c3c9b80d A |
7236 | #else /* DEBUG || DEVELOPMENT */ |
7237 | #pragma unused(host, name, recsp, recsCntp) | |
7238 | return KERN_FAILURE; | |
7239 | #endif /* DEBUG || DEVELOPMENT */ | |
7240 | } | |
7241 | ||
7242 | ||
7243 | #if DEBUG || DEVELOPMENT | |
7244 | ||
7245 | kern_return_t | |
7246 | mach_memory_info_check(void) | |
7247 | { | |
7248 | mach_memory_info_t * memory_info; | |
7249 | mach_memory_info_t * info; | |
7250 | unsigned int num_info; | |
7251 | vm_offset_t memory_info_addr; | |
7252 | kern_return_t kr; | |
7253 | size_t memory_info_size, memory_info_vmsize; | |
7254 | uint64_t top_wired, zonestotal, total; | |
5ba3f43e | 7255 | |
c3c9b80d A |
7256 | num_info = vm_page_diagnose_estimate(); |
7257 | memory_info_size = num_info * sizeof(*memory_info); | |
7258 | memory_info_vmsize = round_page(memory_info_size); | |
7259 | kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG); | |
7260 | assert(kr == KERN_SUCCESS); | |
0a7de745 | 7261 | |
c3c9b80d A |
7262 | memory_info = (mach_memory_info_t *) memory_info_addr; |
7263 | vm_page_diagnose(memory_info, num_info, 0); | |
0a7de745 | 7264 | |
c3c9b80d A |
7265 | top_wired = total = zonestotal = 0; |
7266 | zone_foreach(z) { | |
7267 | zonestotal += zone_size_wired(z); | |
f427ee49 | 7268 | } |
3e170ce0 | 7269 | |
c3c9b80d A |
7270 | for (uint32_t idx = 0; idx < num_info; idx++) { |
7271 | info = &memory_info[idx]; | |
7272 | if (!info->size) { | |
7273 | continue; | |
7274 | } | |
7275 | if (VM_KERN_COUNT_WIRED == info->site) { | |
7276 | top_wired = info->size; | |
7277 | } | |
7278 | if (VM_KERN_SITE_HIDE & info->flags) { | |
7279 | continue; | |
7280 | } | |
7281 | if (!(VM_KERN_SITE_WIRED & info->flags)) { | |
7282 | continue; | |
f427ee49 | 7283 | } |
c3c9b80d | 7284 | total += info->size; |
f427ee49 | 7285 | } |
c3c9b80d | 7286 | total += zonestotal; |
39037602 | 7287 | |
c3c9b80d A |
7288 | printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n", |
7289 | total, top_wired, zonestotal, top_wired - total); | |
fe8ab488 | 7290 | |
c3c9b80d | 7291 | kmem_free(kernel_map, memory_info_addr, memory_info_vmsize); |
f427ee49 | 7292 | |
c3c9b80d | 7293 | return kr; |
f427ee49 A |
7294 | } |
7295 | ||
c3c9b80d | 7296 | extern boolean_t(*volatile consider_buffer_cache_collect)(int); |
f427ee49 | 7297 | |
c3c9b80d | 7298 | #endif /* DEBUG || DEVELOPMENT */ |
1c79356b | 7299 | |
c3c9b80d A |
7300 | kern_return_t |
7301 | mach_zone_force_gc( | |
7302 | host_t host) | |
7303 | { | |
7304 | if (host == HOST_NULL) { | |
7305 | return KERN_INVALID_HOST; | |
6d2010ae | 7306 | } |
0a7de745 | 7307 | |
c3c9b80d A |
7308 | #if DEBUG || DEVELOPMENT |
7309 | /* Callout to buffer cache GC to drop elements in the apfs zones */ | |
7310 | if (consider_buffer_cache_collect != NULL) { | |
7311 | (void)(*consider_buffer_cache_collect)(0); | |
0a7de745 | 7312 | } |
c3c9b80d A |
7313 | zone_gc(ZONE_GC_DRAIN); |
7314 | #endif /* DEBUG || DEVELOPMENT */ | |
7315 | return KERN_SUCCESS; | |
7316 | } | |
0b4e3aa0 | 7317 | |
c3c9b80d A |
7318 | zone_t |
7319 | zone_find_largest(void) | |
7320 | { | |
7321 | uint32_t largest_idx = 0; | |
7322 | vm_offset_t largest_size = zone_size_wired(&zone_array[0]); | |
39037602 | 7323 | |
c3c9b80d A |
7324 | zone_index_foreach(i) { |
7325 | vm_offset_t size = zone_size_wired(&zone_array[i]); | |
7326 | if (size > largest_size) { | |
7327 | largest_idx = i; | |
7328 | largest_size = size; | |
39037602 | 7329 | } |
fe8ab488 A |
7330 | } |
7331 | ||
c3c9b80d A |
7332 | return &zone_array[largest_idx]; |
7333 | } | |
5ba3f43e | 7334 | |
c3c9b80d A |
7335 | #endif /* !ZALLOC_TEST */ |
7336 | #pragma mark zone creation, configuration, destruction | |
7337 | #if !ZALLOC_TEST | |
d9a64523 | 7338 | |
c3c9b80d A |
7339 | static zone_t |
7340 | zone_init_defaults(zone_id_t zid) | |
7341 | { | |
7342 | zone_t z = &zone_array[zid]; | |
5ba3f43e | 7343 | |
c3c9b80d A |
7344 | z->z_wired_max = ~0u; |
7345 | z->collectable = true; | |
7346 | z->expandable = true; | |
7347 | z->z_submap_idx = Z_SUBMAP_IDX_GENERAL; | |
a39ff7e2 | 7348 | |
c3c9b80d A |
7349 | lck_spin_init(&z->z_lock, &zone_locks_grp, LCK_ATTR_NULL); |
7350 | STAILQ_INIT(&z->z_recirc); | |
7351 | return z; | |
1c79356b A |
7352 | } |
7353 | ||
c3c9b80d A |
7354 | static bool |
7355 | zone_is_initializing(zone_t z) | |
fe8ab488 | 7356 | { |
c3c9b80d | 7357 | return !z->z_self && !z->z_destroyed; |
fe8ab488 A |
7358 | } |
7359 | ||
c3c9b80d A |
7360 | void |
7361 | zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx) | |
fe8ab488 | 7362 | { |
c3c9b80d A |
7363 | if (!zone_is_initializing(zone)) { |
7364 | panic("%s: called after zone_create()", __func__); | |
7365 | } | |
7366 | if (sub_map_idx > zone_last_submap_idx) { | |
7367 | panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx); | |
7368 | } | |
7369 | zone->z_submap_idx = sub_map_idx; | |
fe8ab488 A |
7370 | } |
7371 | ||
c3c9b80d A |
7372 | void |
7373 | zone_set_noexpand(zone_t zone, vm_size_t nelems) | |
1c79356b | 7374 | { |
c3c9b80d A |
7375 | if (!zone_is_initializing(zone)) { |
7376 | panic("%s: called after zone_create()", __func__); | |
7377 | } | |
7378 | zone->expandable = false; | |
7379 | zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems); | |
5ba3f43e A |
7380 | } |
7381 | ||
c3c9b80d A |
7382 | void |
7383 | zone_set_exhaustible(zone_t zone, vm_size_t nelems) | |
5ba3f43e | 7384 | { |
c3c9b80d A |
7385 | if (!zone_is_initializing(zone)) { |
7386 | panic("%s: called after zone_create()", __func__); | |
7387 | } | |
7388 | zone->expandable = false; | |
7389 | zone->exhaustible = true; | |
7390 | zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems); | |
1c79356b A |
7391 | } |
7392 | ||
c3c9b80d A |
7393 | /** |
7394 | * @function zone_create_find | |
7395 | * | |
7396 | * @abstract | |
7397 | * Finds an unused zone for the given name and element size. | |
7398 | * | |
7399 | * @param name the zone name | |
7400 | * @param size the element size (including redzones, ...) | |
7401 | * @param flags the flags passed to @c zone_create* | |
7402 | * @param zid_inout the desired zone ID or ZONE_ID_ANY | |
7403 | * | |
7404 | * @returns a zone to initialize further. | |
7405 | */ | |
7406 | static zone_t | |
7407 | zone_create_find( | |
7408 | const char *name, | |
7409 | vm_size_t size, | |
7410 | zone_create_flags_t flags, | |
7411 | zone_id_t *zid_inout) | |
1c79356b | 7412 | { |
c3c9b80d A |
7413 | zone_id_t nzones, zid = *zid_inout; |
7414 | zone_t z; | |
f427ee49 | 7415 | |
c3c9b80d | 7416 | simple_lock(&all_zones_lock, &zone_locks_grp); |
f427ee49 | 7417 | |
c3c9b80d A |
7418 | nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed); |
7419 | assert(num_zones_in_use <= nzones && nzones < MAX_ZONES); | |
f427ee49 | 7420 | |
c3c9b80d A |
7421 | if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) { |
7422 | /* | |
7423 | * The first time around, make sure the reserved zone IDs | |
7424 | * have an initialized lock as zone_index_foreach() will | |
7425 | * enumerate them. | |
7426 | */ | |
7427 | while (nzones < ZONE_ID__FIRST_DYNAMIC) { | |
7428 | zone_init_defaults(nzones++); | |
7429 | } | |
7430 | ||
7431 | os_atomic_store(&num_zones, nzones, release); | |
7432 | } | |
7433 | ||
7434 | if (zid != ZONE_ID_ANY) { | |
7435 | if (zid >= ZONE_ID__FIRST_DYNAMIC) { | |
7436 | panic("zone_create: invalid desired zone ID %d for %s", | |
7437 | zid, name); | |
7438 | } | |
7439 | if (flags & ZC_DESTRUCTIBLE) { | |
7440 | panic("zone_create: ID %d (%s) must be permanent", zid, name); | |
7441 | } | |
7442 | if (zone_array[zid].z_self) { | |
7443 | panic("zone_create: creating zone ID %d (%s) twice", zid, name); | |
7444 | } | |
7445 | z = &zone_array[zid]; | |
7446 | } else { | |
7447 | if (flags & ZC_DESTRUCTIBLE) { | |
7448 | /* | |
7449 | * If possible, find a previously zdestroy'ed zone in the | |
7450 | * zone_array that we can reuse. | |
7451 | */ | |
7452 | for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES); | |
7453 | i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) { | |
7454 | z = &zone_array[i]; | |
f427ee49 | 7455 | |
c3c9b80d A |
7456 | /* |
7457 | * If the zone name and the element size are the | |
7458 | * same, we can just reuse the old zone struct. | |
7459 | */ | |
7460 | if (strcmp(z->z_name, name) || zone_elem_size(z) != size) { | |
7461 | continue; | |
7462 | } | |
7463 | bitmap_clear(zone_destroyed_bitmap, i); | |
7464 | z->z_destroyed = false; | |
7465 | z->z_self = z; | |
7466 | zid = (zone_id_t)i; | |
7467 | goto out; | |
f427ee49 | 7468 | } |
f427ee49 A |
7469 | } |
7470 | ||
c3c9b80d A |
7471 | zid = nzones++; |
7472 | z = zone_init_defaults(zid); | |
f427ee49 | 7473 | |
c3c9b80d A |
7474 | /* |
7475 | * The release barrier pairs with the acquire in | |
7476 | * zone_index_foreach() and makes sure that enumeration loops | |
7477 | * always see an initialized zone lock. | |
7478 | */ | |
7479 | os_atomic_store(&num_zones, nzones, release); | |
f427ee49 A |
7480 | } |
7481 | ||
c3c9b80d A |
7482 | out: |
7483 | num_zones_in_use++; | |
7484 | simple_unlock(&all_zones_lock); | |
f427ee49 | 7485 | |
c3c9b80d A |
7486 | *zid_inout = zid; |
7487 | return z; | |
1c79356b A |
7488 | } |
7489 | ||
c3c9b80d A |
7490 | __abortlike |
7491 | static void | |
7492 | zone_create_panic(const char *name, const char *f1, const char *f2) | |
d9a64523 | 7493 | { |
c3c9b80d A |
7494 | panic("zone_create: creating zone %s: flag %s and %s are incompatible", |
7495 | name, f1, f2); | |
d9a64523 | 7496 | } |
c3c9b80d A |
7497 | #define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \ |
7498 | if ((flags) & forbidden_flag) { \ | |
7499 | zone_create_panic(name, #current_flag, #forbidden_flag); \ | |
7500 | } | |
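/*
 * Illustrative note (not in the original source): a call such as
 *
 *	zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN);
 *
 * expands to a check that panics with the stringized flag names, e.g.
 * "flag ZC_PERCPU and ZC_ALLOW_FOREIGN are incompatible", which is how the
 * flag handling in zone_create_ext() below reports conflicting creation flags.
 */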
d9a64523 | 7501 | |
c3c9b80d A |
7502 | /* |
7503 | * Adjusts the size of the element based on minimum size, alignment | |
7504 | * and kasan redzones | |
7505 | */ | |
7506 | static vm_size_t | |
7507 | zone_elem_adjust_size( | |
7508 | const char *name __unused, | |
7509 | vm_size_t elem_size, | |
7510 | zone_create_flags_t flags __unused, | |
7511 | uint32_t *redzone __unused) | |
d9a64523 | 7512 | { |
c3c9b80d A |
7513 | vm_size_t size; |
7514 | /* | |
7515 | * Adjust element size for minimum size and pointer alignment | |
7516 | */ | |
7517 | size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t); | |
7518 | if (size < ZONE_MIN_ELEM_SIZE) { | |
7519 | size = ZONE_MIN_ELEM_SIZE; | |
f427ee49 | 7520 | } |
d9a64523 | 7521 | |
c3c9b80d A |
7522 | #if KASAN_ZALLOC |
7523 | /* | |
7524 | * Expand the zone allocation size to include the redzones. | |
7525 | * | |
7526 | * For page-multiple zones add a full guard page because they | |
7527 | * likely require alignment. | |
7528 | */ | |
7529 | uint32_t redzone_tmp; | |
7530 | if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) { | |
7531 | redzone_tmp = 0; | |
7532 | } else if ((size & PAGE_MASK) == 0) { | |
7533 | if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) { | |
7534 | panic("zone_create: zone %s can't provide more than PAGE_SIZE" | |
7535 | "alignment", name); | |
7536 | } | |
7537 | redzone_tmp = PAGE_SIZE; | |
7538 | } else if (flags & ZC_ALIGNMENT_REQUIRED) { | |
7539 | redzone_tmp = 0; | |
7540 | } else { | |
7541 | redzone_tmp = KASAN_GUARD_SIZE; | |
7542 | } | |
7543 | size += redzone_tmp * 2; | |
7544 | if (redzone) { | |
7545 | *redzone = redzone_tmp; | |
7546 | } | |
7547 | #endif | |
7548 | return size; | |
f427ee49 | 7549 | } |
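/*
 * Illustrative note (not in the original source): a worked example of the
 * rounding above, assuming an LP64 kernel where sizeof(vm_offset_t) == 8:
 *
 *	elem_size = 20
 *	size = (20 + 8 - 1) & -8 = 24;        // pointer-aligned
 *	if (24 < ZONE_MIN_ELEM_SIZE)          // bumped up if still too small
 *		size = ZONE_MIN_ELEM_SIZE;
 *	// with KASAN_ZALLOC, 2 * KASAN_GUARD_SIZE is then added for redzones
 */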
fe8ab488 | 7550 | |
c3c9b80d A |
7551 | /* |
7552 | * Returns the allocation chunk size that has the least fragmentation | 
7553 | */ | |
7554 | static vm_size_t | |
7555 | zone_get_min_alloc_granule( | |
7556 | vm_size_t elem_size, | |
7557 | zone_create_flags_t flags) | |
0b4e3aa0 | 7558 | { |
c3c9b80d A |
7559 | vm_size_t alloc_granule = PAGE_SIZE; |
7560 | if (flags & ZC_PERCPU) { | |
7561 | alloc_granule = PAGE_SIZE * zpercpu_count(); | |
7562 | if (PAGE_SIZE % elem_size > 256) { | |
7563 | panic("zone_create: per-cpu zone has too much fragmentation"); | |
5ba3f43e | 7564 | } |
c3c9b80d A |
7565 | } else if ((elem_size & PAGE_MASK) == 0) { |
7566 | /* zero fragmentation by definition */ | |
7567 | alloc_granule = elem_size; | |
7568 | } else if (alloc_granule % elem_size == 0) { | |
7569 | /* zero fragmentation by definition */ | |
7570 | } else { | |
7571 | vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule; | |
7572 | vm_size_t alloc_tmp = PAGE_SIZE; | |
7573 | while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) { | |
7574 | vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp; | |
7575 | if (frag_tmp < frag) { | |
7576 | frag = frag_tmp; | |
7577 | alloc_granule = alloc_tmp; | |
7578 | } | |
39236c6e | 7579 | } |
39236c6e | 7580 | } |
c3c9b80d | 7581 | return alloc_granule; |
0b4e3aa0 A |
7582 | } |
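/*
 * Illustrative note (not in the original source): how the loop above picks a
 * chunk size, assuming 4K pages and a hypothetical 384-byte element:
 *
 *	1 page:  4096  % 384 = 256 wasted  (~6%)
 *	2 pages: 8192  % 384 = 128 wasted  (~1%)
 *	3 pages: 12288 % 384 = 0 wasted    (0%)
 *
 * so the allocation granule becomes 3 pages; per-cpu zones and page-multiple
 * element sizes short-circuit this search, as handled earlier in the function.
 */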
7583 | ||
c3c9b80d A |
7584 | vm_size_t |
7585 | zone_get_foreign_alloc_size( | |
7586 | const char *name __unused, | |
7587 | vm_size_t elem_size, | |
7588 | zone_create_flags_t flags, | |
7589 | uint16_t min_pages) | |
d9a64523 | 7590 | { |
c3c9b80d A |
7591 | vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags, |
7592 | NULL); | |
7593 | vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size, | |
7594 | flags); | |
7595 | vm_size_t min_size = min_pages * PAGE_SIZE; | |
7596 | /* | |
7597 | * Round up min_size to a multiple of alloc_granule | |
7598 | */ | |
7599 | return ((min_size + alloc_granule - 1) / alloc_granule) | |
7600 | * alloc_granule; | |
7601 | } | |
d9a64523 | 7602 | |
c3c9b80d A |
7603 | zone_t |
7604 | zone_create_ext( | |
7605 | const char *name, | |
7606 | vm_size_t size, | |
7607 | zone_create_flags_t flags, | |
7608 | zone_id_t zid, | |
7609 | void (^extra_setup)(zone_t)) | |
7610 | { | |
7611 | vm_size_t alloc; | |
7612 | uint32_t redzone; | |
7613 | zone_t z; | |
d9a64523 | 7614 | |
c3c9b80d A |
7615 | if (size > ZONE_MAX_ALLOC_SIZE) { |
7616 | panic("zone_create: element size too large: %zd", (size_t)size); | |
d9a64523 | 7617 | } |
c910b4d9 | 7618 | |
c3c9b80d A |
7619 | if (size < 2 * sizeof(vm_size_t)) { |
7620 | /* Elements are too small for kasan. */ | |
7621 | flags |= ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE; | |
5ba3f43e | 7622 | } |
5ba3f43e | 7623 | |
c3c9b80d | 7624 | size = zone_elem_adjust_size(name, size, flags, &redzone); |
c910b4d9 | 7625 | /* |
c3c9b80d | 7626 | * Allocate the zone slot, return early if we found an older match. |
f427ee49 | 7627 | */ |
c3c9b80d A |
7628 | z = zone_create_find(name, size, flags, &zid); |
7629 | if (__improbable(z->z_self)) { | |
7630 | /* We found a zone to reuse */ | |
7631 | return z; | |
7632 | } | |
f427ee49 A |
7633 | |
7634 | /* | |
c3c9b80d | 7635 | * Initialize the zone properly. |
c910b4d9 A |
7636 | */ |
7637 | ||
c3c9b80d A |
7638 | /* |
7639 | * If the kernel is post lockdown, copy the zone name passed in. | |
7640 | * Else simply maintain a pointer to the name string as it can only | |
7641 | * be a core XNU zone (no unloadable kext exists before lockdown). | |
7642 | */ | |
7643 | if (startup_phase >= STARTUP_SUB_LOCKDOWN) { | |
7644 | size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN); | |
7645 | char *buf = zalloc_permanent(nsz, ZALIGN_NONE); | |
7646 | strlcpy(buf, name, nsz); | |
7647 | z->z_name = buf; | |
7648 | } else { | |
7649 | z->z_name = name; | |
0a7de745 | 7650 | } |
c3c9b80d A |
7651 | if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) { |
7652 | z->z_stats = zalloc_percpu_permanent_type(struct zone_stats); | |
7653 | } else { | |
7654 | /* | |
7655 | * zone_init() hasn't run yet, use the storage provided by | |
7656 | * zone_stats_startup(), and zone_init() will replace it | |
7657 | * with the final value once the PERCPU zone exists. | |
7658 | */ | |
7659 | z->z_stats = __zpcpu_mangle_for_boot(&zone_stats_startup[zone_index(z)]); | |
0a7de745 | 7660 | } |
55e303ae | 7661 | |
c3c9b80d A |
7662 | alloc = zone_get_min_alloc_granule(size, flags); |
7663 | ||
7664 | if (flags & ZC_KALLOC_HEAP) { | |
7665 | size_t rem = (alloc % size) / (alloc / size); | |
7666 | ||
7667 | /* | |
7668 | * Try to grow the element size and spread the elements out more if the remaining | 
7669 | * space is large enough. | |
7670 | */ | |
7671 | size += rem & ~(KALLOC_MINALIGN - 1); | |
f427ee49 | 7672 | } |
316670eb | 7673 | |
c3c9b80d A |
7674 | z->z_elem_size = (uint16_t)size; |
7675 | z->z_chunk_pages = (uint16_t)atop(alloc); | |
7676 | if (flags & ZC_PERCPU) { | |
7677 | z->z_chunk_elems = (uint16_t)(PAGE_SIZE / z->z_elem_size); | |
f427ee49 | 7678 | } else { |
c3c9b80d A |
7679 | z->z_chunk_elems = (uint16_t)(alloc / z->z_elem_size); |
7680 | } | |
7681 | if (zone_element_idx(zone_element_encode(0, | |
7682 | z->z_chunk_elems - 1, ZPM_AUTO)) != z->z_chunk_elems - 1) { | |
7683 | panic("zone_element_encode doesn't work for zone [%s]", name); | |
39037602 | 7684 | } |
1c79356b | 7685 | |
f427ee49 | 7686 | #if KASAN_ZALLOC |
c3c9b80d A |
7687 | z->z_kasan_redzone = redzone; |
7688 | if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) { | |
7689 | z->kasan_fakestacks = true; | |
fe8ab488 | 7690 | } |
c3c9b80d A |
7691 | #endif |
7692 | ||
c910b4d9 | 7693 | /* |
c3c9b80d | 7694 | * Handle KPI flags |
c910b4d9 | 7695 | */ |
c3c9b80d A |
7696 | #if __LP64__ |
7697 | if (flags & ZC_SEQUESTER) { | |
7698 | z->z_va_sequester = true; | |
7699 | } | |
f427ee49 | 7700 | #endif |
c3c9b80d A |
7701 | /* ZC_CACHING applied after all configuration is done */ |
7702 | if (flags & ZC_NOCACHING) { | |
7703 | z->z_nocaching = true; | |
7704 | } | |
7705 | ||
7706 | if (flags & ZC_PERCPU) { | |
7707 | /* | |
7708 | * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for | |
7709 | * pointer-sized allocations which poisoning doesn't support. | |
7710 | */ | |
7711 | zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN); | |
7712 | z->z_percpu = true; | |
7713 | z->gzalloc_exempt = true; | |
7714 | z->z_free_zeroes = true; | |
7715 | } | |
7716 | if (flags & ZC_ZFREE_CLEARMEM) { | |
7717 | z->z_free_zeroes = true; | |
7718 | } | |
7719 | if (flags & ZC_NOGC) { | |
7720 | z->collectable = false; | |
7721 | } | |
7722 | if (flags & ZC_NOENCRYPT) { | |
7723 | z->z_noencrypt = true; | |
7724 | } | |
7725 | if (flags & ZC_ALIGNMENT_REQUIRED) { | |
7726 | z->alignment_required = true; | |
7727 | } | |
7728 | if (flags & ZC_NOGZALLOC) { | |
7729 | z->gzalloc_exempt = true; | |
7730 | } | |
7731 | if (flags & ZC_NOCALLOUT) { | |
7732 | z->no_callout = true; | |
7733 | } | |
7734 | if (flags & ZC_DESTRUCTIBLE) { | |
7735 | zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN); | |
7736 | z->z_destructible = true; | |
7737 | } | |
c910b4d9 | 7738 | |
f427ee49 | 7739 | /* |
c3c9b80d | 7740 | * Handle Internal flags |
f427ee49 | 7741 | */ |
c3c9b80d A |
7742 | if (flags & ZC_ALLOW_FOREIGN) { |
7743 | z->z_allows_foreign = true; | |
c910b4d9 | 7744 | } |
c3c9b80d A |
7745 | if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) && |
7746 | (flags & ZC_DATA_BUFFERS)) { | |
7747 | z->z_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES; | |
7748 | } | |
7749 | if (flags & ZC_KASAN_NOQUARANTINE) { | |
7750 | z->kasan_noquarantine = true; | |
7751 | } | |
7752 | /* ZC_KASAN_NOREDZONE already handled */ | |
c910b4d9 | 7753 | |
f427ee49 | 7754 | /* |
c3c9b80d | 7755 | * Then if there's extra tuning, do it |
f427ee49 | 7756 | */ |
c3c9b80d A |
7757 | if (extra_setup) { |
7758 | extra_setup(z); | |
0a7de745 | 7759 | } |
d9a64523 | 7760 | |
c3c9b80d A |
7761 | /* |
7762 | * Configure debugging features | |
7763 | */ | |
f427ee49 | 7764 | #if CONFIG_GZALLOC |
c3c9b80d A |
7765 | gzalloc_zone_init(z); /* might set z->gzalloc_tracked */ |
7766 | if (z->gzalloc_tracked) { | |
7767 | z->z_nocaching = true; | |
1c79356b | 7768 | } |
c3c9b80d | 7769 | #endif |
f427ee49 | 7770 | #if ZONE_ENABLE_LOGGING |
c3c9b80d A |
7771 | if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) { |
7772 | /* | |
7773 | * Check for and set up zone leak detection if requested via boot-args. | |
7774 | * might set z->zone_logging | |
7775 | */ | |
7776 | zone_setup_logging(z); | |
5ba3f43e | 7777 | } |
f427ee49 | 7778 | #endif /* ZONE_ENABLE_LOGGING */ |
c3c9b80d A |
7779 | #if VM_MAX_TAG_ZONES |
7780 | if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) { | |
7781 | static int tag_zone_index; | |
7782 | vm_offset_t esize = zone_elem_size(z); | |
7783 | z->tags = true; | |
7784 | z->tags_inline = (((page_size + esize - 1) / esize) <= | |
7785 | (sizeof(uint32_t) / sizeof(uint16_t))); | |
7786 | z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed); | |
7787 | assert(z->tag_zone_index < VM_MAX_TAG_ZONES); | |
d9a64523 | 7788 | } |
c3c9b80d | 7789 | #endif |
0b4e3aa0 | 7790 | |
c3c9b80d A |
7791 | /* |
7792 | * Finally, fixup properties based on security policies, boot-args, ... | |
7793 | */ | |
7794 | if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) && | |
7795 | z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) { | |
7796 | z->z_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES; | |
6d2010ae | 7797 | } |
c3c9b80d A |
7798 | #if __LP64__ |
7799 | if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) && | |
7800 | (flags & ZC_NOSEQUESTER) == 0 && | |
7801 | z->z_submap_idx == Z_SUBMAP_IDX_GENERAL) { | |
7802 | z->z_va_sequester = true; | |
7803 | } | |
7804 | #endif | |
7805 | /* | |
7806 | * Clear the entire element for non-data zones and up to zp_min_size for | 
7807 | * data zones. | |
7808 | */ | |
7809 | if (z->z_submap_idx != Z_SUBMAP_IDX_BAG_OF_BYTES) { | |
7810 | z->z_free_zeroes = true; | |
7811 | } else if (size <= zp_min_size) { | |
7812 | z->z_free_zeroes = true; | |
5ba3f43e | 7813 | } |
f427ee49 | 7814 | |
c3c9b80d | 7815 | if ((flags & ZC_CACHING) && !z->z_nocaching) { |
eb6b6ca3 | 7816 | /* |
c3c9b80d | 7817 | * If zcache hasn't been initialized yet, remember our decision, |
f427ee49 | 7818 | * |
c3c9b80d A |
7819 | * zone_enable_caching() will be called again by |
7820 | * zcache_bootstrap(), while the system is still single | |
7821 | * threaded, to build the missing caches. | |
cb323159 | 7822 | */ |
c3c9b80d A |
7823 | if (__probable(zc_magazine_zone)) { |
7824 | zone_enable_caching(z); | |
f427ee49 | 7825 | } else { |
c3c9b80d A |
7826 | z->z_pcpu_cache = |
7827 | __zpcpu_mangle_for_boot(&zone_cache_startup[zid]); | |
f427ee49 | 7828 | } |
5ba3f43e | 7829 | } |
1c79356b | 7830 | |
c3c9b80d A |
7831 | if (zp_factor != 0 && !z->z_free_zeroes) { |
7832 | if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) { | |
7833 | zpercpu_foreach(zs, z->z_stats) { | |
7834 | zs->zs_poison_seqno = zone_poison_count_init(z); | |
7835 | } | |
7836 | } else { | |
7837 | zone_stats_startup[zid].zs_poison_seqno = | |
7838 | zone_poison_count_init(z); | |
1c79356b | 7839 | } |
39037602 | 7840 | } |
316670eb | 7841 | |
c3c9b80d A |
7842 | zone_lock(z); |
7843 | z->z_self = z; | |
7844 | zone_unlock(z); | |
316670eb | 7845 | |
c3c9b80d A |
7846 | return z; |
7847 | } | |
1c79356b | 7848 | |
c3c9b80d | 7849 | __startup_func |
1c79356b | 7850 | void |
c3c9b80d | 7851 | zone_create_startup(struct zone_create_startup_spec *spec) |
1c79356b | 7852 | { |
c3c9b80d A |
7853 | *spec->z_var = zone_create_ext(spec->z_name, spec->z_size, |
7854 | spec->z_flags, spec->z_zid, spec->z_setup); | |
6d2010ae | 7855 | } |
2d21ac55 | 7856 | |
d9a64523 | 7857 | /* |
c3c9b80d A |
7858 | * The first 4 fields of a zone_view and of a zone alias each other, so that | 
7859 | * the zone_or_view_t union works. Trust, but verify. | 
d9a64523 | 7860 | */ |
c3c9b80d A |
7861 | #define zalloc_check_zov_alias(f1, f2) \ |
7862 | static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2)) | |
7863 | zalloc_check_zov_alias(z_self, zv_zone); | |
7864 | zalloc_check_zov_alias(z_stats, zv_stats); | |
7865 | zalloc_check_zov_alias(z_name, zv_name); | |
7866 | zalloc_check_zov_alias(z_views, zv_next); | |
7867 | #undef zalloc_check_zov_alias | |
a39ff7e2 | 7868 | |
c3c9b80d A |
7869 | __startup_func |
7870 | void | |
7871 | zone_view_startup_init(struct zone_view_startup_spec *spec) | |
a39ff7e2 | 7872 | { |
c3c9b80d A |
7873 | struct kalloc_heap *heap = NULL; |
7874 | zone_view_t zv = spec->zv_view; | |
7875 | zone_t z; | |
a39ff7e2 | 7876 | |
c3c9b80d A |
7877 | switch (spec->zv_heapid) { |
7878 | case KHEAP_ID_DEFAULT: | |
7879 | heap = KHEAP_DEFAULT; | |
7880 | break; | |
7881 | case KHEAP_ID_DATA_BUFFERS: | |
7882 | heap = KHEAP_DATA_BUFFERS; | |
7883 | break; | |
7884 | case KHEAP_ID_KEXT: | |
7885 | heap = KHEAP_KEXT; | |
7886 | break; | |
7887 | default: | |
7888 | heap = NULL; | |
a39ff7e2 | 7889 | } |
f427ee49 | 7890 | |
c3c9b80d A |
7891 | if (heap) { |
7892 | z = kalloc_heap_zone_for_size(heap, spec->zv_size); | |
7893 | assert(z); | |
7894 | } else { | |
7895 | z = spec->zv_zone; | |
7896 | assert(spec->zv_size <= zone_elem_size(z)); | |
a39ff7e2 A |
7897 | } |
7898 | ||
c3c9b80d A |
7899 | zv->zv_zone = z; |
7900 | zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats); | |
7901 | zv->zv_next = z->z_views; | |
7902 | if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) { | |
7903 | /* | |
7904 | * count the raw view for zones not in a heap, | |
7905 | * kalloc_heap_init() already counts it for its members. | |
7906 | */ | |
7907 | zone_view_count += 2; | |
7908 | } else { | |
7909 | zone_view_count += 1; | |
a39ff7e2 | 7910 | } |
c3c9b80d | 7911 | z->z_views = zv; |
a39ff7e2 A |
7912 | } |
7913 | ||
c3c9b80d A |
7914 | zone_t |
7915 | zone_create( | |
7916 | const char *name, | |
7917 | vm_size_t size, | |
7918 | zone_create_flags_t flags) | |
316670eb | 7919 | { |
c3c9b80d | 7920 | return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL); |
316670eb A |
7921 | } |
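/*
 * Illustrative sketch (not in the original source): typical use of the KPI
 * above, modeled on run_zone_test() later in this file. The zone name and
 * element type here are hypothetical.
 *
 *	struct widget { uint64_t w_id; uint64_t w_flags; };
 *	static zone_t widget_zone;
 *
 *	widget_zone = zone_create("example.widget", sizeof(struct widget),
 *	    ZC_ZFREE_CLEARMEM);
 *
 *	struct widget *w = zalloc(widget_zone);
 *	...
 *	zfree(widget_zone, w);
 */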
7922 | ||
c3c9b80d A |
7923 | zone_t |
7924 | zinit( | |
7925 | vm_size_t size, /* the size of an element */ | |
7926 | vm_size_t max, /* maximum memory to use */ | |
7927 | vm_size_t alloc __unused, /* allocation size */ | |
7928 | const char *name) /* a name for the zone */ | |
3e170ce0 | 7929 | { |
c3c9b80d A |
7930 | zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE); |
7931 | z->z_wired_max = zone_alloc_pages_for_nelems(z, max / size); | |
7932 | return z; | |
3e170ce0 A |
7933 | } |
7934 | ||
c3c9b80d A |
7935 | void |
7936 | zdestroy(zone_t z) | |
6d2010ae | 7937 | { |
c3c9b80d | 7938 | unsigned int zindex = zone_index(z); |
0a7de745 | 7939 | |
c3c9b80d A |
7940 | current_thread()->options |= TH_OPT_ZONE_PRIV; |
7941 | lck_mtx_lock(&zone_gc_lock); | |
0a7de745 | 7942 | |
c3c9b80d | 7943 | zone_reclaim(z, ZONE_RECLAIM_DESTROY); |
0a7de745 | 7944 | |
c3c9b80d A |
7945 | lck_mtx_unlock(&zone_gc_lock); |
7946 | current_thread()->options &= ~TH_OPT_ZONE_PRIV; | |
0a7de745 | 7947 | |
c3c9b80d A |
7948 | #if CONFIG_GZALLOC |
7949 | if (__improbable(z->gzalloc_tracked)) { | |
7950 | /* If the zone is gzalloc-managed, dump all the elements in the free cache */ | 
7951 | gzalloc_empty_free_cache(z); | |
0a7de745 | 7952 | } |
316670eb | 7953 | #endif |
6d2010ae | 7954 | |
c3c9b80d | 7955 | zone_lock(z); |
6d2010ae | 7956 | |
c3c9b80d A |
7957 | while (!zone_pva_is_null(z->z_pageq_va)) { |
7958 | struct zone_page_metadata *meta; | |
7959 | vm_offset_t free_addr; | |
6d2010ae | 7960 | |
c3c9b80d A |
7961 | zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages); |
7962 | meta = zone_meta_queue_pop_native(z, &z->z_pageq_va, &free_addr); | |
7963 | assert(meta->zm_chunk_len <= ZM_CHUNK_LEN_MAX); | |
7964 | bzero(meta, sizeof(*meta) * z->z_chunk_pages); | |
7965 | zone_unlock(z); | |
7966 | kmem_free(zone_submap(z), free_addr, ptoa(z->z_chunk_pages)); | |
7967 | zone_lock(z); | |
0a7de745 | 7968 | } |
6d2010ae | 7969 | |
c3c9b80d A |
7970 | #if !KASAN_ZALLOC |
7971 | /* Assert that all counts are zero */ | |
7972 | if (z->z_elems_avail || z->z_elems_free || | |
7973 | zone_size_wired(z) || z->z_va_cur) { | |
7974 | panic("zdestroy: Zone %s%s isn't empty at zdestroy() time", | |
7975 | zone_heap_name(z), z->z_name); | |
6d2010ae | 7976 | } |
6d2010ae | 7977 | |
c3c9b80d A |
7978 | /* consistency check: make sure everything is indeed empty */ |
7979 | assert(zone_pva_is_null(z->z_pageq_empty)); | |
7980 | assert(zone_pva_is_null(z->z_pageq_partial)); | |
7981 | assert(zone_pva_is_null(z->z_pageq_full)); | |
7982 | assert(zone_pva_is_null(z->z_pageq_va)); | |
7983 | #endif | |
6d2010ae | 7984 | |
c3c9b80d | 7985 | zone_unlock(z); |
6d2010ae | 7986 | |
c3c9b80d | 7987 | simple_lock(&all_zones_lock, &zone_locks_grp); |
6d2010ae | 7988 | |
c3c9b80d A |
7989 | assert(!bitmap_test(zone_destroyed_bitmap, zindex)); |
7990 | /* Mark the zone as empty in the bitmap */ | |
7991 | bitmap_set(zone_destroyed_bitmap, zindex); | |
7992 | num_zones_in_use--; | |
7993 | assert(num_zones_in_use > 0); | |
0a7de745 | 7994 | |
c3c9b80d A |
7995 | simple_unlock(&all_zones_lock); |
7996 | } | |
6d2010ae | 7997 | |
c3c9b80d A |
7998 | #endif /* !ZALLOC_TEST */ |
7999 | #pragma mark zalloc module init | |
8000 | #if !ZALLOC_TEST | |
39037602 | 8001 | |
c3c9b80d A |
8002 | /* |
8003 | * Initialize the "zone of zones" which uses fixed memory allocated | |
8004 | * earlier in memory initialization. zone_bootstrap is called | |
8005 | * before zone_init. | |
8006 | */ | |
8007 | __startup_func | |
8008 | void | |
8009 | zone_bootstrap(void) | |
8010 | { | |
8011 | /* Validate struct zone_packed_virtual_address expectations */ | |
8012 | static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1"); | |
8013 | if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) { | |
8014 | panic("zone_pva_t can't pack a kernel page address in 31 bits"); | |
8015 | } | |
39037602 | 8016 | |
c3c9b80d | 8017 | zpercpu_early_count = ml_early_cpu_max_number() + 1; |
39037602 | 8018 | |
c3c9b80d A |
8019 | /* Set up zone element poisoning */ |
8020 | zp_bootstrap(); | |
0a7de745 | 8021 | |
c3c9b80d A |
8022 | /* |
8023 | * the KASAN quarantine for kalloc doesn't understand heaps | |
8024 | * and trips the heap confusion panics. At the end of the day, | |
8025 | * all these security measures are doing double duty with KASAN. | 
8026 | * | |
8027 | * On 32bit kernels, these protections are just too expensive. | |
8028 | */ | |
8029 | #if !defined(__LP64__) || KASAN_ZALLOC | |
8030 | zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER; | |
8031 | zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA; | |
8032 | zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC; | |
8033 | #endif | |
3e170ce0 | 8034 | |
c3c9b80d A |
8035 | thread_call_setup_with_options(&zone_expand_callout, |
8036 | zone_expand_async, NULL, THREAD_CALL_PRIORITY_HIGH, | |
8037 | THREAD_CALL_OPTIONS_ONCE); | |
8038 | ||
8039 | thread_call_setup_with_options(&zone_defrag_callout, | |
8040 | zone_defrag_async, NULL, THREAD_CALL_PRIORITY_USER, | |
8041 | THREAD_CALL_OPTIONS_ONCE); | |
8042 | } | |
8043 | ||
8044 | #if __LP64__ | |
8045 | #if ARM_LARGE_MEMORY || __x86_64__ | |
8046 | #define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024) | |
8047 | #else | |
8048 | #define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024) | |
8049 | #endif | |
8050 | #endif /* __LP64__ */ | |
3e170ce0 | 8051 | |
c3c9b80d | 8052 | #define ZONE_GUARD_SIZE (64UL << 10) |
6d2010ae | 8053 | |
c3c9b80d A |
8054 | #if __LP64__ |
8055 | static inline vm_offset_t | |
8056 | zone_restricted_va_max(void) | |
a39ff7e2 | 8057 | { |
c3c9b80d A |
8058 | vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR); |
8059 | vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR); | |
a39ff7e2 | 8060 | |
c3c9b80d A |
8061 | return trunc_page(MIN(compressor_max, vm_page_max)); |
8062 | } | |
a39ff7e2 A |
8063 | #endif |
8064 | ||
c3c9b80d A |
8065 | __startup_func |
8066 | static void | |
8067 | zone_tunables_fixup(void) | |
8068 | { | |
8069 | if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) { | |
8070 | zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT; | |
a39ff7e2 | 8071 | } |
c3c9b80d A |
8072 | if (zc_magazine_size > PAGE_SIZE / ZONE_MIN_ELEM_SIZE) { |
8073 | zc_magazine_size = (uint16_t)(PAGE_SIZE / ZONE_MIN_ELEM_SIZE); | |
8074 | } | |
8075 | } | |
8076 | STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup); | |
a39ff7e2 | 8077 | |
c3c9b80d A |
8078 | __startup_func |
8079 | static vm_size_t | |
8080 | zone_phys_size_max(void) | |
8081 | { | |
8082 | vm_size_t zsize; | |
8083 | vm_size_t zsizearg; | |
f427ee49 | 8084 | |
c3c9b80d A |
8085 | if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) { |
8086 | zsize = zsizearg * (1024ULL * 1024); | |
8087 | } else { | |
8088 | /* Set target zone size as 1/4 of physical memory */ | |
8089 | zsize = (vm_size_t)(sane_size >> 2); | |
8090 | #if defined(__LP64__) | |
8091 | zsize += zsize >> 1; | |
8092 | #endif /* __LP64__ */ | |
a39ff7e2 A |
8093 | } |
8094 | ||
c3c9b80d A |
8095 | if (zsize < CONFIG_ZONE_MAP_MIN) { |
8096 | zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */ | |
a39ff7e2 | 8097 | } |
c3c9b80d A |
8098 | if (zsize > sane_size >> 1) { |
8099 | zsize = (vm_size_t)(sane_size >> 1); /* Clamp to half of RAM max */ | |
a39ff7e2 | 8100 | } |
c3c9b80d A |
8101 | if (zsizearg == 0 && zsize > ZONE_MAP_MAX) { |
8102 | /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */ | |
8103 | printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n", | |
8104 | (uintptr_t)zsize, (uintptr_t)ZONE_MAP_MAX); | |
8105 | zsize = ZONE_MAP_MAX; | |
8106 | } | |
8107 | ||
8108 | return (vm_size_t)trunc_page(zsize); | |
a39ff7e2 A |
8109 | } |
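/*
 * Illustrative note (not in the original source): with no "zsize" boot-arg
 * and, say, sane_size = 8GB on an LP64 kernel, the computation above gives
 * 8GB >> 2 = 2GB, plus half of that again = 3GB; this is below the
 * half-of-RAM clamp (4GB), so 3GB becomes the physical budget unless it
 * exceeds ZONE_MAP_MAX for the platform.
 */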
8110 | ||
c3c9b80d A |
8111 | __options_decl(zone_init_allocate_flags_t, unsigned, { |
8112 | ZIA_NONE = 0x00000000, | |
8113 | ZIA_REPLACE = 0x00000001, /* replace a previous non permanent range */ | |
8114 | ZIA_RANDOM = 0x00000002, /* place at a random address */ | |
8115 | ZIA_PERMANENT = 0x00000004, /* permanent allocation */ | |
8116 | ZIA_GUARD = 0x00000008, /* will be used as a guard */ | |
8117 | }); | |
8118 | ||
8119 | __startup_func | |
8120 | static struct zone_map_range | |
8121 | zone_init_allocate_va(vm_map_address_t addr, vm_size_t size, | |
8122 | zone_init_allocate_flags_t flags) | |
a39ff7e2 | 8123 | { |
c3c9b80d A |
8124 | vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; |
8125 | int vm_alloc_flags = 0; | |
8126 | struct zone_map_range r; | |
8127 | kern_return_t kr; | |
8128 | ||
8129 | if (flags & ZIA_REPLACE) { | |
8130 | vm_alloc_flags |= VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE; | |
8131 | } else { | |
8132 | vm_alloc_flags |= VM_FLAGS_ANYWHERE; | |
0a7de745 | 8133 | } |
c3c9b80d A |
8134 | if (flags & ZIA_RANDOM) { |
8135 | vm_alloc_flags |= VM_FLAGS_RANDOM_ADDR; | |
0a7de745 | 8136 | } |
c3c9b80d A |
8137 | if (flags & ZIA_PERMANENT) { |
8138 | vmk_flags.vmkf_permanent = true; | |
a39ff7e2 A |
8139 | } |
8140 | ||
c3c9b80d | 8141 | vm_object_reference(kernel_object); |
a39ff7e2 | 8142 | |
c3c9b80d A |
8143 | kr = vm_map_enter(kernel_map, &addr, size, 0, |
8144 | vm_alloc_flags, vmk_flags, VM_KERN_MEMORY_ZONE, | |
8145 | kernel_object, 0, FALSE, | |
8146 | (flags & ZIA_GUARD) ? VM_PROT_NONE : VM_PROT_DEFAULT, | |
8147 | (flags & ZIA_GUARD) ? VM_PROT_NONE : VM_PROT_DEFAULT, | |
8148 | VM_INHERIT_NONE); | |
5ba3f43e | 8149 | |
c3c9b80d A |
8150 | if (KERN_SUCCESS != kr) { |
8151 | panic("vm_map_enter(0x%zx) failed: %d", (size_t)size, kr); | |
5ba3f43e A |
8152 | } |
8153 | ||
c3c9b80d A |
8154 | r.min_address = (vm_offset_t)addr; |
8155 | r.max_address = (vm_offset_t)addr + size; | |
8156 | return r; | |
5ba3f43e A |
8157 | } |
8158 | ||
c3c9b80d A |
8159 | __startup_func |
8160 | static void | |
8161 | zone_submap_init( | |
8162 | vm_offset_t *submap_min, | |
8163 | unsigned idx, | |
8164 | uint64_t zone_sub_map_numer, | |
8165 | uint64_t *remaining_denom, | |
8166 | vm_offset_t *remaining_size, | |
8167 | vm_size_t guard_size) | |
d9a64523 | 8168 | { |
c3c9b80d A |
8169 | vm_offset_t submap_start, submap_end; |
8170 | vm_size_t submap_size; | |
8171 | vm_map_t submap; | |
d9a64523 | 8172 | kern_return_t kr; |
d9a64523 | 8173 | |
c3c9b80d A |
8174 | submap_size = trunc_page(zone_sub_map_numer * *remaining_size / |
8175 | *remaining_denom); | |
8176 | submap_start = *submap_min; | |
8177 | submap_end = submap_start + submap_size; | |
d9a64523 | 8178 | |
c3c9b80d A |
8179 | #if defined(__LP64__) |
8180 | if (idx == Z_SUBMAP_IDX_VA_RESTRICTED) { | |
8181 | vm_offset_t restricted_va_max = zone_restricted_va_max(); | |
8182 | if (submap_end > restricted_va_max) { | |
8183 | #if DEBUG || DEVELOPMENT | |
8184 | printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx, | |
8185 | (size_t)(restricted_va_max - submap_start) >> 20, | |
8186 | (size_t)submap_size >> 20); | |
8187 | #endif /* DEBUG || DEVELOPMENT */ | |
8188 | guard_size += submap_end - restricted_va_max; | |
8189 | *remaining_size -= submap_end - restricted_va_max; | |
8190 | submap_end = restricted_va_max; | |
8191 | submap_size = restricted_va_max - submap_start; | |
8192 | } | |
d9a64523 | 8193 | |
c3c9b80d A |
8194 | vm_packing_verify_range("vm_compressor", |
8195 | submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR)); | |
8196 | vm_packing_verify_range("vm_page", | |
8197 | submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR)); | |
8198 | } | |
8199 | #endif /* defined(__LP64__) */ | |
d9a64523 | 8200 | |
c3c9b80d A |
8201 | vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; |
8202 | vmk_flags.vmkf_permanent = TRUE; | |
8203 | kr = kmem_suballoc(kernel_map, submap_min, submap_size, | |
8204 | FALSE, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, vmk_flags, | |
8205 | VM_KERN_MEMORY_ZONE, &submap); | |
0a7de745 | 8206 | if (kr != KERN_SUCCESS) { |
c3c9b80d A |
8207 | panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d", |
8208 | idx, (void *)submap_start, (void *)submap_end, kr); | |
0a7de745 | 8209 | } |
d9a64523 | 8210 | |
c3c9b80d A |
8211 | #if DEBUG || DEVELOPMENT |
8212 | printf("zone_init: submap[%d] %p:%p (%zuM)\n", | |
8213 | idx, (void *)submap_start, (void *)submap_end, | |
8214 | (size_t)submap_size >> 20); | |
8215 | #endif /* DEBUG || DEVELOPMENT */ | |
d9a64523 | 8216 | |
c3c9b80d A |
8217 | zone_init_allocate_va(submap_end, guard_size, |
8218 | ZIA_PERMANENT | ZIA_GUARD | ZIA_REPLACE); | |
8219 | ||
8220 | zone_submaps[idx] = submap; | |
8221 | *submap_min = submap_end + guard_size; | |
8222 | *remaining_size -= submap_size; | |
8223 | *remaining_denom -= zone_sub_map_numer; | |
8224 | } | |
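/*
 * Illustrative note (not in the original source): the running
 * numerator/denominator above preserves the configured ratios as submaps are
 * carved out one after another. With the LP64 default ratios of 20/40/40
 * (user-data submap enabled) and 100 units of VA remaining:
 *
 *	submap 0: 20 * 100 / 100 = 20 units   (remaining 80, denom 80)
 *	submap 1: 40 *  80 /  80 = 40 units   (remaining 40, denom 40)
 *	submap 2: 40 *  40 /  40 = 40 units
 *
 * so each map ends up with its intended share even though the sizes are
 * recomputed from what is left at each step.
 */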
8225 | ||
8226 | /* | |
8227 | * Allocate metadata array and migrate foreign initial metadata. | |
8228 | * | |
8229 | * So that foreign pages and native pages have the same scheme, | |
8230 | * we allocate VA space that covers both foreign and native pages. | |
8231 | */ | |
8232 | __startup_func | |
8233 | static void | |
8234 | zone_metadata_init(void) | |
8235 | { | |
8236 | struct zone_map_range r0 = zone_info.zi_map_range[0]; | |
8237 | struct zone_map_range r1 = zone_info.zi_map_range[1]; | |
8238 | struct zone_map_range mr, br; | |
8239 | vm_size_t meta_size, bits_size, foreign_base; | |
8240 | vm_offset_t hstart, hend; | |
8241 | ||
8242 | if (r0.min_address > r1.min_address) { | |
8243 | r0 = zone_info.zi_map_range[1]; | |
8244 | r1 = zone_info.zi_map_range[0]; | |
d9a64523 A |
8245 | } |
8246 | ||
c3c9b80d A |
8247 | meta_size = round_page(atop(r1.max_address - r0.min_address) * |
8248 | sizeof(struct zone_page_metadata)) + ZONE_GUARD_SIZE * 2; | |
d9a64523 | 8249 | |
c3c9b80d A |
8250 | /* |
8251 | * Allocations can't be smaller than 8 bytes, which is 128b / 16B per 1k | |
8252 | * of physical memory (16M per 1G). | |
8253 | * | |
8254 | * Let's preallocate for the worst to avoid weird panics. | |
8255 | */ | |
8256 | bits_size = round_page(16 * (ptoa(zone_phys_mapped_max_pages) >> 10)); | |
d9a64523 | 8257 | |
c3c9b80d A |
8258 | /* |
8259 | * Compute the size of the "hole" in the middle of the range. | |
8260 | * | |
8261 | * If it is smaller than 256k, just leave it be, with this layout: | |
8262 | * | |
8263 | * [G][ r0 meta ][ hole ][ r1 meta ][ bits ][G] | |
8264 | * | |
8265 | * else punch a hole with guard pages around the hole, and place the | |
8266 | * bits in the hole if it fits, or after r1 otherwise, yielding either | |
8267 | * of the following layouts: | |
8268 | * | |
8269 | * |__________________hend____________| | |
8270 | * |__hstart_| | | |
8271 | * [G][ r0 meta ][ bits ][G]..........[G][ r1 meta ][G] | |
8272 | * [G][ r0 meta ][G]..................[G][ r1 meta ][ bits ][G] | |
8273 | */ | |
8274 | hstart = round_page(atop(r0.max_address - r0.min_address) * | |
8275 | sizeof(struct zone_page_metadata)); | |
8276 | hend = trunc_page(atop(r1.min_address - r0.min_address) * | |
8277 | sizeof(struct zone_page_metadata)); | |
8278 | ||
8279 | if (hstart >= hend || hend - hstart < (256ul << 10)) { | |
8280 | mr = zone_init_allocate_va(0, meta_size + bits_size, | |
8281 | ZIA_PERMANENT | ZIA_RANDOM); | |
8282 | mr.min_address += ZONE_GUARD_SIZE; | |
8283 | mr.max_address -= ZONE_GUARD_SIZE; | |
8284 | br.max_address = mr.max_address; | |
8285 | mr.max_address -= bits_size; | |
8286 | br.min_address = mr.max_address; | |
d9a64523 | 8287 | |
d9a64523 | 8288 | #if DEBUG || DEVELOPMENT |
c3c9b80d A |
8289 | printf("zone_init: metadata %p:%p (%zuK)\n", |
8290 | (void *)mr.min_address, (void *)mr.max_address, | |
8291 | (size_t)zone_range_size(&mr) >> 10); | |
8292 | printf("zone_init: metabits %p:%p (%zuK)\n", | |
8293 | (void *)br.min_address, (void *)br.max_address, | |
8294 | (size_t)zone_range_size(&br) >> 10); | |
8295 | #endif /* DEBUG || DEVELOPMENT */ | |
8296 | } else { | |
8297 | vm_size_t size, alloc_size = meta_size; | |
8298 | vm_offset_t base; | |
8299 | bool bits_in_middle = true; | |
d9a64523 | 8300 | |
c3c9b80d A |
8301 | if (hend - hstart - 2 * ZONE_GUARD_SIZE < bits_size) { |
8302 | alloc_size += bits_size; | |
8303 | bits_in_middle = false; | |
8304 | } | |
d9a64523 | 8305 | |
c3c9b80d A |
8306 | mr = zone_init_allocate_va(0, alloc_size, ZIA_RANDOM); |
8307 | ||
8308 | base = mr.min_address; | |
8309 | size = ZONE_GUARD_SIZE + hstart + ZONE_GUARD_SIZE; | |
8310 | if (bits_in_middle) { | |
8311 | size += bits_size; | |
8312 | br.min_address = base + ZONE_GUARD_SIZE + hstart; | |
8313 | br.max_address = br.min_address + bits_size; | |
8314 | } | |
8315 | zone_init_allocate_va(base, size, ZIA_PERMANENT | ZIA_REPLACE); | |
d9a64523 | 8316 | |
c3c9b80d A |
8317 | base += size; |
8318 | size = mr.min_address + hend - base; | |
8319 | kmem_free(kernel_map, base, size); | |
f427ee49 | 8320 | |
c3c9b80d A |
8321 | base = mr.min_address + hend; |
8322 | size = mr.max_address - base; | |
8323 | zone_init_allocate_va(base, size, ZIA_PERMANENT | ZIA_REPLACE); | |
d9a64523 | 8324 | |
c3c9b80d A |
8325 | mr.min_address += ZONE_GUARD_SIZE; |
8326 | mr.max_address -= ZONE_GUARD_SIZE; | |
8327 | if (!bits_in_middle) { | |
8328 | br.max_address = mr.max_address; | |
8329 | mr.max_address -= bits_size; | |
8330 | br.min_address = mr.max_address; | |
d9a64523 | 8331 | } |
c3c9b80d A |
8332 | |
8333 | #if DEBUG || DEVELOPMENT | |
8334 | printf("zone_init: metadata0 %p:%p (%zuK)\n", | |
8335 | (void *)mr.min_address, (void *)(mr.min_address + hstart), | |
8336 | (size_t)hstart >> 10); | |
8337 | printf("zone_init: metadata1 %p:%p (%zuK)\n", | |
8338 | (void *)(mr.min_address + hend), (void *)mr.max_address, | |
8339 | (size_t)(zone_range_size(&mr) - hend) >> 10); | |
8340 | printf("zone_init: metabits %p:%p (%zuK)\n", | |
8341 | (void *)br.min_address, (void *)br.max_address, | |
8342 | (size_t)zone_range_size(&br) >> 10); | |
8343 | #endif /* DEBUG || DEVELOPMENT */ | |
d9a64523 A |
8344 | } |
8345 | ||
c3c9b80d A |
8346 | br.min_address = (br.min_address + ZBA_CHUNK_SIZE - 1) & -ZBA_CHUNK_SIZE; |
8347 | br.max_address = br.max_address & -ZBA_CHUNK_SIZE; | |
8348 | ||
8349 | zone_info.zi_meta_range = mr; | |
8350 | zone_info.zi_bits_range = br; | |
8351 | ||
8352 | /* | |
8353 | * Migrate the original static metadata into its new location. | |
8354 | */ | |
8355 | zone_info.zi_meta_base = (struct zone_page_metadata *)mr.min_address - | |
8356 | zone_pva_from_addr(r0.min_address).packed_address; | |
8357 | foreign_base = zone_info.zi_map_range[ZONE_ADDR_FOREIGN].min_address; | |
8358 | zone_meta_populate(foreign_base, zone_foreign_size()); | |
8359 | memcpy(zone_meta_from_addr(foreign_base), | |
8360 | zone_foreign_meta_array_startup, | |
8361 | atop(zone_foreign_size()) * sizeof(struct zone_page_metadata)); | |
8362 | ||
8363 | zba_populate(0); | |
8364 | memcpy(zba_base_header(), zba_chunk_startup, | |
8365 | sizeof(zba_chunk_startup)); | |
8366 | } | |
8367 | ||
8368 | /* Global initialization of Zone Allocator. | |
8369 | * Runs after zone_bootstrap. | |
8370 | */ | |
8371 | __startup_func | |
8372 | static void | |
8373 | zone_init(void) | |
8374 | { | |
8375 | vm_size_t zone_map_size; | |
8376 | vm_size_t remaining_size; | |
8377 | vm_offset_t submap_min = 0; | |
8378 | uint64_t denom = 0; | |
8379 | uint64_t submap_ratios[Z_SUBMAP_IDX_COUNT] = { | |
8380 | #ifdef __LP64__ | |
8381 | [Z_SUBMAP_IDX_VA_RESTRICTED] = 20, | |
8382 | #else | |
8383 | [Z_SUBMAP_IDX_VA_RESERVE] = 10, | |
8384 | #endif /* defined(__LP64__) */ | |
8385 | [Z_SUBMAP_IDX_GENERAL] = 40, | |
8386 | [Z_SUBMAP_IDX_BAG_OF_BYTES] = 40, | |
8387 | }; | |
8388 | ||
8389 | if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) { | |
8390 | zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES; | |
8391 | } else { | |
8392 | zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL; | |
d9a64523 | 8393 | } |
c3c9b80d | 8394 | zone_phys_mapped_max_pages = (uint32_t)atop(zone_phys_size_max()); |
d9a64523 | 8395 | |
c3c9b80d A |
8396 | for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) { |
8397 | #if DEBUG || DEVELOPMENT | |
8398 | char submap_name[1 + sizeof("submap")]; | |
8399 | snprintf(submap_name, sizeof(submap_name), "submap%d", idx); | |
8400 | PE_parse_boot_argn(submap_name, &submap_ratios[idx], sizeof(uint64_t)); | |
8401 | #endif | |
8402 | denom += submap_ratios[idx]; | |
d9a64523 A |
8403 | } |
8404 | ||
c3c9b80d A |
8405 | #if __LP64__ |
8406 | zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64; | |
8407 | #else | |
8408 | zone_map_size = ptoa(zone_phys_mapped_max_pages * | |
8409 | (denom + submap_ratios[Z_SUBMAP_IDX_VA_RESERVE]) / denom); | |
8410 | #endif | |
d9a64523 | 8411 | |
c3c9b80d A |
8412 | remaining_size = zone_map_size - |
8413 | ZONE_GUARD_SIZE * (zone_last_submap_idx + 1); | |
d9a64523 A |
8414 | |
8415 | /* | |
c3c9b80d A |
8416 | * And now allocate the various pieces of VA and submaps. |
8417 | * | |
8418 | * Make a first allocation of contiguous VA, that we'll deallocate, | |
8419 | * and we'll carve-out memory in that range again linearly. | |
8420 | * The kernel is still single threaded at this stage. | 
d9a64523 | 8421 | */ |
d9a64523 | 8422 | |
c3c9b80d A |
8423 | struct zone_map_range *map_range = |
8424 | &zone_info.zi_map_range[ZONE_ADDR_NATIVE]; | |
d9a64523 | 8425 | |
c3c9b80d A |
8426 | *map_range = zone_init_allocate_va(0, zone_map_size, ZIA_NONE); |
8427 | submap_min = map_range->min_address; | |
d9a64523 | 8428 | |
c3c9b80d A |
8429 | /* |
8430 | * Allocate the submaps | |
8431 | */ | |
8432 | for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) { | |
8433 | zone_submap_init(&submap_min, idx, submap_ratios[idx], | |
8434 | &denom, &remaining_size, ZONE_GUARD_SIZE); | |
8435 | } | |
d9a64523 | 8436 | |
c3c9b80d | 8437 | assert(submap_min == map_range->max_address); |
d9a64523 | 8438 | |
c3c9b80d | 8439 | zone_metadata_init(); |
5ba3f43e | 8440 | |
c3c9b80d A |
8441 | #if VM_MAX_TAG_ZONES |
8442 | if (zone_tagging_on) { | |
8443 | zone_tagging_init(zone_map_size); | |
8444 | } | |
8445 | #endif | |
8446 | #if CONFIG_GZALLOC | |
8447 | gzalloc_init(zone_map_size); | |
8448 | #endif | |
5ba3f43e | 8449 | |
c3c9b80d A |
8450 | zone_create_flags_t kma_flags = ZC_NOCACHING | |
8451 | ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT | | |
8452 | ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE; | |
5ba3f43e | 8453 | |
c3c9b80d A |
8454 | (void)zone_create_ext("vm.permanent", 1, kma_flags, |
8455 | ZONE_ID_PERMANENT, ^(zone_t z){ | |
8456 | z->z_permanent = true; | |
8457 | z->z_elem_size = 1; | |
8458 | #if defined(__LP64__) | |
8459 | z->z_submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED; | |
8460 | #endif | |
8461 | }); | |
8462 | (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU, | |
8463 | ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){ | |
8464 | z->z_permanent = true; | |
8465 | z->z_elem_size = 1; | |
8466 | #if defined(__LP64__) | |
8467 | z->z_submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED; | |
8468 | #endif | |
8469 | }); | |
5ba3f43e | 8470 | |
c3c9b80d A |
8471 | /* |
8472 | * Now migrate the startup statistics into their final storage. | |
8473 | */ | |
8474 | int cpu = cpu_number(); | |
f427ee49 | 8475 | zone_index_foreach(idx) { |
c3c9b80d | 8476 | zone_t tz = &zone_array[idx]; |
f427ee49 | 8477 | |
c3c9b80d A |
8478 | if (tz->z_stats == __zpcpu_mangle_for_boot(&zone_stats_startup[idx])) { |
8479 | zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats); | |
8480 | ||
8481 | *zpercpu_get_cpu(zs, cpu) = *zpercpu_get_cpu(tz->z_stats, cpu); | |
8482 | tz->z_stats = zs; | |
8483 | #if ZONE_ENABLE_LOGGING | |
8484 | if (tz->zone_logging && !tz->zlog_btlog) { | |
8485 | zone_enable_logging(tz); | |
8486 | } | |
8487 | #endif /* ZONE_ENABLE_LOGGING */ | |
0a7de745 | 8488 | } |
0a7de745 | 8489 | } |
5ba3f43e | 8490 | |
c3c9b80d A |
8491 | #if CONFIG_ZLEAKS |
8492 | /* | |
8493 | * Initialize the zone leak monitor | |
8494 | */ | |
8495 | zleak_init(zone_map_size); | |
8496 | #endif /* CONFIG_ZLEAKS */ | |
5ba3f43e | 8497 | |
c3c9b80d A |
8498 | #if VM_MAX_TAG_ZONES |
8499 | if (zone_tagging_on) { | |
8500 | vm_allocation_zones_init(); | |
8501 | } | |
8502 | #endif | |
5ba3f43e | 8503 | } |
c3c9b80d | 8504 | STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init); |
5ba3f43e | 8505 | |
c3c9b80d A |
8506 | __startup_func |
8507 | static void | |
8508 | zone_cache_bootstrap(void) | |
8509 | { | |
8510 | zone_t magzone; | |
d9a64523 | 8511 | |
c3c9b80d A |
8512 | magzone = zone_create("zcc_magazine_zone", sizeof(struct zone_magazine) + |
8513 | zc_mag_size() * sizeof(zone_element_t), | |
8514 | ZC_NOGZALLOC | ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE | | |
8515 | ZC_SEQUESTER | ZC_CACHING | ZC_ZFREE_CLEARMEM); | |
8516 | magzone->z_elems_rsv = (uint16_t)(2 * zpercpu_count()); | |
5ba3f43e | 8517 | |
c3c9b80d | 8518 | os_atomic_store(&zc_magazine_zone, magzone, compiler_acq_rel); |
316670eb | 8519 | |
c3c9b80d A |
8520 | /* |
8521 | * Now that we are initialized, we can enable zone caching for zones that | |
8522 | * were made before zcache_bootstrap() was called. | |
8523 | * | |
8524 | * The system is still single threaded so we don't need to take the lock. | |
8525 | */ | |
8526 | zone_index_foreach(i) { | |
8527 | zone_t z = &zone_array[i]; | |
8528 | if (z->z_pcpu_cache) { | |
8529 | z->z_pcpu_cache = NULL; | |
8530 | zone_enable_caching(z); | |
8531 | } | |
d9a64523 | 8532 | } |
316670eb | 8533 | } |
c3c9b80d | 8534 | STARTUP(ZALLOC, STARTUP_RANK_FOURTH, zone_cache_bootstrap); |
316670eb | 8535 | |
c3c9b80d A |
8536 | void |
8537 | zalloc_first_proc_made(void) | |
39236c6e | 8538 | { |
c3c9b80d A |
8539 | zone_caching_disabled = 0; |
8540 | } | |
f427ee49 | 8541 | |
c3c9b80d A |
8542 | __startup_func |
8543 | vm_offset_t | |
8544 | zone_foreign_mem_init(vm_size_t size) | |
8545 | { | |
8546 | vm_offset_t mem; | |
8547 | ||
8548 | if (atop(size) > ZONE_FOREIGN_META_INLINE_COUNT) { | |
8549 | panic("ZONE_FOREIGN_META_INLINE_COUNT has become too small: " | |
8550 | "%d > %d", (int)atop(size), ZONE_FOREIGN_META_INLINE_COUNT); | |
39236c6e | 8551 | } |
39236c6e | 8552 | |
c3c9b80d A |
8553 | mem = (vm_offset_t)pmap_steal_memory(size); |
8554 | ||
8555 | zone_info.zi_meta_base = zone_foreign_meta_array_startup - | |
8556 | zone_pva_from_addr(mem).packed_address; | |
8557 | zone_info.zi_map_range[ZONE_ADDR_FOREIGN].min_address = mem; | |
8558 | zone_info.zi_map_range[ZONE_ADDR_FOREIGN].max_address = mem + size; | |
8559 | ||
8560 | zone_info.zi_bits_range = (struct zone_map_range){ | |
8561 | .min_address = (vm_offset_t)zba_chunk_startup, | |
8562 | .max_address = (vm_offset_t)zba_chunk_startup + | |
8563 | sizeof(zba_chunk_startup), | |
8564 | }; | |
8565 | zba_init_chunk(0); | |
8566 | ||
8567 | return mem; | |
f427ee49 | 8568 | } |
1c79356b | 8569 | |
c3c9b80d | 8570 | #endif /* !ZALLOC_TEST */ |
f427ee49 A |
8571 | #pragma mark - tests |
8572 | #if DEBUG || DEVELOPMENT | |
1c79356b | 8573 | |
f427ee49 A |
8574 | /* |
8575 | * Used for sysctl kern.run_zone_test which is not thread-safe. Ensure only one | |
8576 | * thread goes through at a time. Or we can end up with multiple test zones (if | |
8577 | * a second zinit() comes through before zdestroy()), which could lead us to | |
8578 | * run out of zones. | |
8579 | */ | |
c3c9b80d | 8580 | static SIMPLE_LOCK_DECLARE(zone_test_lock, 0); |
f427ee49 A |
8581 | static boolean_t zone_test_running = FALSE; |
8582 | static zone_t test_zone_ptr = NULL; | |
1c79356b | 8583 | |
f427ee49 | 8584 | static uintptr_t * |
c3c9b80d | 8585 | zone_copy_allocations(zone_t z, uintptr_t *elems, zone_pva_t page_index) |
f427ee49 | 8586 | { |
c3c9b80d A |
8587 | vm_offset_t elem_size = zone_elem_size(z); |
8588 | vm_offset_t base; | |
f427ee49 | 8589 | struct zone_page_metadata *meta; |
39037602 | 8590 | |
f427ee49 | 8591 | while (!zone_pva_is_null(page_index)) { |
c3c9b80d A |
8592 | base = zone_pva_to_addr(page_index); |
8593 | meta = zone_pva_to_meta(page_index); | |
39037602 | 8594 | |
c3c9b80d A |
8595 | if (meta->zm_inline_bitmap) { |
8596 | for (size_t i = 0; i < meta->zm_chunk_len; i++) { | |
8597 | uint32_t map = meta[i].zm_bitmap; | |
39037602 | 8598 | |
c3c9b80d A |
8599 | for (; map; map &= map - 1) { |
8600 | *elems++ = INSTANCE_PUT(base + | |
8601 | elem_size * __builtin_clz(map)); | |
8602 | } | |
8603 | base += elem_size * 32; | |
8604 | } | |
8605 | } else { | |
8606 | uint32_t order = zba_bits_ref_order(meta->zm_bitmap); | |
8607 | bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); | |
8608 | for (size_t i = 0; i < (1u << order); i++) { | |
8609 | uint64_t map = bits[i]; | |
8610 | ||
8611 | for (; map; map &= map - 1) { | |
8612 | *elems++ = INSTANCE_PUT(base + | |
8613 | elem_size * __builtin_clzll(map)); | |
8614 | } | |
8615 | base += elem_size * 64; | |
0a7de745 | 8616 | } |
0a7de745 | 8617 | } |
0a7de745 | 8618 | |
f427ee49 A |
8619 | page_index = meta->zm_page_next; |
8620 | } | |
0a7de745 | 8621 | return elems; |
39037602 A |
8622 | } |
8623 | ||
8624 | kern_return_t | |
8625 | zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon) | |
8626 | { | |
f427ee49 A |
8627 | uintptr_t zbt[MAX_ZTRACE_DEPTH]; |
8628 | zone_t zone = NULL; | |
0a7de745 A |
8629 | uintptr_t * array; |
8630 | uintptr_t * next; | |
8631 | uintptr_t element, bt; | |
8632 | uint32_t idx, count, found; | |
8633 | uint32_t btidx, btcount, nobtcount, btfound; | |
8634 | uint32_t elemSize; | |
c3c9b80d | 8635 | size_t maxElems; |
5ba3f43e | 8636 | kern_return_t kr; |
39037602 | 8637 | |
c3c9b80d A |
8638 | zone_foreach(z) { |
8639 | if (!strncmp(zoneName, z->z_name, nameLen)) { | |
8640 | zone = z; | |
0a7de745 A |
8641 | break; |
8642 | } | |
8643 | } | |
f427ee49 | 8644 | if (zone == NULL) { |
0a7de745 A |
8645 | return KERN_INVALID_NAME; |
8646 | } | |
0a7de745 | 8647 | |
c3c9b80d A |
8648 | elemSize = (uint32_t)zone_elem_size(zone); |
8649 | maxElems = (zone->z_elems_avail + 1) & ~1ul; | |
0a7de745 | 8650 | |
c3c9b80d | 8651 | if ((ptoa(zone->z_percpu ? 1 : zone->z_chunk_pages) % elemSize) && |
f427ee49 | 8652 | !zone_leaks_scan_enable) { |
0a7de745 A |
8653 | return KERN_INVALID_CAPABILITY; |
8654 | } | |
8655 | ||
8656 | kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array, | |
c3c9b80d | 8657 | maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG); |
0a7de745 A |
8658 | if (KERN_SUCCESS != kr) { |
8659 | return kr; | |
8660 | } | |
8661 | ||
c3c9b80d | 8662 | zone_lock(zone); |
0a7de745 A |
8663 | |
8664 | next = array; | |
c3c9b80d A |
8665 | next = zone_copy_allocations(zone, next, zone->z_pageq_partial); |
8666 | next = zone_copy_allocations(zone, next, zone->z_pageq_full); | |
0a7de745 A |
8667 | count = (uint32_t)(next - array); |
8668 | ||
c3c9b80d | 8669 | zone_unlock(zone); |
0a7de745 | 8670 | |
c3c9b80d | 8671 | zone_leaks_scan(array, count, (uint32_t)zone_elem_size(zone), &found); |
0a7de745 A |
8672 | assert(found <= count); |
8673 | ||
8674 | for (idx = 0; idx < count; idx++) { | |
8675 | element = array[idx]; | |
8676 | if (kInstanceFlagReferenced & element) { | |
8677 | continue; | |
8678 | } | |
8679 | element = INSTANCE_PUT(element) & ~kInstanceFlags; | |
8680 | } | |
8681 | ||
f427ee49 | 8682 | #if ZONE_ENABLE_LOGGING |
0a7de745 A |
8683 | if (zone->zlog_btlog && !corruption_debug_flag) { |
8684 | // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found | |
8685 | btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon); | |
8686 | } | |
f427ee49 | 8687 | #endif /* ZONE_ENABLE_LOGGING */ |
0a7de745 A |
8688 | |
8689 | for (nobtcount = idx = 0; idx < count; idx++) { | |
8690 | element = array[idx]; | |
8691 | if (!element) { | |
8692 | continue; | |
8693 | } | |
8694 | if (kInstanceFlagReferenced & element) { | |
8695 | continue; | |
8696 | } | |
8697 | element = INSTANCE_PUT(element) & ~kInstanceFlags; | |
8698 | ||
8699 | // see if we can find any backtrace left in the element | |
f427ee49 | 8700 | btcount = (typeof(btcount))(zone_elem_size(zone) / sizeof(uintptr_t)); |
0a7de745 A |
8701 | if (btcount >= MAX_ZTRACE_DEPTH) { |
8702 | btcount = MAX_ZTRACE_DEPTH - 1; | |
8703 | } | |
8704 | for (btfound = btidx = 0; btidx < btcount; btidx++) { | |
8705 | bt = ((uintptr_t *)element)[btcount - 1 - btidx]; | |
8706 | if (!VM_KERNEL_IS_SLID(bt)) { | |
8707 | break; | |
8708 | } | |
8709 | zbt[btfound++] = bt; | |
8710 | } | |
8711 | if (btfound) { | |
8712 | (*proc)(refCon, 1, elemSize, &zbt[0], btfound); | |
8713 | } else { | |
8714 | nobtcount++; | |
8715 | } | |
8716 | } | |
8717 | if (nobtcount) { | |
8718 | // fake backtrace when we found nothing | |
8719 | zbt[0] = (uintptr_t) &zalloc; | |
8720 | (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1); | |
8721 | } | |
8722 | ||
8723 | kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t)); | |
8724 | ||
8725 | return KERN_SUCCESS; | |
1c79356b A |
8726 | } |
8727 | ||
5ba3f43e A |
boolean_t
run_zone_test(void)
{
	unsigned int i = 0, max_iter = 5;
	void * test_ptr;
	zone_t test_zone;
	zone_t test_pcpu_zone;
	kern_return_t kr;

	simple_lock(&zone_test_lock, &zone_locks_grp);
	if (!zone_test_running) {
		zone_test_running = TRUE;
	} else {
		simple_unlock(&zone_test_lock);
		printf("run_zone_test: Test already running.\n");
		return FALSE;
	}
	simple_unlock(&zone_test_lock);

	printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");

	/* zinit() and zdestroy() a zone with the same name a bunch of times, verifying that we get back the same zone each time */
	do {
		test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
		if (test_zone == NULL) {
			printf("run_zone_test: zinit() failed\n");
			return FALSE;
		}

#if KASAN_ZALLOC
		if (test_zone_ptr == NULL && test_zone->z_elems_free != 0) {
#else
		if (test_zone->z_elems_free != 0) {
#endif
			printf("run_zone_test: free count is not zero\n");
			return FALSE;
		}

		if (test_zone_ptr == NULL) {
			/* Stash the zone pointer returned on the first zinit */
			printf("run_zone_test: zone created for the first time\n");
			test_zone_ptr = test_zone;
		} else if (test_zone != test_zone_ptr) {
			printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
			return FALSE;
		}

		test_ptr = zalloc(test_zone);
		if (test_ptr == NULL) {
			printf("run_zone_test: zalloc() failed\n");
			return FALSE;
		}
		zfree(test_zone, test_ptr);

		zdestroy(test_zone);
		i++;

		printf("run_zone_test: Iteration %d successful\n", i);
	} while (i < max_iter);

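	/*
	 * Editorial note on the block below: judging from the assertions that
	 * follow, the expectation is that after zone_gc() the physical pages
	 * backing freed chunks are unmapped (pmap_find_phys() returns 0) while
	 * the sequestered virtual ranges stay owned by the zones on z_pageq_va,
	 * and that both zones remain usable afterwards.
	 */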
	/* test Z_VA_SEQUESTER */
	if (zsecurity_options & ZSECURITY_OPTIONS_SEQUESTER) {
		int idx, num_allocs = 8;
		vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
		void *allocs[num_allocs];
		void **allocs_pcpu;
		vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);

		test_zone = zone_create("test_zone_sysctl", elem_size,
		    ZC_DESTRUCTIBLE | ZC_SEQUESTER);
		assert(test_zone);

		test_pcpu_zone = zone_create("test_zone_sysctl.pcpu", sizeof(uint64_t),
		    ZC_DESTRUCTIBLE | ZC_SEQUESTER | ZC_PERCPU);
		assert(test_pcpu_zone);

		for (idx = 0; idx < num_allocs; idx++) {
			allocs[idx] = zalloc(test_zone);
			assert(NULL != allocs[idx]);
			printf("alloc[%d] %p\n", idx, allocs[idx]);
		}
		for (idx = 0; idx < num_allocs; idx++) {
			zfree(test_zone, allocs[idx]);
		}
		assert(!zone_pva_is_null(test_zone->z_pageq_empty));

		kr = kernel_memory_allocate(kernel_map,
		    (vm_address_t *)&allocs_pcpu, PAGE_SIZE,
		    0, KMA_ZERO | KMA_KOBJECT, VM_KERN_MEMORY_DIAG);
		assert(kr == KERN_SUCCESS);

		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
			allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone,
			    Z_WAITOK | Z_ZERO);
			assert(NULL != allocs_pcpu[idx]);
		}
		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
			zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]);
		}
		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty));

		printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n",
		    vm_page_wire_count, vm_page_free_count,
		    100L * phys_pages / zone_phys_mapped_max_pages);
		zone_gc(ZONE_GC_DRAIN);
		printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n",
		    vm_page_wire_count, vm_page_free_count,
		    100L * phys_pages / zone_phys_mapped_max_pages);

		unsigned int allva = 0;

		zone_foreach(z) {
			zone_lock(z);
			allva += z->z_wired_cur;
			if (zone_pva_is_null(z->z_pageq_va)) {
				zone_unlock(z);
				continue;
			}
			unsigned count = 0;
			uint64_t size;
			zone_pva_t pg = z->z_pageq_va;
			struct zone_page_metadata *page_meta;
			while (pg.packed_address) {
				page_meta = zone_pva_to_meta(pg);
				count += z->z_percpu ? 1 : z->z_chunk_pages;
				if (page_meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
					count -= page_meta->zm_page_index;
				}
				pg = page_meta->zm_page_next;
			}
			assert(z->z_wired_cur + count == z->z_va_cur);
			size = zone_size_wired(z);
			if (!size) {
				size = 1;
			}
			printf("%s%s: seq %d, res %d, %qd %%\n",
			    zone_heap_name(z), z->z_name, z->z_va_cur - z->z_wired_cur,
			    z->z_wired_cur, zone_size_allocated(z) * 100ULL / size);
			zone_unlock(z);
		}

		printf("total va: %d\n", allva);

		assert(zone_pva_is_null(test_zone->z_pageq_empty));
		assert(zone_pva_is_null(test_zone->z_pageq_partial));
		assert(!zone_pva_is_null(test_zone->z_pageq_va));
		assert(zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
		assert(zone_pva_is_null(test_pcpu_zone->z_pageq_partial));
		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_va));

		for (idx = 0; idx < num_allocs; idx++) {
			assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
		}

		/* make sure the zone is still usable after a GC */

		for (idx = 0; idx < num_allocs; idx++) {
			allocs[idx] = zalloc(test_zone);
			assert(allocs[idx]);
			printf("alloc[%d] %p\n", idx, allocs[idx]);
		}
		assert(zone_pva_is_null(test_zone->z_pageq_va));
		assert(test_zone->z_wired_cur == test_zone->z_va_cur);
		for (idx = 0; idx < num_allocs; idx++) {
			zfree(test_zone, allocs[idx]);
		}

		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
			allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone,
			    Z_WAITOK | Z_ZERO);
			assert(NULL != allocs_pcpu[idx]);
		}
		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
			zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]);
		}

		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
		assert(zone_pva_is_null(test_pcpu_zone->z_pageq_va));

		kmem_free(kernel_map, (vm_address_t)allocs_pcpu, PAGE_SIZE);

		zdestroy(test_zone);
		zdestroy(test_pcpu_zone);
	} else {
		printf("run_zone_test: skipping sequester test (not enabled)\n");
	}

	printf("run_zone_test: Test passed\n");

	simple_lock(&zone_test_lock, &zone_locks_grp);
	zone_test_running = FALSE;
	simple_unlock(&zone_test_lock);

	return TRUE;
}

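/*
 * Illustrative only: run_zone_test() is meant to be kicked off from a
 * development/debug trigger (for instance a sysctl handler on the BSD side).
 * The handler below is a hypothetical sketch of such a hookup, not the
 * actual wiring used by the kernel.
 */
#if 0 /* not compiled; example sketch */
static int
sysctl_run_zone_test SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int ret = run_zone_test() ? 1 : 0;
	return SYSCTL_OUT(req, &ret, sizeof(ret));
}
SYSCTL_PROC(_kern, OID_AUTO, run_zone_test,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED,
    0, 0, &sysctl_run_zone_test, "I", "Run the zalloc self-test");
#endif
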
/*
 * Routines to test that zone garbage collection and zone replenish threads
 * running at the same time don't cause problems.
 */

void
zone_gc_replenish_test(void)
{
	zone_gc(ZONE_GC_DRAIN);
}


void
zone_alloc_replenish_test(void)
{
	zone_t z = NULL;
	struct data { struct data *next; } *node, *list = NULL;

	/*
	 * Find a zone that has a replenish thread and whose elements are large
	 * enough to hold a list node.
	 */
	zone_index_foreach(i) {
		if (zone_array[i].z_replenishes &&
		    zone_elem_size(&zone_array[i]) >= sizeof(struct data)) {
			z = &zone_array[i];
			break;
		}
	}
	if (z == NULL) {
		printf("Couldn't find a replenish zone\n");
		return;
	}

	for (uint32_t i = 0; i < 2000; ++i) { /* something big enough to go past replenishment */
		node = zalloc(z);
		node->next = list;
		list = node;
	}

	/*
	 * Release the memory we allocated.
	 */
	while (list != NULL) {
		node = list;
		list = list->next;
		zfree(z, node);
	}
}
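
/*
 * Illustrative only: the two routines above are intended to be driven
 * concurrently so that zone_gc() races against the replenish path.  The
 * sketch below shows one hypothetical way to exercise them from two kernel
 * threads; it is not the wiring actually used by the kernel.
 */
#if 0 /* not compiled; example sketch */
static void
zone_gc_replenish_thread(void *arg, wait_result_t wres)
{
#pragma unused(arg, wres)
	for (int i = 0; i < 100; i++) {
		zone_gc_replenish_test();
	}
}

static void
zone_alloc_replenish_thread(void *arg, wait_result_t wres)
{
#pragma unused(arg, wres)
	for (int i = 0; i < 100; i++) {
		zone_alloc_replenish_test();
	}
}

static void
zone_replenish_race_test(void)
{
	thread_t gc_thread, alloc_thread;

	/* kernel_thread_start() returns a +1 reference on the new thread */
	if (kernel_thread_start(zone_gc_replenish_thread, NULL, &gc_thread) == KERN_SUCCESS) {
		thread_deallocate(gc_thread);
	}
	if (kernel_thread_start(zone_alloc_replenish_thread, NULL, &alloc_thread) == KERN_SUCCESS) {
		thread_deallocate(alloc_thread);
	}
}
#endif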

#endif /* DEBUG || DEVELOPMENT */