1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65
66 #define ZALLOC_ALLOW_DEPRECATED 1
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach/vm_map.h>
74 #include <mach/sdt.h>
75
76 #include <kern/bits.h>
77 #include <kern/startup.h>
78 #include <kern/kern_types.h>
79 #include <kern/assert.h>
80 #include <kern/backtrace.h>
81 #include <kern/host.h>
82 #include <kern/macro_help.h>
83 #include <kern/sched.h>
84 #include <kern/locks.h>
85 #include <kern/sched_prim.h>
86 #include <kern/misc_protos.h>
87 #include <kern/thread_call.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/kalloc.h>
90
91 #include <prng/random.h>
92
93 #include <vm/pmap.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
98
99 #include <pexpert/pexpert.h>
100
101 #include <machine/machparam.h>
102 #include <machine/machine_routines.h> /* ml_cpu_get_info */
103
104 #include <os/atomic.h>
105
106 #include <libkern/OSDebug.h>
107 #include <libkern/OSAtomic.h>
108 #include <libkern/section_keywords.h>
109 #include <sys/kdebug.h>
110
111 #include <san/kasan.h>
112
113 #if KASAN_ZALLOC
114 #define ZONE_ENABLE_LOGGING 0
115 #elif DEBUG || DEVELOPMENT
116 #define ZONE_ENABLE_LOGGING 1
117 #else
118 #define ZONE_ENABLE_LOGGING 0
119 #endif
120
121 extern void vm_pageout_garbage_collect(int collect);
122
123 /* Returns pid of the task with the largest number of VM map entries. */
124 extern pid_t find_largest_process_vm_map_entries(void);
125
126 /*
127 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
128 * For any other pid we try to kill that process synchronously.
129 */
130 extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
131
132 extern zone_t vm_map_entry_zone;
133 extern zone_t vm_object_zone;
134 extern vm_offset_t kmapoff_kaddr;
135 extern unsigned int kmapoff_pgcnt;
136 extern unsigned int stack_total;
137 extern unsigned long long stack_allocs;
138
139 /*
140 * The max # of elements in a chunk should fit into
141 * zone_page_metadata.free_count (uint16_t).
142 *
143 * Update this if the type of free_count changes.
144 */
145 #define ZONE_CHUNK_MAXELEMENTS (UINT16_MAX)
146
147 #define ZONE_PAGECOUNT_BITS 14
148
149 /* Zone elements must fit both a next pointer and a backup pointer */
150 #define ZONE_MIN_ELEM_SIZE (2 * sizeof(vm_offset_t))
151 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
152
153 /* per-cpu zones are special because of counters */
154 #define ZONE_MIN_PCPU_ELEM_SIZE (1 * sizeof(vm_offset_t))
155
156 struct zone_map_range {
157 vm_offset_t min_address;
158 vm_offset_t max_address;
159 };
160
161 struct zone_page_metadata {
162 /* The index of the zone this metadata page belongs to */
163 zone_id_t zm_index;
164
165 /*
166 * zm_secondary_page == 0: number of pages in this run
167 * zm_secondary_page == 1: offset to the chunk start
168 */
169 uint16_t zm_page_count : ZONE_PAGECOUNT_BITS;
170
171 /* zm_percpu: chunk backs a per-cpu allocation; zm_secondary_page: non-first page of a chunk run */
172 uint16_t zm_percpu : 1;
173 uint16_t zm_secondary_page : 1;
174
175 /*
176 * The start of the freelist can be maintained as a 16-bit
177 * offset instead of a pointer because the free elements would
178 * be at max ZONE_MAX_ALLOC_SIZE bytes away from the start
179 * of the allocation chunk.
180 *
181 * Offset from start of the allocation chunk to free element
182 * list head.
183 */
184 uint16_t zm_freelist_offs;
185
186 /*
187 * zm_secondary_page == 0: number of allocated elements in the chunk
188 * zm_secondary_page == 1: unused
189 *
190 * PAGE_METADATA_EMPTY_FREELIST in zm_freelist_offs indicates an empty freelist
191 */
192 uint16_t zm_alloc_count;
193 #define PAGE_METADATA_EMPTY_FREELIST UINT16_MAX
194
195 zone_pva_t zm_page_next;
196 zone_pva_t zm_page_prev;
197
198 /*
199 * This is only for the sake of debuggers
200 */
201 #define ZONE_FOREIGN_COOKIE 0x123456789abcdef
202 uint64_t zm_foreign_cookie[];
203 };
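/*
 * Illustrative sketch (not part of the build): how zm_freelist_offs encodes
 * the freelist head. The helper below is hypothetical; the real logic lives
 * in zone_page_meta_get_freelist() / zone_page_meta_set_freelist() later in
 * this file.
 */
#if 0 /* illustration only */
static vm_offset_t
example_freelist_head(struct zone_page_metadata *meta, vm_offset_t chunk_start)
{
	/* UINT16_MAX is the "empty" sentinel, anything else is a byte offset */
	if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) {
		return 0;
	}
	return chunk_start + meta->zm_freelist_offs;
}
#endif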
204
205
206 /* Align elements that use the zone page list to 32 byte boundaries. */
207 #define ZONE_PAGE_FIRST_OFFSET(kind) ((kind) == ZONE_ADDR_NATIVE ? 0 : 32)
208
209 static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
210
211 static __security_const_late struct {
212 struct zone_map_range zi_map_range;
213 struct zone_map_range zi_general_range;
214 struct zone_map_range zi_meta_range;
215 struct zone_map_range zi_foreign_range;
216
217 /*
218 * The metadata lives within the zi_meta_range address range.
219 *
220 * The correct formula to find a metadata index is:
221 * absolute_page_index - page_index(zi_meta_range.min_address)
222 *
223 * And then this index is used to dereference zi_meta_range.min_address
224 * as a `struct zone_page_metadata` array.
225 *
226 * To avoid doing that subtraction in the various fast-paths,
227 * zi_array_base is pre-offset by `page_index(zi_meta_range.min_address)`
228 * so that the math never has to be redone.
229 */
230 struct zone_page_metadata *zi_array_base;
231 } zone_info;
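/*
 * Illustrative sketch (not part of the build): because zi_array_base is
 * pre-offset by page_index(zi_meta_range.min_address), resolving the
 * metadata for a native zone address is a single array indexing operation.
 * The helper below is hypothetical; the real code is zone_pva_to_meta() and
 * zone_meta_from_addr() further down.
 */
#if 0 /* illustration only */
static struct zone_page_metadata *
example_native_meta_for_addr(vm_offset_t addr)
{
	/* absolute page index of addr, no subtraction needed on the fast path */
	return &zone_info.zi_array_base[(uint32_t)atop(addr)];
}
#endif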
232
233 /*
234 * The zone_locks_grp allows for collecting lock statistics.
235 * All locks are associated to this group in zinit.
236 * Look at tools/lockstat for debugging lock contention.
237 */
238 LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
239 LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
240
241 /*
242 * Exclude more than one concurrent garbage collection
243 */
244 LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
245 LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
246
247 boolean_t panic_include_zprint = FALSE;
248 mach_memory_info_t *panic_kext_memory_info = NULL;
249 vm_size_t panic_kext_memory_size = 0;
250
251 /*
252 * Protects zone_array, num_zones, num_zones_in_use, and
253 * zone_destroyed_bitmap
254 */
255 static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
256 static unsigned int num_zones_in_use;
257 unsigned int _Atomic num_zones;
258 SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
259
260 #if KASAN_ZALLOC
261 #define MAX_ZONES 566
262 #else /* !KASAN_ZALLOC */
263 #define MAX_ZONES 402
264 #endif/* !KASAN_ZALLOC */
265 struct zone zone_array[MAX_ZONES];
266
267 /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
268 static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
269
270 /* Used to keep track of destroyed slots in the zone_array */
271 static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
272
273 /* number of pages used by all zones */
274 static long _Atomic zones_phys_page_count;
275
276 /* number of zone mapped pages used by all zones */
277 static long _Atomic zones_phys_page_mapped_count;
278
279 /*
280 * Turn ZSECURITY_OPTIONS_STRICT_IOKIT_FREE off on x86 so as not
281 * to break third-party kexts that haven't yet been recompiled
282 * to use the new iokit macros.
283 */
284 #if XNU_TARGET_OS_OSX && __x86_64__
285 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT 0
286 #else
287 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT \
288 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE
289 #endif
290
291 #define ZSECURITY_DEFAULT ( \
292 ZSECURITY_OPTIONS_SEQUESTER | \
293 ZSECURITY_OPTIONS_SUBMAP_USER_DATA | \
294 ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC | \
295 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT | \
296 0)
297 TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT);
298
299 #if VM_MAX_TAG_ZONES
300 /* enable tags for zones that ask for it */
301 TUNABLE(bool, zone_tagging_on, "-zt", false);
302 #endif /* VM_MAX_TAG_ZONES */
303
304 #if DEBUG || DEVELOPMENT
305 TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false);
306 __options_decl(zalloc_debug_t, uint32_t, {
307 ZALLOC_DEBUG_ZONEGC = 0x00000001,
308 ZALLOC_DEBUG_ZCRAM = 0x00000002,
309 });
310
311 TUNABLE(zalloc_debug_t, zalloc_debug, "zalloc_debug", 0);
312 #endif /* DEBUG || DEVELOPMENT */
313 #if CONFIG_ZLEAKS
314 /* Making pointer scanning leaks detection possible for all zones */
315 TUNABLE(bool, zone_leaks_scan_enable, "-zl", false);
316 #else
317 #define zone_leaks_scan_enable false
318 #endif
319
320 /*
321 * Async allocation of zones
322 * This mechanism allows for bootstrapping an empty zone which is setup with
323 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
324 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
325 * This will prime the zone for the next use.
326 *
327 * Currently the thread_callout function (zalloc_async) will loop through all zones
328 * looking for any zone with async_pending set and do the work for it.
329 *
330 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
331 * then zalloc_noblock to an empty zone may succeed.
332 */
333 static void zalloc_async(thread_call_param_t p0, thread_call_param_t p1);
334 static thread_call_data_t call_async_alloc;
335 static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size);
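/*
 * Illustrative sketch (not part of the build) of the priming pattern
 * described above: zalloc_async() effectively performs a blocking
 * allocation followed by an immediate free, so that the zone owns at least
 * one free element the next time zalloc_noblock() is called. This is a
 * simplified sketch, not the real zalloc_async() found later in this file.
 */
#if 0 /* illustration only */
static void
example_prime_zone(zone_t z)
{
	void *elt = zalloc(z);          /* may block while fresh pages are grabbed */
	if (elt != NULL) {
		zfree(z, elt);          /* lands on the freelist: the zone is primed */
	}
}
#endif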
336
337 /*
338 * Zone Corruption Debugging
339 *
340 * We use four techniques to detect modification of a zone element
341 * after it's been freed.
342 *
343 * (1) Check the freelist next pointer for sanity.
344 * (2) Store a backup of the next pointer at the end of the element,
345 * and compare it to the primary next pointer when the element is allocated
346 * to detect corruption of the freelist due to use-after-free bugs.
347 * The backup pointer is also XORed with a per-boot random cookie.
348 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
349 * and check for that value when the element is being reused to make sure
350 * no part of the element has been modified while it was on the freelist.
351 * This will also help catch read-after-frees, as code will now dereference
352 * 0xdeadbeef instead of a valid but freed pointer.
353 * (4) If the zfree_clear_mem flag is set clear the element on free and
354 * assert that it is still clear when alloc-ed.
355 *
356 * (1) and (2) occur for every allocation and free to a zone.
357 * This is done to make it slightly more difficult for an attacker to
358 * manipulate the freelist to behave in a specific way.
359 *
360 * Poisoning (3) occurs periodically for every N frees (counted per-zone).
361 * If -zp is passed as a boot arg, poisoning occurs for every free.
362 *
363 * Zeroing (4) is done for those zones that pass the ZC_ZFREE_CLEARMEM
364 * flag on creation or if the element size is less than one cacheline.
365 *
366 * Performance slowdown is inversely proportional to the frequency of poisoning,
367 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
368 * and higher. You can expect to find a 100% reproducible bug in an average of
369 * N tries, with a standard deviation of about N, but you will want to set
370 * "-zp" to always poison every free if you are attempting to reproduce
371 * a known bug.
372 *
373 * For a more heavyweight, but finer-grained method of detecting misuse
374 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
375 *
376 * Zone Corruption Logging
377 *
378 * You can also track where corruptions come from by using the boot-arguments
379 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
380 * in this document for more implementation and usage information.
381 *
382 * Zone Leak Detection
383 *
384 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
385 * found later in this file via the showtopztrace and showz* macros in kgmacros,
386 * or use zlog without the -zc argument.
387 *
388 */
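/*
 * Illustrative sketch (not part of the build) of check (2) above: the last
 * pointer-sized word of a free element stores (next ^ cookie), so on
 * allocation the decoded backup must agree with the primary next pointer.
 * The helper below is hypothetical; the real checks live in the freelist
 * code and in backup_ptr_mismatch_panic() later in this file.
 */
#if 0 /* illustration only */
static void
example_check_backup(zone_t z, vm_offset_t elem, bool was_poisoned)
{
	vm_offset_t primary = *(vm_offset_t *)elem;
	vm_offset_t backup  = *get_backup_ptr(zone_elem_size(z), (vm_offset_t *)elem);
	uintptr_t   cookie  = was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie;

	if ((backup ^ cookie) != primary) {
		/* freelist corruption: the real code panics here */
	}
}
#endif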
389
390 #define ZP_DEFAULT_SAMPLING_FACTOR 16
391 #define ZP_DEFAULT_SCALE_FACTOR 4
392
393 /*
394 * set by zp-factor=N boot arg
395 *
396 * A zp_factor of 0 indicates zone poisoning is disabled and can also be set by
397 * passing the -no-zp boot-arg.
398 *
399 * A zp_factor of 1 indicates zone poisoning is on for all elements and can be
400 * set by passing the -zp boot-arg.
401 */
402 static TUNABLE(uint32_t, zp_factor, "zp-factor", ZP_DEFAULT_SAMPLING_FACTOR);
403
404 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
405 static TUNABLE(uint32_t, zp_scale, "zp-scale", ZP_DEFAULT_SCALE_FACTOR);
406
407 /* initialized to a per-boot random value in zp_bootstrap */
408 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_poisoned_cookie;
409 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_nopoison_cookie;
410 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_min_size;
411 static SECURITY_READ_ONLY_LATE(uint64_t) zone_phys_mapped_max;
412
413 static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
414 static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx;
415
416 static struct bool_gen zone_bool_gen;
417 static zone_t zone_find_largest(void);
418 static void zone_drop_free_elements(zone_t z);
419
420 #define submap_for_zone(z) zone_submaps[(z)->submap_idx]
421 #define MAX_SUBMAP_NAME 16
422
423 /* Globals for random boolean generator for elements in free list */
424 #define MAX_ENTROPY_PER_ZCRAM 4
425
426 #if CONFIG_ZCACHE
427 /*
428 * Specifies a single zone to enable CPU caching for.
429 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
430 */
431 static char cache_zone_name[MAX_ZONE_NAME];
432 static TUNABLE(bool, zcc_kalloc, "zcc_kalloc", false);
433
434 __header_always_inline bool
435 zone_caching_enabled(zone_t z)
436 {
437 return z->zcache.zcc_depot != NULL;
438 }
439 #else
440 __header_always_inline bool
441 zone_caching_enabled(zone_t z __unused)
442 {
443 return false;
444 }
445 #endif /* CONFIG_ZCACHE */
446
447 #pragma mark Zone metadata
448
449 __enum_closed_decl(zone_addr_kind_t, bool, {
450 ZONE_ADDR_NATIVE,
451 ZONE_ADDR_FOREIGN,
452 });
453
454 static inline zone_id_t
455 zone_index(zone_t z)
456 {
457 return (zone_id_t)(z - zone_array);
458 }
459
460 static inline bool
461 zone_has_index(zone_t z, zone_id_t zid)
462 {
463 return zone_array + zid == z;
464 }
465
466 static inline vm_size_t
467 zone_elem_count(zone_t zone, vm_size_t alloc_size, zone_addr_kind_t kind)
468 {
469 if (kind == ZONE_ADDR_NATIVE) {
470 if (zone->percpu) {
471 return PAGE_SIZE / zone_elem_size(zone);
472 }
473 return alloc_size / zone_elem_size(zone);
474 } else {
475 assert(alloc_size == PAGE_SIZE);
476 return (PAGE_SIZE - ZONE_PAGE_FIRST_OFFSET(kind)) / zone_elem_size(zone);
477 }
478 }
479
480 __abortlike
481 static void
482 zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
483 const char *kind)
484 {
485 panic("zone metadata corruption: %s (meta %p, zone %s%s)",
486 kind, meta, zone_heap_name(zone), zone->z_name);
487 }
488
489 __abortlike
490 static void
491 zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
492 {
493 panic("zone element pointer validation failed (addr: %p, zone %s%s)",
494 (void *)addr, zone_heap_name(zone), zone->z_name);
495 }
496
497 __abortlike
498 static void
499 zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
500 struct zone_page_metadata *meta)
501 {
502 panic("%p not in the expected zone %s%s (%d != %d)",
503 (void *)addr, zone_heap_name(zone), zone->z_name,
504 meta->zm_index, zone_index(zone));
505 }
506
507 __abortlike
508 static void
509 zone_page_metadata_native_queue_corruption(zone_t zone, zone_pva_t *queue)
510 {
511 panic("foreign metadata index %d enqueued in native head %p from zone %s%s",
512 queue->packed_address, queue, zone_heap_name(zone),
513 zone->z_name);
514 }
515
516 __abortlike
517 static void
518 zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
519 {
520 panic("metadata list corruption through element %p detected in zone %s%s",
521 meta, zone_heap_name(zone), zone->z_name);
522 }
523
524 __abortlike
525 static void
526 zone_page_metadata_foreign_queue_corruption(zone_t zone, zone_pva_t *queue)
527 {
528 panic("native metadata index %d enqueued in foreign head %p from zone %s%s",
529 queue->packed_address, queue, zone_heap_name(zone), zone->z_name);
530 }
531
532 __abortlike
533 static void
534 zone_page_metadata_foreign_confusion_panic(zone_t zone, vm_offset_t addr)
535 {
536 panic("manipulating foreign address %p in a native-only zone %s%s",
537 (void *)addr, zone_heap_name(zone), zone->z_name);
538 }
539
540 __abortlike __unused
541 static void
542 zone_invalid_foreign_addr_panic(zone_t zone, vm_offset_t addr)
543 {
544 panic("addr %p being freed to foreign zone %s%s not from foreign range",
545 (void *)addr, zone_heap_name(zone), zone->z_name);
546 }
547
548 __abortlike
549 static void
550 zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
551 const char *kind)
552 {
553 panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
554 zone_heap_name(zone), zone->z_name, meta);
555 }
556
557 __abortlike
558 static void
559 zone_accounting_panic(zone_t zone, const char *kind)
560 {
561 panic("accounting mismatch (%s) for zone %s%s", kind,
562 zone_heap_name(zone), zone->z_name);
563 }
564
565 __abortlike
566 static void
567 zone_nofail_panic(zone_t zone)
568 {
569 panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
570 zone_heap_name(zone), zone->z_name);
571 }
572
573 #if __arm64__
574 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
575 #define zone_range_load(r, rmin, rmax) \
576 asm("ldp %[rmin], %[rmax], [%[range]]" \
577 : [rmin] "=r"(rmin), [rmax] "=r"(rmax) \
578 : [range] "r"(r))
579 #else
580 #define zone_range_load(r, rmin, rmax) \
581 ({ rmin = (r)->min_address; rmax = (r)->max_address; })
582 #endif
583
584 __header_always_inline bool
585 zone_range_contains(const struct zone_map_range *r, vm_offset_t addr, vm_offset_t size)
586 {
587 vm_offset_t rmin, rmax;
588
589 /*
590 * The `&` is not a typo: we really expect the check to pass,
591 * so encourage the compiler to eagerly load and test without branches
592 */
593 zone_range_load(r, rmin, rmax);
594 return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
595 }
596
597 __header_always_inline vm_size_t
598 zone_range_size(const struct zone_map_range *r)
599 {
600 vm_offset_t rmin, rmax;
601
602 zone_range_load(r, rmin, rmax);
603 return rmax - rmin;
604 }
605
606 #define from_zone_map(addr, size) \
607 zone_range_contains(&zone_info.zi_map_range, (vm_offset_t)(addr), size)
608
609 #define from_general_submap(addr, size) \
610 zone_range_contains(&zone_info.zi_general_range, (vm_offset_t)(addr), size)
611
612 #define from_foreign_range(addr, size) \
613 zone_range_contains(&zone_info.zi_foreign_range, (vm_offset_t)(addr), size)
614
615 #define from_native_meta_map(addr) \
616 zone_range_contains(&zone_info.zi_meta_range, (vm_offset_t)(addr), \
617 sizeof(struct zone_page_metadata))
618
619 #define zone_addr_kind(addr, size) \
620 (from_zone_map(addr, size) ? ZONE_ADDR_NATIVE : ZONE_ADDR_FOREIGN)
621
622 __header_always_inline bool
623 zone_pva_is_null(zone_pva_t page)
624 {
625 return page.packed_address == 0;
626 }
627
628 __header_always_inline bool
629 zone_pva_is_queue(zone_pva_t page)
630 {
631 // actual kernel pages have the top bit set
632 return (int32_t)page.packed_address > 0;
633 }
634
635 __header_always_inline bool
636 zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2)
637 {
638 return pva1.packed_address == pva2.packed_address;
639 }
640
641 __header_always_inline void
642 zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv,
643 struct zone_page_metadata *meta)
644 {
645 zone_pva_t *queue_head = &((zone_pva_t *)zone_array)[queue.packed_address];
646
647 if (!zone_pva_is_equal(*queue_head, oldv)) {
648 zone_page_metadata_list_corruption(z, meta);
649 }
650 *queue_head = meta->zm_page_next;
651 }
652
653 __header_always_inline zone_pva_t
654 zone_queue_encode(zone_pva_t *headp)
655 {
656 return (zone_pva_t){ (uint32_t)(headp - (zone_pva_t *)zone_array) };
657 }
658
659 __header_always_inline zone_pva_t
660 zone_pva_from_addr(vm_address_t addr)
661 {
662 // cannot use atop() because we want to maintain the sign bit
663 return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
664 }
665
666 __header_always_inline vm_address_t
667 zone_pva_to_addr(zone_pva_t page)
668 {
669 // cause sign extension so that we end up with the right address
670 return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT;
671 }
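/*
 * Illustrative sketch (not part of the build): a zone_pva_t packs a page
 * address into 32 bits by keeping the signed page number. Kernel pointers
 * have the top bit set, so the sign extension in zone_pva_to_addr() restores
 * the original page address, while small positive values stay available to
 * encode queue heads inside zone_array.
 */
#if 0 /* illustration only */
static void
example_pva_roundtrip(vm_address_t kernel_addr)
{
	zone_pva_t pva = zone_pva_from_addr(kernel_addr);

	/* negative packed value <=> a real kernel page, round-trips exactly */
	assert(!zone_pva_is_queue(pva));
	assert(zone_pva_to_addr(pva) == trunc_page(kernel_addr));
}
#endif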
672
673 __header_always_inline struct zone_page_metadata *
674 zone_pva_to_meta(zone_pva_t page, zone_addr_kind_t kind)
675 {
676 if (kind == ZONE_ADDR_NATIVE) {
677 return &zone_info.zi_array_base[page.packed_address];
678 } else {
679 return (struct zone_page_metadata *)zone_pva_to_addr(page);
680 }
681 }
682
683 __header_always_inline zone_pva_t
684 zone_pva_from_meta(struct zone_page_metadata *meta, zone_addr_kind_t kind)
685 {
686 if (kind == ZONE_ADDR_NATIVE) {
687 uint32_t index = (uint32_t)(meta - zone_info.zi_array_base);
688 return (zone_pva_t){ index };
689 } else {
690 return zone_pva_from_addr((vm_address_t)meta);
691 }
692 }
693
694 __header_always_inline struct zone_page_metadata *
695 zone_meta_from_addr(vm_offset_t addr, zone_addr_kind_t kind)
696 {
697 if (kind == ZONE_ADDR_NATIVE) {
698 return zone_pva_to_meta(zone_pva_from_addr(addr), kind);
699 } else {
700 return (struct zone_page_metadata *)trunc_page(addr);
701 }
702 }
703
704 #define zone_native_meta_from_addr(addr) \
705 zone_meta_from_addr((vm_offset_t)(addr), ZONE_ADDR_NATIVE)
706
707 __header_always_inline vm_offset_t
708 zone_meta_to_addr(struct zone_page_metadata *meta, zone_addr_kind_t kind)
709 {
710 if (kind == ZONE_ADDR_NATIVE) {
711 return ptoa((int)(meta - zone_info.zi_array_base));
712 } else {
713 return (vm_offset_t)meta;
714 }
715 }
716
717 __header_always_inline void
718 zone_meta_queue_push(zone_t z, zone_pva_t *headp,
719 struct zone_page_metadata *meta, zone_addr_kind_t kind)
720 {
721 zone_pva_t head = *headp;
722 zone_pva_t queue_pva = zone_queue_encode(headp);
723 struct zone_page_metadata *tmp;
724
725 meta->zm_page_next = head;
726 if (!zone_pva_is_null(head)) {
727 tmp = zone_pva_to_meta(head, kind);
728 if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
729 zone_page_metadata_list_corruption(z, meta);
730 }
731 tmp->zm_page_prev = zone_pva_from_meta(meta, kind);
732 }
733 meta->zm_page_prev = queue_pva;
734 *headp = zone_pva_from_meta(meta, kind);
735 }
736
737 __header_always_inline struct zone_page_metadata *
738 zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind,
739 vm_offset_t *page_addrp)
740 {
741 zone_pva_t head = *headp;
742 struct zone_page_metadata *meta = zone_pva_to_meta(head, kind);
743 vm_offset_t page_addr = zone_pva_to_addr(head);
744 struct zone_page_metadata *tmp;
745
746 if (kind == ZONE_ADDR_NATIVE && !from_native_meta_map(meta)) {
747 zone_page_metadata_native_queue_corruption(z, headp);
748 }
749 if (kind == ZONE_ADDR_FOREIGN && from_zone_map(meta, sizeof(*meta))) {
750 zone_page_metadata_foreign_queue_corruption(z, headp);
751 }
752
753 if (!zone_pva_is_null(meta->zm_page_next)) {
754 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
755 if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
756 zone_page_metadata_list_corruption(z, meta);
757 }
758 tmp->zm_page_prev = meta->zm_page_prev;
759 }
760 *headp = meta->zm_page_next;
761
762 *page_addrp = page_addr;
763 return meta;
764 }
765
766 __header_always_inline void
767 zone_meta_requeue(zone_t z, zone_pva_t *headp,
768 struct zone_page_metadata *meta, zone_addr_kind_t kind)
769 {
770 zone_pva_t meta_pva = zone_pva_from_meta(meta, kind);
771 struct zone_page_metadata *tmp;
772
773 if (!zone_pva_is_null(meta->zm_page_next)) {
774 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
775 if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
776 zone_page_metadata_list_corruption(z, meta);
777 }
778 tmp->zm_page_prev = meta->zm_page_prev;
779 }
780 if (zone_pva_is_queue(meta->zm_page_prev)) {
781 zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
782 } else {
783 tmp = zone_pva_to_meta(meta->zm_page_prev, kind);
784 if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
785 zone_page_metadata_list_corruption(z, meta);
786 }
787 tmp->zm_page_next = meta->zm_page_next;
788 }
789
790 zone_meta_queue_push(z, headp, meta, kind);
791 }
792
793 /*
794 * Routine to populate a page backing metadata in the zone_metadata_region.
795 * Must be called without the zone lock held as it might potentially block.
796 */
797 static void
798 zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *to)
799 {
800 vm_offset_t page_addr = trunc_page(from);
801
802 for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) {
803 #if !KASAN_ZALLOC
804 /*
805 * This can race with another thread doing a populate on the same metadata
806 * page, where we see an updated pmap but unmapped KASan shadow, causing a
807 * fault in the shadow when we first access the metadata page. Avoid this
808 * by always synchronizing on the zone_metadata_region lock with KASan.
809 */
810 if (pmap_find_phys(kernel_pmap, page_addr)) {
811 continue;
812 }
813 #endif
814
815 for (;;) {
816 kern_return_t ret = KERN_SUCCESS;
817
818 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
819 lck_mtx_lock(&zone_metadata_region_lck);
820 if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
821 ret = kernel_memory_populate(kernel_map, page_addr,
822 PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
823 VM_KERN_MEMORY_OSFMK);
824 }
825 lck_mtx_unlock(&zone_metadata_region_lck);
826
827 if (ret == KERN_SUCCESS) {
828 break;
829 }
830
831 /*
832 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
833 * to bad system deadlocks, so if the allocation failed,
834 * we need to do the VM_PAGE_WAIT() outside of the lock.
835 */
836 VM_PAGE_WAIT();
837 }
838 }
839 }
840
841 static inline bool
842 zone_allocated_element_offset_is_valid(zone_t zone, vm_offset_t addr,
843 vm_offset_t page, zone_addr_kind_t kind)
844 {
845 vm_offset_t offs = addr - page - ZONE_PAGE_FIRST_OFFSET(kind);
846 vm_offset_t esize = zone_elem_size(zone);
847
848 if (esize & (esize - 1)) { /* not a power of 2 */
849 return (offs % esize) == 0;
850 } else {
851 return (offs & (esize - 1)) == 0;
852 }
853 }
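/*
 * Worked example (illustration only): for a 64-byte element, the power-of-two
 * path above reduces the modulo to a mask, so an offset is valid iff
 * (offs & 63) == 0; a 48-byte element takes the (offs % 48) == 0 path.
 */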
854
855 __attribute__((always_inline))
856 static struct zone_page_metadata *
857 zone_allocated_element_resolve(zone_t zone, vm_offset_t addr,
858 vm_offset_t *pagep, zone_addr_kind_t *kindp)
859 {
860 struct zone_page_metadata *meta;
861 zone_addr_kind_t kind;
862 vm_offset_t page;
863 vm_offset_t esize = zone_elem_size(zone);
864
865 kind = zone_addr_kind(addr, esize);
866 page = trunc_page(addr);
867 meta = zone_meta_from_addr(addr, kind);
868
869 if (kind == ZONE_ADDR_NATIVE) {
870 if (meta->zm_secondary_page) {
871 if (meta->zm_percpu) {
872 zone_invalid_element_addr_panic(zone, addr);
873 }
874 page -= ptoa(meta->zm_page_count);
875 meta -= meta->zm_page_count;
876 }
877 } else if (!zone->allows_foreign) {
878 zone_page_metadata_foreign_confusion_panic(zone, addr);
879 #if __LP64__
880 } else if (!from_foreign_range(addr, esize)) {
881 zone_invalid_foreign_addr_panic(zone, addr);
882 #else
883 } else if (!pmap_kernel_va(addr)) {
884 zone_invalid_element_addr_panic(zone, addr);
885 #endif
886 }
887
888 if (!zone_allocated_element_offset_is_valid(zone, addr, page, kind)) {
889 zone_invalid_element_addr_panic(zone, addr);
890 }
891
892 if (!zone_has_index(zone, meta->zm_index)) {
893 zone_page_metadata_index_confusion_panic(zone, addr, meta);
894 }
895
896 if (kindp) {
897 *kindp = kind;
898 }
899 if (pagep) {
900 *pagep = page;
901 }
902 return meta;
903 }
904
905 __attribute__((always_inline))
906 void
907 zone_allocated_element_validate(zone_t zone, vm_offset_t addr)
908 {
909 zone_allocated_element_resolve(zone, addr, NULL, NULL);
910 }
911
912 __header_always_inline vm_offset_t
913 zone_page_meta_get_freelist(zone_t zone, struct zone_page_metadata *meta,
914 vm_offset_t page)
915 {
916 assert(!meta->zm_secondary_page);
917 if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) {
918 return 0;
919 }
920
921 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
922 if (meta->zm_freelist_offs + zone_elem_size(zone) > size) {
923 zone_metadata_corruption(zone, meta, "freelist corruption");
924 }
925
926 return page + meta->zm_freelist_offs;
927 }
928
929 __header_always_inline void
930 zone_page_meta_set_freelist(struct zone_page_metadata *meta,
931 vm_offset_t page, vm_offset_t addr)
932 {
933 assert(!meta->zm_secondary_page);
934 if (addr) {
935 meta->zm_freelist_offs = (uint16_t)(addr - page);
936 } else {
937 meta->zm_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
938 }
939 }
940
941 static bool
942 zone_page_meta_is_sane_element(zone_t zone, struct zone_page_metadata *meta,
943 vm_offset_t page, vm_offset_t element, zone_addr_kind_t kind)
944 {
945 if (element == 0) {
946 /* ends of the freelist are NULL */
947 return true;
948 }
949 if (element < page + ZONE_PAGE_FIRST_OFFSET(kind)) {
950 return false;
951 }
952 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
953 if (element > page + size - zone_elem_size(zone)) {
954 return false;
955 }
956 return true;
957 }
958
959 /* Routine to get the size of a zone-allocated address.
960 * If the address doesn't belong to the zone maps, returns 0.
961 */
962 vm_size_t
963 zone_element_size(void *addr, zone_t *z)
964 {
965 struct zone_page_metadata *meta;
966 struct zone *src_zone;
967
968 if (from_zone_map(addr, sizeof(void *))) {
969 meta = zone_native_meta_from_addr(addr);
970 src_zone = &zone_array[meta->zm_index];
971 if (z) {
972 *z = src_zone;
973 }
974 return zone_elem_size(src_zone);
975 }
976 #if CONFIG_GZALLOC
977 if (__improbable(gzalloc_enabled())) {
978 vm_size_t gzsize;
979 if (gzalloc_element_size(addr, z, &gzsize)) {
980 return gzsize;
981 }
982 }
983 #endif /* CONFIG_GZALLOC */
984
985 return 0;
986 }
987
988 /* This function just formats the reason for the panics by redoing the checks */
989 __abortlike
990 static void
991 zone_require_panic(zone_t zone, void *addr)
992 {
993 uint32_t zindex;
994 zone_t other;
995
996 if (!from_zone_map(addr, zone_elem_size(zone))) {
997 panic("zone_require failed: address not in a zone (addr: %p)", addr);
998 }
999
1000 zindex = zone_native_meta_from_addr(addr)->zm_index;
1001 other = &zone_array[zindex];
1002 if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
1003 panic("zone_require failed: invalid zone index %d "
1004 "(addr: %p, expected: %s%s)", zindex,
1005 addr, zone_heap_name(zone), zone->z_name);
1006 } else {
1007 panic("zone_require failed: address in unexpected zone id %d (%s%s) "
1008 "(addr: %p, expected: %s%s)",
1009 zindex, zone_heap_name(other), other->z_name,
1010 addr, zone_heap_name(zone), zone->z_name);
1011 }
1012 }
1013
1014 __abortlike
1015 static void
1016 zone_id_require_panic(zone_id_t zid, void *addr)
1017 {
1018 zone_require_panic(&zone_array[zid], addr);
1019 }
1020
1021 /*
1022 * Routines to panic if a pointer is not mapped to an expected zone.
1023 * This can be used as a means of pinning an object to the zone it is expected
1024 * to be a part of. Causes a panic if the address does not belong to any
1025 * specified zone, does not belong to any zone, has been freed and therefore
1026 * unmapped from the zone, or the pointer contains an uninitialized value that
1027 * does not belong to any zone.
1028 *
1029 * Note that this can only work with collectable zones without foreign pages.
1030 */
1031 void
1032 zone_require(zone_t zone, void *addr)
1033 {
1034 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1035 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1036 return;
1037 }
1038 #if CONFIG_GZALLOC
1039 if (__probable(gzalloc_enabled())) {
1040 return;
1041 }
1042 #endif
1043 zone_require_panic(zone, addr);
1044 }
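/*
 * Illustrative usage sketch (not part of the build): a subsystem can pin a
 * pointer it is about to trust to the zone it must have come from. The zone
 * and type names below are hypothetical.
 */
#if 0 /* illustration only */
static void
example_zone_require(zone_t my_object_zone, struct my_object *obj)
{
	/* panics unless obj was allocated from my_object_zone */
	zone_require(my_object_zone, obj);
}
#endif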
1045
1046 void
1047 zone_id_require(zone_id_t zid, vm_size_t esize, void *addr)
1048 {
1049 if (__probable(from_general_submap(addr, esize) &&
1050 (zid == zone_native_meta_from_addr(addr)->zm_index))) {
1051 return;
1052 }
1053 #if CONFIG_GZALLOC
1054 if (__probable(gzalloc_enabled())) {
1055 return;
1056 }
1057 #endif
1058 zone_id_require_panic(zid, addr);
1059 }
1060
1061 bool
1062 zone_owns(zone_t zone, void *addr)
1063 {
1064 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1065 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1066 return true;
1067 }
1068 #if CONFIG_GZALLOC
1069 if (__probable(gzalloc_enabled())) {
1070 return true;
1071 }
1072 #endif
1073 return false;
1074 }
1075
1076 #pragma mark ZTAGS
1077 #if VM_MAX_TAG_ZONES
1078
1079 // for zones with tagging enabled:
1080
1081 // calculate a pointer to the tag base entry,
1082 // holding either a uint32_t the first tag offset for a page in the zone map,
1083 // or two uint16_t tags if the page can only hold one or two elements
1084
1085 #define ZTAGBASE(zone, element) \
1086 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_info.zi_map_range.min_address)])
1087
1088 // pointer to the tag for an element
1089 #define ZTAG(zone, element) \
1090 ({ \
1091 vm_tag_t * result; \
1092 if ((zone)->tags_inline) { \
1093 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
1094 if ((page_mask & element) >= zone_elem_size(zone)) result++; \
1095 } else { \
1096 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / zone_elem_size((zone))]; \
1097 } \
1098 result; \
1099 })
1100
1101
1102 static vm_offset_t zone_tagbase_min;
1103 static vm_offset_t zone_tagbase_max;
1104 static vm_offset_t zone_tagbase_map_size;
1105 static vm_map_t zone_tagbase_map;
1106
1107 static vm_offset_t zone_tags_min;
1108 static vm_offset_t zone_tags_max;
1109 static vm_offset_t zone_tags_map_size;
1110 static vm_map_t zone_tags_map;
1111
1112 // simple heap allocator for allocating the tags for new memory
1113
1114 LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */
1115
1116 enum{
1117 ztFreeIndexCount = 8,
1118 ztFreeIndexMax = (ztFreeIndexCount - 1),
1119 ztTagsPerBlock = 4
1120 };
1121
1122 struct ztBlock {
1123 #if __LITTLE_ENDIAN__
1124 uint64_t free:1,
1125 next:21,
1126 prev:21,
1127 size:21;
1128 #else
1129 // ztBlock needs free bit least significant
1130 #error !__LITTLE_ENDIAN__
1131 #endif
1132 };
1133 typedef struct ztBlock ztBlock;
1134
1135 static ztBlock * ztBlocks;
1136 static uint32_t ztBlocksCount;
1137 static uint32_t ztBlocksFree;
1138
1139 static uint32_t
1140 ztLog2up(uint32_t size)
1141 {
1142 if (1 == size) {
1143 size = 0;
1144 } else {
1145 size = 32 - __builtin_clz(size - 1);
1146 }
1147 return size;
1148 }
1149
1150 static uint32_t
1151 ztLog2down(uint32_t size)
1152 {
1153 size = 31 - __builtin_clz(size);
1154 return size;
1155 }
1156
1157 static void
1158 ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
1159 {
1160 vm_map_offset_t addr = (vm_map_offset_t) address;
1161 vm_map_offset_t page, end;
1162
1163 page = trunc_page(addr);
1164 end = round_page(addr + size);
1165
1166 for (; page < end; page += page_size) {
1167 if (!pmap_find_phys(kernel_pmap, page)) {
1168 kern_return_t __unused
1169 ret = kernel_memory_populate(map, page, PAGE_SIZE,
1170 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
1171 assert(ret == KERN_SUCCESS);
1172 }
1173 }
1174 }
1175
1176 static boolean_t
1177 ztPresent(const void * address, size_t size)
1178 {
1179 vm_map_offset_t addr = (vm_map_offset_t) address;
1180 vm_map_offset_t page, end;
1181 boolean_t result;
1182
1183 page = trunc_page(addr);
1184 end = round_page(addr + size);
1185 for (result = TRUE; (page < end); page += page_size) {
1186 result = pmap_find_phys(kernel_pmap, page);
1187 if (!result) {
1188 break;
1189 }
1190 }
1191 return result;
1192 }
1193
1194
1195 void __unused
1196 ztDump(boolean_t sanity);
1197 void __unused
1198 ztDump(boolean_t sanity)
1199 {
1200 uint32_t q, cq, p;
1201
1202 for (q = 0; q <= ztFreeIndexMax; q++) {
1203 p = q;
1204 do{
1205 if (sanity) {
1206 cq = ztLog2down(ztBlocks[p].size);
1207 if (cq > ztFreeIndexMax) {
1208 cq = ztFreeIndexMax;
1209 }
1210 if (!ztBlocks[p].free
1211 || ((p != q) && (q != cq))
1212 || (ztBlocks[ztBlocks[p].next].prev != p)
1213 || (ztBlocks[ztBlocks[p].prev].next != p)) {
1214 kprintf("zterror at %d", p);
1215 ztDump(FALSE);
1216 kprintf("zterror at %d", p);
1217 assert(FALSE);
1218 }
1219 continue;
1220 }
1221 kprintf("zt[%03d]%c %d, %d, %d\n",
1222 p, ztBlocks[p].free ? 'F' : 'A',
1223 ztBlocks[p].next, ztBlocks[p].prev,
1224 ztBlocks[p].size);
1225 p = ztBlocks[p].next;
1226 if (p == q) {
1227 break;
1228 }
1229 }while (p != q);
1230 if (!sanity) {
1231 printf("\n");
1232 }
1233 }
1234 if (!sanity) {
1235 printf("-----------------------\n");
1236 }
1237 }
1238
1239
1240
1241 #define ZTBDEQ(idx) \
1242 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
1243 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
1244
1245 static void
1246 ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
1247 {
1248 uint32_t q, w, p, size, merge;
1249
1250 assert(count);
1251 ztBlocksFree += count;
1252
1253 // merge with the following free block
1254 merge = (index + count);
1255 if ((merge < ztBlocksCount)
1256 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1257 && ztBlocks[merge].free) {
1258 ZTBDEQ(merge);
1259 count += ztBlocks[merge].size;
1260 }
1261
1262 // merge with the preceding free block
1263 merge = (index - 1);
1264 if ((merge > ztFreeIndexMax)
1265 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1266 && ztBlocks[merge].free) {
1267 size = ztBlocks[merge].size;
1268 count += size;
1269 index -= size;
1270 ZTBDEQ(index);
1271 }
1272
1273 q = ztLog2down(count);
1274 if (q > ztFreeIndexMax) {
1275 q = ztFreeIndexMax;
1276 }
1277 w = q;
1278 // queue in order of size
1279 while (TRUE) {
1280 p = ztBlocks[w].next;
1281 if (p == q) {
1282 break;
1283 }
1284 if (ztBlocks[p].size >= count) {
1285 break;
1286 }
1287 w = p;
1288 }
1289 ztBlocks[p].prev = index;
1290 ztBlocks[w].next = index;
1291
1292 // fault in first
1293 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1294
1295 // mark first & last with free flag and size
1296 ztBlocks[index].free = TRUE;
1297 ztBlocks[index].size = count;
1298 ztBlocks[index].prev = w;
1299 ztBlocks[index].next = p;
1300 if (count > 1) {
1301 index += (count - 1);
1302 // fault in last
1303 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1304 ztBlocks[index].free = TRUE;
1305 ztBlocks[index].size = count;
1306 }
1307 }
1308
1309 static uint32_t
1310 ztAlloc(zone_t zone, uint32_t count)
1311 {
1312 uint32_t q, w, p, leftover;
1313
1314 assert(count);
1315
1316 q = ztLog2up(count);
1317 if (q > ztFreeIndexMax) {
1318 q = ztFreeIndexMax;
1319 }
1320 do{
1321 w = q;
1322 while (TRUE) {
1323 p = ztBlocks[w].next;
1324 if (p == q) {
1325 break;
1326 }
1327 if (ztBlocks[p].size >= count) {
1328 // dequeue, mark both ends allocated
1329 ztBlocks[w].next = ztBlocks[p].next;
1330 ztBlocks[ztBlocks[p].next].prev = w;
1331 ztBlocks[p].free = FALSE;
1332 ztBlocksFree -= ztBlocks[p].size;
1333 if (ztBlocks[p].size > 1) {
1334 ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
1335 }
1336
1337 // fault all the allocation
1338 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
1339 // mark last as allocated
1340 if (count > 1) {
1341 ztBlocks[p + count - 1].free = FALSE;
1342 }
1343 // free remainder
1344 leftover = ztBlocks[p].size - count;
1345 if (leftover) {
1346 ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
1347 }
1348
1349 return p;
1350 }
1351 w = p;
1352 }
1353 q++;
1354 }while (q <= ztFreeIndexMax);
1355
1356 return -1U;
1357 }
1358
1359 __startup_func
1360 static void
1361 zone_tagging_init(vm_size_t max_zonemap_size)
1362 {
1363 kern_return_t ret;
1364 vm_map_kernel_flags_t vmk_flags;
1365 uint32_t idx;
1366
1367 // allocate submaps VM_KERN_MEMORY_DIAG
1368
1369 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
1370 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1371 vmk_flags.vmkf_permanent = TRUE;
1372 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
1373 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1374 &zone_tagbase_map);
1375
1376 if (ret != KERN_SUCCESS) {
1377 panic("zone_init: kmem_suballoc failed");
1378 }
1379 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
1380
1381 zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
1382 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1383 vmk_flags.vmkf_permanent = TRUE;
1384 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
1385 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1386 &zone_tags_map);
1387
1388 if (ret != KERN_SUCCESS) {
1389 panic("zone_init: kmem_suballoc failed");
1390 }
1391 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
1392
1393 ztBlocks = (ztBlock *) zone_tags_min;
1394 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
1395
1396 // initialize the qheads
1397 lck_mtx_lock(&ztLock);
1398
1399 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
1400 for (idx = 0; idx < ztFreeIndexCount; idx++) {
1401 ztBlocks[idx].free = TRUE;
1402 ztBlocks[idx].next = idx;
1403 ztBlocks[idx].prev = idx;
1404 ztBlocks[idx].size = 0;
1405 }
1406 // free remaining space
1407 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
1408
1409 lck_mtx_unlock(&ztLock);
1410 }
1411
1412 static void
1413 ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
1414 {
1415 uint32_t * tagbase;
1416 uint32_t count, block, blocks, idx;
1417 size_t pages;
1418
1419 pages = atop(size);
1420 tagbase = ZTAGBASE(zone, mem);
1421
1422 lck_mtx_lock(&ztLock);
1423
1424 // fault tagbase
1425 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
1426
1427 if (!zone->tags_inline) {
1428 // allocate tags
1429 count = (uint32_t)(size / zone_elem_size(zone));
1430 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1431 block = ztAlloc(zone, blocks);
1432 if (-1U == block) {
1433 ztDump(false);
1434 }
1435 assert(-1U != block);
1436 }
1437
1438 lck_mtx_unlock(&ztLock);
1439
1440 if (!zone->tags_inline) {
1441 // set tag base for each page
1442 block *= ztTagsPerBlock;
1443 for (idx = 0; idx < pages; idx++) {
1444 vm_offset_t esize = zone_elem_size(zone);
1445 tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize);
1446 }
1447 }
1448 }
1449
1450 static void
1451 ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
1452 {
1453 uint32_t * tagbase;
1454 uint32_t count, block, blocks, idx;
1455 size_t pages;
1456
1457 // invalidate the tag base entry for each page
1458 pages = atop(size);
1459 tagbase = ZTAGBASE(zone, mem);
1460 block = tagbase[0];
1461 for (idx = 0; idx < pages; idx++) {
1462 tagbase[idx] = 0xFFFFFFFF;
1463 }
1464
1465 lck_mtx_lock(&ztLock);
1466 if (!zone->tags_inline) {
1467 count = (uint32_t)(size / zone_elem_size(zone));
1468 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1469 assert(block != 0xFFFFFFFF);
1470 block /= ztTagsPerBlock;
1471 ztFree(NULL /* zone is unlocked */, block, blocks);
1472 }
1473
1474 lck_mtx_unlock(&ztLock);
1475 }
1476
1477 uint32_t
1478 zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
1479 {
1480 simple_lock(&all_zones_lock, &zone_locks_grp);
1481
1482 zone_index_foreach(idx) {
1483 zone_t z = &zone_array[idx];
1484 if (!z->tags) {
1485 continue;
1486 }
1487 if (tag_zone_index != z->tag_zone_index) {
1488 continue;
1489 }
1490
1491 *elem_size = zone_elem_size(z);
1492 simple_unlock(&all_zones_lock);
1493 return idx;
1494 }
1495
1496 simple_unlock(&all_zones_lock);
1497
1498 return -1U;
1499 }
1500
1501 #endif /* VM_MAX_TAG_ZONES */
1502 #pragma mark zalloc helpers
1503
1504 const char *
1505 zone_name(zone_t z)
1506 {
1507 return z->z_name;
1508 }
1509
1510 const char *
1511 zone_heap_name(zone_t z)
1512 {
1513 if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) {
1514 return kalloc_heap_names[z->kalloc_heap];
1515 }
1516 return "invalid";
1517 }
1518
1519 static inline vm_size_t
1520 zone_submaps_approx_size(void)
1521 {
1522 vm_size_t size = 0;
1523
1524 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
1525 size += zone_submaps[idx]->size;
1526 }
1527
1528 return size;
1529 }
1530
1531 bool
1532 zone_maps_owned(vm_address_t addr, vm_size_t size)
1533 {
1534 return from_zone_map(addr, size);
1535 }
1536
1537 void
1538 zone_map_sizes(
1539 vm_map_size_t *psize,
1540 vm_map_size_t *pfree,
1541 vm_map_size_t *plargest_free)
1542 {
1543 vm_map_sizes(zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP], psize, pfree, plargest_free);
1544 }
1545
1546 vm_map_t
1547 zone_submap(zone_t zone)
1548 {
1549 return submap_for_zone(zone);
1550 }
1551
1552 unsigned
1553 zpercpu_count(void)
1554 {
1555 return zpercpu_early_count;
1556 }
1557
1558 int
1559 track_this_zone(const char *zonename, const char *logname)
1560 {
1561 unsigned int len;
1562 const char *zc = zonename;
1563 const char *lc = logname;
1564
1565 /*
1566 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1567 */
1568
1569 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1570 /*
1571 * If the current characters don't match, check for a space
1572 * in the zone name and a corresponding period in the log name.
1573 * If that's not there, then the strings don't match.
1574 */
1575
1576 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
1577 break;
1578 }
1579
1580 /*
1581 * The strings are equal so far. If we're at the end, then it's a match.
1582 */
1583
1584 if (*zc == '\0') {
1585 return TRUE;
1586 }
1587 }
1588
1589 return FALSE;
1590 }
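/*
 * Illustrative usage sketch (not part of the build): boot-args cannot carry
 * spaces, so a '.' in the zlog= name matches a ' ' in the zone name. The
 * names below are examples only.
 */
#if 0 /* illustration only */
static void
example_track_this_zone(void)
{
	assert(track_this_zone("kalloc.48", "kalloc.48"));     /* exact match */
	assert(track_this_zone("vm objects", "vm.objects"));   /* space vs. period */
}
#endif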
1591
1592 #if DEBUG || DEVELOPMENT
1593
1594 vm_size_t
1595 zone_element_info(void *addr, vm_tag_t * ptag)
1596 {
1597 vm_size_t size = 0;
1598 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1599 struct zone_page_metadata *meta;
1600 struct zone *src_zone;
1601
1602 if (from_zone_map(addr, sizeof(void *))) {
1603 meta = zone_native_meta_from_addr(addr);
1604 src_zone = &zone_array[meta->zm_index];
1605 #if VM_MAX_TAG_ZONES
1606 if (__improbable(src_zone->tags)) {
1607 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1608 }
1609 #endif /* VM_MAX_TAG_ZONES */
1610 size = zone_elem_size(src_zone);
1611 } else {
1612 #if CONFIG_GZALLOC
1613 gzalloc_element_size(addr, NULL, &size);
1614 #endif /* CONFIG_GZALLOC */
1615 }
1616 *ptag = tag;
1617 return size;
1618 }
1619
1620 #endif /* DEBUG || DEVELOPMENT */
1621
1622 /* Someone wrote to freed memory. */
1623 __abortlike
1624 static void
1625 zone_element_was_modified_panic(
1626 zone_t zone,
1627 vm_offset_t element,
1628 vm_offset_t found,
1629 vm_offset_t expected,
1630 vm_offset_t offset)
1631 {
1632 panic("a freed zone element has been modified in zone %s%s: "
1633 "expected %p but found %p, bits changed %p, "
1634 "at offset %d of %d in element %p, cookies %p %p",
1635 zone_heap_name(zone),
1636 zone->z_name,
1637 (void *) expected,
1638 (void *) found,
1639 (void *) (expected ^ found),
1640 (uint32_t) offset,
1641 (uint32_t) zone_elem_size(zone),
1642 (void *) element,
1643 (void *) zp_nopoison_cookie,
1644 (void *) zp_poisoned_cookie);
1645 }
1646
1647 /* The backup pointer is stored in the last pointer-sized location in an element. */
1648 __header_always_inline vm_offset_t *
1649 get_backup_ptr(vm_size_t elem_size, vm_offset_t *element)
1650 {
1651 return (vm_offset_t *)((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
1652 }
1653
1654 /*
1655 * The primary and backup pointers don't match.
1656 * Determine which one was likely the corrupted pointer, find out what it
1657 * probably should have been, and panic.
1658 */
1659 __abortlike
1660 static void
1661 backup_ptr_mismatch_panic(
1662 zone_t zone,
1663 struct zone_page_metadata *page_meta,
1664 vm_offset_t page,
1665 vm_offset_t element)
1666 {
1667 vm_offset_t primary = *(vm_offset_t *)element;
1668 vm_offset_t backup = *get_backup_ptr(zone_elem_size(zone), &element);
1669 vm_offset_t likely_backup;
1670 vm_offset_t likely_primary;
1671 zone_addr_kind_t kind = zone_addr_kind(page, zone_elem_size(zone));
1672
1673 likely_primary = primary ^ zp_nopoison_cookie;
1674 boolean_t sane_backup;
1675 boolean_t sane_primary = zone_page_meta_is_sane_element(zone, page_meta,
1676 page, likely_primary, kind);
1677 boolean_t element_was_poisoned = (backup & 0x1);
1678
1679 #if defined(__LP64__)
1680 /* We can inspect the tag in the upper bits for additional confirmation */
1681 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
1682 element_was_poisoned = TRUE;
1683 } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
1684 element_was_poisoned = FALSE;
1685 }
1686 #endif
1687
1688 if (element_was_poisoned) {
1689 likely_backup = backup ^ zp_poisoned_cookie;
1690 } else {
1691 likely_backup = backup ^ zp_nopoison_cookie;
1692 }
1693 sane_backup = zone_page_meta_is_sane_element(zone, page_meta,
1694 page, likely_backup, kind);
1695
1696 /* The primary is definitely the corrupted one */
1697 if (!sane_primary && sane_backup) {
1698 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1699 }
1700
1701 /* The backup is definitely the corrupted one */
1702 if (sane_primary && !sane_backup) {
1703 zone_element_was_modified_panic(zone, element, backup,
1704 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
1705 zone_elem_size(zone) - sizeof(vm_offset_t));
1706 }
1707
1708 /*
1709 * Not sure which is the corrupted one.
1710 * It's less likely that the backup pointer was overwritten with
1711 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1712 * primary pointer has been overwritten with a sane but incorrect address.
1713 */
1714 if (sane_primary && sane_backup) {
1715 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1716 }
1717
1718 /* Neither are sane, so just guess. */
1719 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1720 }
1721
1722 /*
1723 * zone_sequestered_page_get
1724 * z is locked
1725 */
1726 static struct zone_page_metadata *
1727 zone_sequestered_page_get(zone_t z, vm_offset_t *page)
1728 {
1729 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
1730
1731 if (!zone_pva_is_null(z->pages_sequester)) {
1732 if (os_sub_overflow(z->sequester_page_count, z->alloc_pages,
1733 &z->sequester_page_count)) {
1734 zone_accounting_panic(z, "sequester_page_count wrap-around");
1735 }
1736 return zone_meta_queue_pop(z, &z->pages_sequester, kind, page);
1737 }
1738
1739 return NULL;
1740 }
1741
1742 /*
1743 * zone_sequestered_page_populate
1744 * z is unlocked
1745 * page_meta is invalid on failure
1746 */
1747 static kern_return_t
1748 zone_sequestered_page_populate(zone_t z, struct zone_page_metadata *page_meta,
1749 vm_offset_t space, vm_size_t alloc_size, int zflags)
1750 {
1751 kern_return_t retval;
1752
1753 assert(alloc_size == ptoa(z->alloc_pages));
1754 retval = kernel_memory_populate(submap_for_zone(z), space, alloc_size,
1755 zflags, VM_KERN_MEMORY_ZONE);
1756 if (retval != KERN_SUCCESS) {
1757 lock_zone(z);
1758 zone_meta_queue_push(z, &z->pages_sequester, page_meta, ZONE_ADDR_NATIVE);
1759 z->sequester_page_count += z->alloc_pages;
1760 unlock_zone(z);
1761 }
1762 return retval;
1763 }
1764
1765 #pragma mark Zone poisoning/zeroing
1766
1767 /*
1768 * Initialize zone poisoning
1769 * called from zone_bootstrap before any allocations are made from zalloc
1770 */
1771 __startup_func
1772 static void
1773 zp_bootstrap(void)
1774 {
1775 char temp_buf[16];
1776
1777 /*
1778 * Initialize backup pointer random cookie for poisoned elements
1779 * Try not to call early_random() back to back, it may return
1780 * the same value if mach_absolute_time doesn't have sufficient time
1781 * to tick over between calls. <rdar://problem/11597395>
1782 * (This is only a problem on embedded devices)
1783 */
1784 zp_poisoned_cookie = (uintptr_t) early_random();
1785
1786 /* -zp: enable poisoning for every alloc and free */
1787 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
1788 zp_factor = 1;
1789 }
1790
1791 /* -no-zp: disable poisoning */
1792 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
1793 zp_factor = 0;
1794 printf("Zone poisoning disabled\n");
1795 }
1796
1797 /* Initialize backup pointer random cookie for unpoisoned elements */
1798 zp_nopoison_cookie = (uintptr_t) early_random();
1799
1800 #if MACH_ASSERT
1801 if (zp_poisoned_cookie == zp_nopoison_cookie) {
1802 panic("early_random() is broken: %p and %p are not random\n",
1803 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
1804 }
1805 #endif
1806
1807 /*
1808 * Use the last bit in the backup pointer to hint poisoning state
1809 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
1810 * the low bits are zero.
1811 */
1812 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
1813 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
1814
1815 #if defined(__LP64__)
1816 /*
1817 * Make backup pointers more obvious in GDB for 64 bit
1818 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
1819 * (0xFACADE = 0xFFFFFF ^ 0x053521)
1820 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
1821 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
1822 * by the sanity check, so it's OK for that part of the cookie to be predictable.
1823 *
1824 * TODO: Use #defines, xors, and shifts
1825 */
1826
1827 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
1828 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
1829
1830 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
1831 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
1832 #endif
1833
1834 /*
1835 * Initialize zp_min_size to two cachelines. Elements smaller than this will
1836 * be zeroed.
1837 */
1838 ml_cpu_info_t cpu_info;
1839 ml_cpu_get_info(&cpu_info);
1840 zp_min_size = 2 * cpu_info.cache_line_size;
1841 }
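/*
 * Illustration only: with the masks chosen above, a free element whose next
 * pointer starts with 0xFFFFFF carries a backup word that reads 0xC0FFEE...
 * in a debugger when it was not poisoned and 0xFACADE... when it was, which
 * is the hint backup_ptr_mismatch_panic() uses to guess the poisoning state.
 */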
1842
1843 inline uint32_t
1844 zone_poison_count_init(zone_t zone)
1845 {
1846 return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^
1847 (mach_absolute_time() & 0x7);
1848 }
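/*
 * Worked example (with hypothetical tuning values, since the zp_factor and
 * zp_scale defaults are configured elsewhere): for zp_factor = 16,
 * zp_scale = 4 and a 128-byte element, the expression above evaluates as
 *
 *	(16 + (128 >> 4)) ^ (mach_absolute_time() & 0x7)
 *	    = 24 ^ (0..7)
 *
 * i.e. an initial countdown somewhere in [24, 31], so roughly one in every
 * couple dozen frees of such an element ends up poisoned.
 */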
1849
1850 #if ZALLOC_ENABLE_POISONING
1851 static bool
1852 zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1853 {
1854 bool poison = false;
1855 uint32_t zp_count_local;
1856
1857 assert(!zone->percpu);
1858 if (zp_factor != 0) {
1859 /*
1860 * Poison the memory of every zp_count-th element before it ends up
1861 * on the freelist to catch use-after-free and use of uninitialized
1862 * memory.
1863 *
1864 * Every element is poisoned when zp_factor is set to 1.
1865 *
1866 */
1867 zp_count_local = os_atomic_load(zp_count, relaxed);
1868 if (__improbable(zp_count_local == 0 || zp_factor == 1)) {
1869 poison = true;
1870
1871 os_atomic_store(zp_count, zone_poison_count_init(zone), relaxed);
1872
1873 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
1874 vm_offset_t *element_cursor = ((vm_offset_t *) elem);
1875 vm_offset_t *end_cursor = (vm_offset_t *)(elem + zone_elem_size(zone));
1876
1877 for (; element_cursor < end_cursor; element_cursor++) {
1878 *element_cursor = ZONE_POISON;
1879 }
1880 } else {
1881 os_atomic_store(zp_count, zp_count_local - 1, relaxed);
1882 /*
1883 * Zero the first zp_min_size bytes of elements that aren't being poisoned.
1884 * The element size is larger than zp_min_size in this path, as elements
1885 * that are smaller are always zeroed.
1886 */
1887 bzero((void *) elem, zp_min_size);
1888 }
1889 }
1890 return poison;
1891 }
1892 #else
1893 static bool
1894 zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1895 {
1896 #pragma unused(zone, zp_count, elem)
1897 assert(!zone->percpu);
1898 return false;
1899 }
1900 #endif
1901
1902 __attribute__((always_inline))
1903 static bool
1904 zfree_clear(zone_t zone, vm_offset_t addr, vm_size_t elem_size)
1905 {
1906 assert(zone->zfree_clear_mem);
1907 if (zone->percpu) {
1908 zpercpu_foreach_cpu(i) {
1909 bzero((void *)(addr + ptoa(i)), elem_size);
1910 }
1911 } else {
1912 bzero((void *)addr, elem_size);
1913 }
1914
1915 return true;
1916 }
1917
1918 /*
1919 * Zero the element if the zone has the zfree_clear_mem flag set; otherwise
1920 * poison the element if zp_count hits 0.
1921 */
1922 __attribute__((always_inline))
1923 bool
1924 zfree_clear_or_poison(zone_t zone, uint32_t *zp_count, vm_offset_t addr)
1925 {
1926 vm_size_t elem_size = zone_elem_size(zone);
1927
1928 if (zone->zfree_clear_mem) {
1929 return zfree_clear(zone, addr, elem_size);
1930 }
1931
1932 return zfree_poison_element(zone, zp_count, (vm_offset_t)addr);
1933 }
1934
1935 /*
1936 * Clear out the old next pointer and backup to avoid leaking the zone
1937 * poisoning cookie and so that only values on the freelist have a valid
1938 * cookie.
1939 */
1940 void
1941 zone_clear_freelist_pointers(zone_t zone, vm_offset_t addr)
1942 {
1943 vm_offset_t perm_value = 0;
1944
1945 if (!zone->zfree_clear_mem) {
1946 perm_value = ZONE_POISON;
1947 }
1948
1949 vm_offset_t *primary = (vm_offset_t *) addr;
1950 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
1951
1952 *primary = perm_value;
1953 *backup = perm_value;
1954 }
1955
1956 #if ZALLOC_ENABLE_POISONING
1957 __abortlike
1958 static void
1959 zone_element_not_clear_panic(zone_t zone, void *addr)
1960 {
1961 panic("Zone element %p was modified after free for zone %s%s: "
1962 "Expected element to be cleared", addr, zone_heap_name(zone),
1963 zone->z_name);
1964 }
1965
1966 /*
1967 * Validate that the element was not tampered with while it was in the
1968 * freelist.
1969 */
1970 void
1971 zalloc_validate_element(zone_t zone, vm_offset_t addr, vm_size_t size, bool validate)
1972 {
1973 if (zone->percpu) {
1974 assert(zone->zfree_clear_mem);
1975 zpercpu_foreach_cpu(i) {
1976 if (memcmp_zero_ptr_aligned((void *)(addr + ptoa(i)), size)) {
1977 zone_element_not_clear_panic(zone, (void *)(addr + ptoa(i)));
1978 }
1979 }
1980 } else if (zone->zfree_clear_mem) {
1981 if (memcmp_zero_ptr_aligned((void *)addr, size)) {
1982 zone_element_not_clear_panic(zone, (void *)addr);
1983 }
1984 } else if (__improbable(validate)) {
1985 const vm_offset_t *p = (vm_offset_t *)addr;
1986 const vm_offset_t *end = (vm_offset_t *)(addr + size);
1987
1988 for (; p < end; p++) {
1989 if (*p != ZONE_POISON) {
1990 zone_element_was_modified_panic(zone, addr,
1991 *p, ZONE_POISON, (vm_offset_t)p - addr);
1992 }
1993 }
1994 } else {
1995 /*
1996 * If the element wasn't poisoned or entirely cleared, validate that the
1997 * minimum number of bytes that were cleared on free haven't been corrupted.
1998 * addr is advanced by one pointer size because we have already validated
1999 * and cleared the freelist pointer/zcache canary.
2000 */
2001 if (memcmp_zero_ptr_aligned((void *) (addr + sizeof(vm_offset_t)),
2002 zp_min_size - sizeof(vm_offset_t))) {
2003 zone_element_not_clear_panic(zone, (void *)addr);
2004 }
2005 }
2006 }
2007 #endif /* ZALLOC_ENABLE_POISONING */
2008
2009 #pragma mark Zone Leak Detection
2010
2011 /*
2012 * Zone leak debugging code
2013 *
2014 * When enabled, this code keeps a log to track allocations to a particular zone that have not
2015 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
2016 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
2017 * off by default.
2018 *
2019 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
2020 * is the name of the zone you wish to log.
2021 *
2022 * This code can only track a small, fixed number of zones at once, so you need to identify which one is leaking first.
2023 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
2024 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
2025 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
2026 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
2027 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
2028 * See the help in the kgmacros for usage info.
2029 *
2030 *
2031 * Zone corruption logging
2032 *
2033 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
2034 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
2035 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
2036 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
2037 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
2038 * corrupted to examine its history. This should lead to the source of the corruption.
2039 */
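/*
 * Illustrative boot-args (a sketch only; the zone names and the record count
 * are made up for the example):
 *
 *	zlog=kalloc.48 zrecs=2000		leak-style logging of one zone
 *	-zc zlog=kalloc.48			corruption-style logging (allocs and frees)
 *	zlog1=kalloc.48 zlog2=vm.objects	log several zones at once (zlog1..zlog<n>)
 */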
2040
2041 /* Returns TRUE if we rolled over the counter at factor */
2042 __header_always_inline bool
2043 sample_counter(volatile uint32_t *count_p, uint32_t factor)
2044 {
2045 uint32_t old_count, new_count = 0;
2046 if (count_p != NULL) {
2047 os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, {
2048 new_count = old_count + 1;
2049 if (new_count >= factor) {
2050 new_count = 0;
2051 }
2052 });
2053 }
2054
2055 return new_count == 0;
2056 }
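/*
 * Usage sketch (hypothetical caller, not from this file): sample roughly one
 * event out of every `factor` calls:
 *
 *	static uint32_t counter;
 *
 *	if (sample_counter(&counter, zleak_sample_factor)) {
 *		capture a backtrace for this allocation
 *	}
 */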
2057
2058 #if ZONE_ENABLE_LOGGING
2059 /* Log allocations and frees to help debug a zone element corruption */
2060 TUNABLE(bool, corruption_debug_flag, "-zc", false);
2061
2062 #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
2063
2064 static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
2065 static int num_zones_logged = 0;
2066
2067 /*
2068 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
2069 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
2070 * is the number of stacks suspected of leaking, we don't need many records.
2071 */
2072
2073 #if defined(__LP64__)
2074 #define ZRECORDS_MAX 2560 /* Max records allowed in the log */
2075 #else
2076 #define ZRECORDS_MAX 1536 /* Max records allowed in the log */
2077 #endif
2078 #define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */
2079
2080 static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT);
2081
2082 static void
2083 zone_enable_logging(zone_t z)
2084 {
2085 z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH,
2086 (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
2087
2088 if (z->zlog_btlog) {
2089 printf("zone: logging started for zone %s%s\n",
2090 zone_heap_name(z), z->z_name);
2091 } else {
2092 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2093 z->zone_logging = false;
2094 }
2095 }
2096
2097 /**
2098 * @function zone_setup_logging
2099 *
2100 * @abstract
2101 * Optionally sets up a zone for logging.
2102 *
2103 * @discussion
2104 * We recognize the following boot-args:
2105 *
2106 * zlog=<zone_to_log> (or zlog1=... through zlog<n>=... for several zones)
2107 * zrecs=<num_records_in_log>
2108 *
2109 * The zlog args specify the zone name(s) that should be logged,
2110 * and zrecs is used to control the size of the log.
2111 *
2112 * If zrecs is not specified, a default value is used.
2113 */
2114 static void
2115 zone_setup_logging(zone_t z)
2116 {
2117 char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
2118 char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2119 char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */
2120
2121 /*
2122 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2123 *
2124 * This prevents accidentally hogging too much kernel memory
2125 * and making the system unusable.
2126 */
2127 if (log_records > ZRECORDS_MAX) {
2128 log_records = ZRECORDS_MAX;
2129 }
2130
2131 /*
2132 * Append kalloc heap name to zone name (if zone is used by kalloc)
2133 */
2134 snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
2135
2136 /* zlog0 isn't allowed. */
2137 for (int i = 1; i <= max_num_zones_to_log; i++) {
2138 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2139
2140 if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) &&
2141 track_this_zone(zone_name, zlog_val)) {
2142 z->zone_logging = true;
2143 num_zones_logged++;
2144 break;
2145 }
2146 }
2147
2148 /*
2149 * Backwards compatibility with the old boot-arg used to specify single-zone
2150 * logging, i.e. "zlog". This needs to happen after the newer zlog<n> checks
2151 * because the "zlog" prefix would otherwise match all of the zlog<n>
2152 * boot-args.
2153 */
2154 if (!z->zone_logging &&
2155 PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) &&
2156 track_this_zone(zone_name, zlog_val)) {
2157 z->zone_logging = true;
2158 num_zones_logged++;
2159 }
2160
2161
2162 /*
2163 * If we want to log a zone, see if we need to allocate buffer space for
2164 * the log.
2165 *
2166 * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
2167 * we have to defer allocation in that case.
2168 *
2169 * zone_init() will finish the job.
2170 *
2171 * If we want to log one of the VM related zones that's set up early on,
2172 * we will skip allocation of the log until zinit is called again later
2173 * on some other zone.
2174 */
2175 if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) {
2176 zone_enable_logging(z);
2177 }
2178 }
2179
2180 /*
2181 * Each record in the log contains a pointer to the zone element it refers to,
2182 * and a small array to hold the pc's from the stack trace. A
2183 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
2184 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
2185 * If the log fills, old records are replaced as if it were a circular buffer.
2186 */
2187
2188
2189 /*
2190 * Decide if we want to log this zone by doing a string compare between a zone name and the name
2191 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
2192 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
2193 * match a space in the zone name.
2194 */
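/*
 * For example (with a hypothetical zone name): a zone called "VM map entries"
 * can be selected with the boot-arg zlog=VM.map.entries, since spaces can't be
 * passed on the boot command line.
 */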
2195
2196 /*
2197 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
2198 * the buffer for the records has been allocated.
2199 */
2200
2201 #define DO_LOGGING(z) (z->zlog_btlog != NULL)
2202 #else /* !ZONE_ENABLE_LOGGING */
2203 #define DO_LOGGING(z) 0
2204 #endif /* !ZONE_ENABLE_LOGGING */
2205
2206 #if CONFIG_ZLEAKS
2207
2208 /*
2209 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
2210 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
2211 * backtrace. On every free, we examine the table and, if that allocation was being tracked,
2212 * stop tracking it.
2213 *
2214 * We track the allocations in the zallocations hash table, which stores the address that was returned from
2215 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
2216 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
2217 * backtraces - we don't store them more than once.
2218 *
2219 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
2220 * a large amount of virtual space.
2221 */
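/*
 * Worked example of the thresholds computed in zleak_init() below (the map
 * size is hypothetical): with a 1 GB zone map, zleak_global_tracking_threshold
 * is 512 MB, so trace collection starts once the zone submaps reach roughly
 * half the map; zleak_per_zone_tracking_threshold is then 64 MB (1/8th of
 * that), so only zones wired past 64 MB get zleak_on set.
 */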
2222 #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
2223 #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
2224 #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
2225 #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
2226 uint32_t zleak_state = 0; /* State of collection, as above */
2227
2228 boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
2229 vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
2230 vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
2231 unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
2232
2233 /*
2234 * Counters for allocation statistics.
2235 */
2236
2237 /* Times two active records want to occupy the same spot */
2238 unsigned int z_alloc_collisions = 0;
2239 unsigned int z_trace_collisions = 0;
2240
2241 /* Times a new record lands on a spot previously occupied by a freed allocation */
2242 unsigned int z_alloc_overwrites = 0;
2243 unsigned int z_trace_overwrites = 0;
2244
2245 /* Times a new alloc or trace is put into the hash table */
2246 unsigned int z_alloc_recorded = 0;
2247 unsigned int z_trace_recorded = 0;
2248
2249 /* Times zleak_log returned false due to not being able to acquire the lock */
2250 unsigned int z_total_conflicts = 0;
2251
2252 /*
2253 * Structure for keeping track of an allocation
2254 * An allocation bucket is in use if its element is not NULL
2255 */
2256 struct zallocation {
2257 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
2258 vm_size_t za_size; /* how much memory did this allocation take up? */
2259 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
2260 /* TODO: #if this out */
2261 uint32_t za_hit_count; /* for determining effectiveness of hash function */
2262 };
2263
2264 /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
2265 uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
2266 uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
2267
2268 vm_size_t zleak_max_zonemap_size;
2269
2270 /* Hashmaps of allocations and their corresponding traces */
2271 static struct zallocation* zallocations;
2272 static struct ztrace* ztraces;
2273
2274 /* not static so that panic can see this, see kern/debug.c */
2275 struct ztrace* top_ztrace;
2276
2277 /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
2278 LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock");
2279 LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp);
2280
2281 /*
2282 * Initializes the zone leak monitor. Called from zone_init()
2283 */
2284 __startup_func
2285 static void
2286 zleak_init(vm_size_t max_zonemap_size)
2287 {
2288 char scratch_buf[16];
2289 boolean_t zleak_enable_flag = FALSE;
2290
2291 zleak_max_zonemap_size = max_zonemap_size;
2292 zleak_global_tracking_threshold = max_zonemap_size / 2;
2293 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
2294
2295 #if CONFIG_EMBEDDED
2296 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
2297 zleak_enable_flag = TRUE;
2298 printf("zone leak detection enabled\n");
2299 } else {
2300 zleak_enable_flag = FALSE;
2301 printf("zone leak detection disabled\n");
2302 }
2303 #else /* CONFIG_EMBEDDED */
2304 /* -zleakoff (flag to disable zone leak monitor) */
2305 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
2306 zleak_enable_flag = FALSE;
2307 printf("zone leak detection disabled\n");
2308 } else {
2309 zleak_enable_flag = TRUE;
2310 printf("zone leak detection enabled\n");
2311 }
2312 #endif /* CONFIG_EMBEDDED */
2313
2314 /* zfactor=XXXX (override how often to sample the zone allocator) */
2315 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
2316 printf("Zone leak factor override: %u\n", zleak_sample_factor);
2317 }
2318
2319 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
2320 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
2321 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
2322 /* uses the 'is power of 2' trick: (0x01000 & 0x00FFF) == 0 */
2323 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
2324 printf("Override isn't a power of two, bad things might happen!\n");
2325 }
2326 }
2327
2328 /* zleak-traces=XXXX (override number of buckets in ztraces) */
2329 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
2330 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
2331 /* uses the 'is power of 2' trick: (0x01000 & 0x00FFF) == 0 */
2332 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
2333 printf("Override isn't a power of two, bad things might happen!\n");
2334 }
2335 }
2336
2337 if (zleak_enable_flag) {
2338 zleak_state = ZLEAK_STATE_ENABLED;
2339 }
2340 }
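/*
 * Illustrative boot-args for the knobs parsed above (the numeric values are
 * examples only; the bucket counts must be powers of two):
 *
 *	-zleakon		enable zleaks on CONFIG_EMBEDDED (off by default there)
 *	-zleakoff		disable zleaks elsewhere (on by default otherwise)
 *	zfactor=500		sample every 500th allocation instead of every 1000th
 *	zleak-allocs=32768	resize the zallocations hash table
 *	zleak-traces=8192	resize the ztraces hash table
 */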
2341
2342 /*
2343 * Support for kern.zleak.active sysctl - a simplified
2344 * version of the zleak_state variable.
2345 */
2346 int
2347 get_zleak_state(void)
2348 {
2349 if (zleak_state & ZLEAK_STATE_FAILED) {
2350 return -1;
2351 }
2352 if (zleak_state & ZLEAK_STATE_ACTIVE) {
2353 return 1;
2354 }
2355 return 0;
2356 }
2357
2358 kern_return_t
2359 zleak_activate(void)
2360 {
2361 kern_return_t retval;
2362 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
2363 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
2364 void *allocations_ptr = NULL;
2365 void *traces_ptr = NULL;
2366
2367 /* Only one thread attempts to activate at a time */
2368 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2369 return KERN_SUCCESS;
2370 }
2371
2372 /* Indicate that we're doing the setup */
2373 lck_spin_lock(&zleak_lock);
2374 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2375 lck_spin_unlock(&zleak_lock);
2376 return KERN_SUCCESS;
2377 }
2378
2379 zleak_state |= ZLEAK_STATE_ACTIVATING;
2380 lck_spin_unlock(&zleak_lock);
2381
2382 /* Allocate and zero tables */
2383 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
2384 if (retval != KERN_SUCCESS) {
2385 goto fail;
2386 }
2387
2388 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
2389 if (retval != KERN_SUCCESS) {
2390 goto fail;
2391 }
2392
2393 bzero(allocations_ptr, z_alloc_size);
2394 bzero(traces_ptr, z_trace_size);
2395
2396 /* Everything's set. Install tables, mark active. */
2397 zallocations = allocations_ptr;
2398 ztraces = traces_ptr;
2399
2400 /*
2401 * Initialize the top_ztrace to the first entry in ztraces,
2402 * so we don't have to check for null in zleak_log
2403 */
2404 top_ztrace = &ztraces[0];
2405
2406 /*
2407 * Note that we do need a barrier between installing
2408 * the tables and setting the active flag, because the zfree()
2409 * path accesses the table without a lock if we're active.
2410 */
2411 lck_spin_lock(&zleak_lock);
2412 zleak_state |= ZLEAK_STATE_ACTIVE;
2413 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
2414 lck_spin_unlock(&zleak_lock);
2415
2416 return KERN_SUCCESS;
2417
2418 fail:
2419 /*
2420 * If we fail to allocate memory, don't further tax
2421 * the system by trying again.
2422 */
2423 lck_spin_lock(&zleak_lock);
2424 zleak_state |= ZLEAK_STATE_FAILED;
2425 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
2426 lck_spin_unlock(&zleak_lock);
2427
2428 if (allocations_ptr != NULL) {
2429 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
2430 }
2431
2432 if (traces_ptr != NULL) {
2433 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
2434 }
2435
2436 return retval;
2437 }
2438
2439 /*
2440 * TODO: What about allocations that never get deallocated,
2441 * especially ones with unique backtraces? Should we wait to record
2442 * until after boot has completed?
2443 * (How many persistent zallocs are there?)
2444 */
2445
2446 /*
2447 * This function records the allocation in the allocations table,
2448 * and stores the associated backtrace in the traces table
2449 * (or just grows the trace's accounted size if the trace is already recorded).
2450 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
2451 * the old allocation's size is subtracted from its associated trace.
2452 * If the trace slot is occupied by a different backtrace (a hash collision), the function bails.
2453 * A trace's accounted size grows by the amount of memory each recorded allocation consumes.
2454 * The return value indicates whether to try again next time.
2455 */
2456 static boolean_t
2457 zleak_log(uintptr_t* bt,
2458 uintptr_t addr,
2459 uint32_t depth,
2460 vm_size_t allocation_size)
2461 {
2462 /* Quit if there's someone else modifying the hash tables */
2463 if (!lck_spin_try_lock(&zleak_lock)) {
2464 z_total_conflicts++;
2465 return FALSE;
2466 }
2467
2468 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2469
2470 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
2471 struct ztrace* trace = &ztraces[trace_index];
2472
2473 allocation->za_hit_count++;
2474 trace->zt_hit_count++;
2475
2476 /*
2477 * If the allocation bucket we want to be in is occupied, and if the occupier
2478 * has the same trace as us, just bail.
2479 */
2480 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
2481 z_alloc_collisions++;
2482
2483 lck_spin_unlock(&zleak_lock);
2484 return TRUE;
2485 }
2486
2487 /* STEP 1: Store the backtrace in the traces array. */
2488 /* A size of zero indicates that the trace bucket is free. */
2489
2490 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
2491 /*
2492 * Different unique trace with same hash!
2493 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
2494 * and get out of the way for later chances
2495 */
2496 trace->zt_collisions++;
2497 z_trace_collisions++;
2498
2499 lck_spin_unlock(&zleak_lock);
2500 return TRUE;
2501 } else if (trace->zt_size > 0) {
2502 /* Same trace, already added, so increment refcount */
2503 trace->zt_size += allocation_size;
2504 } else {
2505 /* Found an unused trace bucket, record the trace here! */
2506 if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
2507 z_trace_overwrites++;
2508 }
2509
2510 z_trace_recorded++;
2511 trace->zt_size = allocation_size;
2512 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
2513
2514 trace->zt_depth = depth;
2515 trace->zt_collisions = 0;
2516 }
2517
2518 /* STEP 2: Store the allocation record in the allocations array. */
2519
2520 if (allocation->za_element != (uintptr_t) 0) {
2521 /*
2522 * Straight up replace any allocation record that was there. We don't want to do the work
2523 * to preserve the allocation entries that were there, because we only record a subset of the
2524 * allocations anyways.
2525 */
2526
2527 z_alloc_collisions++;
2528
2529 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
2530 /* Knock off old allocation's size, not the new allocation */
2531 associated_trace->zt_size -= allocation->za_size;
2532 } else if (allocation->za_trace_index != 0) {
2533 /* Slot previously used but not currently in use */
2534 z_alloc_overwrites++;
2535 }
2536
2537 allocation->za_element = addr;
2538 allocation->za_trace_index = trace_index;
2539 allocation->za_size = allocation_size;
2540
2541 z_alloc_recorded++;
2542
2543 if (top_ztrace->zt_size < trace->zt_size) {
2544 top_ztrace = trace;
2545 }
2546
2547 lck_spin_unlock(&zleak_lock);
2548 return TRUE;
2549 }
2550
2551 /*
2552 * Free the allocation record and release the stacktrace.
2553 * This should be as fast as possible because it will be called for every free.
2554 */
2555 __attribute__((noinline))
2556 static void
2557 zleak_free(uintptr_t addr,
2558 vm_size_t allocation_size)
2559 {
2560 if (addr == (uintptr_t) 0) {
2561 return;
2562 }
2563
2564 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2565
2566 /* Double-checked locking: check to find out if we're interested, lock, check to make
2567 * sure it hasn't changed, then modify it, and release the lock.
2568 */
2569
2570 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2571 /* if the allocation was the one, grab the lock, check again, then delete it */
2572 lck_spin_lock(&zleak_lock);
2573
2574 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2575 struct ztrace *trace;
2576
2577 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
2578 if (allocation->za_size != allocation_size) {
2579 panic("Freeing as size %lu memory that was allocated with size %lu\n",
2580 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
2581 }
2582
2583 trace = &ztraces[allocation->za_trace_index];
2584
2585 /* size of 0 indicates trace bucket is unused */
2586 if (trace->zt_size > 0) {
2587 trace->zt_size -= allocation_size;
2588 }
2589
2590 /* A NULL element means the allocation bucket is unused */
2591 allocation->za_element = 0;
2592 }
2593 lck_spin_unlock(&zleak_lock);
2594 }
2595 }
2596
2597 #endif /* CONFIG_ZLEAKS */
2598
2599 /* These functions live outside of CONFIG_ZLEAKS because they are also used in
2600 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix.
2601 */
2602
2603 /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
2604 uintptr_t
2605 hash_mix(uintptr_t x)
2606 {
2607 #ifndef __LP64__
2608 x += ~(x << 15);
2609 x ^= (x >> 10);
2610 x += (x << 3);
2611 x ^= (x >> 6);
2612 x += ~(x << 11);
2613 x ^= (x >> 16);
2614 #else
2615 x += ~(x << 32);
2616 x ^= (x >> 22);
2617 x += ~(x << 13);
2618 x ^= (x >> 8);
2619 x += (x << 3);
2620 x ^= (x >> 15);
2621 x += ~(x << 27);
2622 x ^= (x >> 31);
2623 #endif
2624 return x;
2625 }
2626
2627 uint32_t
2628 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2629 {
2630 uintptr_t hash = 0;
2631 uintptr_t mask = max_size - 1;
2632
2633 while (depth) {
2634 hash += bt[--depth];
2635 }
2636
2637 hash = hash_mix(hash) & mask;
2638
2639 assert(hash < max_size);
2640
2641 return (uint32_t) hash;
2642 }
2643
2644 /*
2645 * TODO: Determine how well distributed this is
2646 * max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0x0FFFF, which makes a clean bitmask
2647 */
2648 uint32_t
2649 hashaddr(uintptr_t pt, uint32_t max_size)
2650 {
2651 uintptr_t hash = 0;
2652 uintptr_t mask = max_size - 1;
2653
2654 hash = hash_mix(pt) & mask;
2655
2656 assert(hash < max_size);
2657
2658 return (uint32_t) hash;
2659 }
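/*
 * Worked example: with max_size = 0x10000 the mask is 0xFFFF, so both hash
 * functions reduce the mixed value to a bucket index in [0, 0xFFFF]; this is
 * why the bucket counts above must be powers of two.
 */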
2660
2661 /* End of all leak-detection code */
2662 #pragma mark zone creation, configuration, destruction
2663
2664 static zone_t
2665 zone_init_defaults(zone_id_t zid)
2666 {
2667 zone_t z = &zone_array[zid];
2668
2669 z->page_count_max = ~0u;
2670 z->collectable = true;
2671 z->expandable = true;
2672 z->submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
2673
2674 simple_lock_init(&z->lock, 0);
2675
2676 return z;
2677 }
2678
2679 static bool
2680 zone_is_initializing(zone_t z)
2681 {
2682 return !z->z_self && !z->destroyed;
2683 }
2684
2685 static void
2686 zone_set_max(zone_t z, vm_size_t max)
2687 {
2688 #if KASAN_ZALLOC
2689 if (z->kasan_redzone) {
2690 /*
2691 * Adjust the max memory for the kasan redzones
2692 */
2693 max += (max / z->pcpu_elem_size) * z->kasan_redzone * 2;
2694 }
2695 #endif
2696 if (max < z->percpu ? 1 : z->alloc_pages) {
2697 max = z->percpu ? 1 : z->alloc_pages;
2698 } else {
2699 max = atop(round_page(max));
2700 }
2701 z->page_count_max = max;
2702 }
2703
2704 void
2705 zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx)
2706 {
2707 if (!zone_is_initializing(zone)) {
2708 panic("%s: called after zone_create()", __func__);
2709 }
2710 if (sub_map_idx > zone_last_submap_idx) {
2711 panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx);
2712 }
2713 zone->submap_idx = sub_map_idx;
2714 }
2715
2716 void
2717 zone_set_noexpand(
2718 zone_t zone,
2719 vm_size_t max)
2720 {
2721 if (!zone_is_initializing(zone)) {
2722 panic("%s: called after zone_create()", __func__);
2723 }
2724 zone->expandable = false;
2725 zone_set_max(zone, max);
2726 }
2727
2728 void
2729 zone_set_exhaustible(
2730 zone_t zone,
2731 vm_size_t max)
2732 {
2733 if (!zone_is_initializing(zone)) {
2734 panic("%s: called after zone_create()", __func__);
2735 }
2736 zone->expandable = false;
2737 zone->exhaustible = true;
2738 zone_set_max(zone, max);
2739 }
2740
2741 /**
2742 * @function zone_create_find
2743 *
2744 * @abstract
2745 * Finds an unused zone for the given name and element size.
2746 *
2747 * @param name the zone name
2748 * @param size the element size (including redzones, ...)
2749 * @param flags the flags passed to @c zone_create*
2750 * @param zid the desired zone ID or ZONE_ID_ANY
2751 *
2752 * @returns a zone to initialize further.
2753 */
2754 static zone_t
2755 zone_create_find(
2756 const char *name,
2757 vm_size_t size,
2758 zone_create_flags_t flags,
2759 zone_id_t zid)
2760 {
2761 zone_id_t nzones;
2762 zone_t z;
2763
2764 simple_lock(&all_zones_lock, &zone_locks_grp);
2765
2766 nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
2767 assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);
2768
2769 if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
2770 /*
2771 * The first time around, make sure the reserved zone IDs
2772 * have an initialized lock as zone_index_foreach() will
2773 * enumerate them.
2774 */
2775 while (nzones < ZONE_ID__FIRST_DYNAMIC) {
2776 zone_init_defaults(nzones++);
2777 }
2778
2779 os_atomic_store(&num_zones, nzones, release);
2780 }
2781
2782 if (zid != ZONE_ID_ANY) {
2783 if (zid >= ZONE_ID__FIRST_DYNAMIC) {
2784 panic("zone_create: invalid desired zone ID %d for %s",
2785 zid, name);
2786 }
2787 if (flags & ZC_DESTRUCTIBLE) {
2788 panic("zone_create: ID %d (%s) must be permanent", zid, name);
2789 }
2790 if (zone_array[zid].z_self) {
2791 panic("zone_create: creating zone ID %d (%s) twice", zid, name);
2792 }
2793 z = &zone_array[zid];
2794 } else {
2795 if (flags & ZC_DESTRUCTIBLE) {
2796 /*
2797 * If possible, find a previously zdestroy'ed zone in the
2798 * zone_array that we can reuse.
2799 */
2800 for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
2801 i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
2802 z = &zone_array[i];
2803
2804 /*
2805 * If the zone name and the element size are the
2806 * same, we can just reuse the old zone struct.
2807 */
2808 if (strcmp(z->z_name, name) || zone_elem_size(z) != size) {
2809 continue;
2810 }
2811 bitmap_clear(zone_destroyed_bitmap, i);
2812 z->destroyed = false;
2813 z->z_self = z;
2814 zid = (zone_id_t)i;
2815 goto out;
2816 }
2817 }
2818
2819 zid = nzones++;
2820 z = zone_init_defaults(zid);
2821
2822 /*
2823 * The release barrier pairs with the acquire in
2824 * zone_index_foreach() and makes sure that enumeration loops
2825 * always see an initialized zone lock.
2826 */
2827 os_atomic_store(&num_zones, nzones, release);
2828 }
2829
2830 out:
2831 num_zones_in_use++;
2832 simple_unlock(&all_zones_lock);
2833
2834 return z;
2835 }
2836
2837 __abortlike
2838 static void
2839 zone_create_panic(const char *name, const char *f1, const char *f2)
2840 {
2841 panic("zone_create: creating zone %s: flag %s and %s are incompatible",
2842 name, f1, f2);
2843 }
2844 #define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
2845 if ((flags) & forbidden_flag) { \
2846 zone_create_panic(name, #current_flag, #forbidden_flag); \
2847 }
2848
2849 /*
2850 * Adjusts the size of the element based on minimum size, alignment
2851 * and kasan redzones
2852 */
2853 static vm_size_t
2854 zone_elem_adjust_size(
2855 const char *name __unused,
2856 vm_size_t elem_size,
2857 zone_create_flags_t flags,
2858 vm_size_t *redzone __unused)
2859 {
2860 vm_size_t size;
2861 /*
2862 * Adjust element size for minimum size and pointer alignment
2863 */
2864 size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);
2865 if (((flags & ZC_PERCPU) == 0) && size < ZONE_MIN_ELEM_SIZE) {
2866 size = ZONE_MIN_ELEM_SIZE;
2867 }
2868
2869 #if KASAN_ZALLOC
2870 /*
2871 * Expand the zone allocation size to include the redzones.
2872 *
2873 * For page-multiple zones add a full guard page because they
2874 * likely require alignment.
2875 */
2876 vm_size_t redzone_tmp;
2877 if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
2878 redzone_tmp = 0;
2879 } else if ((size & PAGE_MASK) == 0) {
2880 if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
2881 panic("zone_create: zone %s can't provide more than PAGE_SIZE"
2882 "alignment", name);
2883 }
2884 redzone_tmp = PAGE_SIZE;
2885 } else if (flags & ZC_ALIGNMENT_REQUIRED) {
2886 redzone_tmp = 0;
2887 } else {
2888 redzone_tmp = KASAN_GUARD_SIZE;
2889 }
2890 size += redzone_tmp * 2;
2891 if (redzone) {
2892 *redzone = redzone_tmp;
2893 }
2894 #endif
2895 return size;
2896 }
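/*
 * Worked example (LP64, so sizeof(vm_offset_t) == 8): a 12-byte element is
 * rounded up to (12 + 7) & ~7 = 16 bytes, and may then be raised further to
 * ZONE_MIN_ELEM_SIZE for non-per-cpu zones before any KASAN redzone padding
 * is added.
 */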
2897
2898 /*
2899 * Returns the allocation chunk size that has the least fragmentation
2900 */
2901 static vm_size_t
2902 zone_get_min_alloc_granule(
2903 vm_size_t elem_size,
2904 zone_create_flags_t flags)
2905 {
2906 vm_size_t alloc_granule = PAGE_SIZE;
2907 if (flags & ZC_PERCPU) {
2908 alloc_granule = PAGE_SIZE * zpercpu_count();
2909 if (PAGE_SIZE % elem_size > 256) {
2910 panic("zone_create: per-cpu zone has too much fragmentation");
2911 }
2912 } else if ((elem_size & PAGE_MASK) == 0) {
2913 /* zero fragmentation by definition */
2914 alloc_granule = elem_size;
2915 } else if (alloc_granule % elem_size == 0) {
2916 /* zero fragmentation by definition */
2917 } else {
2918 vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
2919 vm_size_t alloc_tmp = PAGE_SIZE;
2920 while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) {
2921 vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
2922 if (frag_tmp < frag) {
2923 frag = frag_tmp;
2924 alloc_granule = alloc_tmp;
2925 }
2926 }
2927 }
2928 return alloc_granule;
2929 }
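/*
 * Worked examples (assuming 4K pages): a 2048-byte element divides PAGE_SIZE
 * evenly, so the granule stays at one page; an 8192-byte element is a page
 * multiple, so the granule becomes the element size itself. For a 1152-byte
 * element the loop compares waste across page multiples: one page wastes
 * 640 bytes (~15%), two pages waste only 128 bytes (~1%), so a larger chunk
 * is preferred, subject to ZONE_MAX_ALLOC_SIZE.
 */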
2930
2931 vm_size_t
2932 zone_get_foreign_alloc_size(
2933 const char *name __unused,
2934 vm_size_t elem_size,
2935 zone_create_flags_t flags,
2936 uint16_t min_pages)
2937 {
2938 vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags,
2939 NULL);
2940 vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size,
2941 flags);
2942 vm_size_t min_size = min_pages * PAGE_SIZE;
2943 /*
2944 * Round up min_size to a multiple of alloc_granule
2945 */
2946 return ((min_size + alloc_granule - 1) / alloc_granule)
2947 * alloc_granule;
2948 }
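/*
 * For example (hypothetical inputs): with a 2-page alloc granule and
 * min_pages = 3, min_size is 3 pages and gets rounded up to the next granule
 * multiple, i.e. 4 pages.
 */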
2949
2950 zone_t
2951 zone_create_ext(
2952 const char *name,
2953 vm_size_t size,
2954 zone_create_flags_t flags,
2955 zone_id_t desired_zid,
2956 void (^extra_setup)(zone_t))
2957 {
2958 vm_size_t alloc;
2959 vm_size_t redzone;
2960 zone_t z;
2961
2962 if (size > ZONE_MAX_ALLOC_SIZE) {
2963 panic("zone_create: element size too large: %zd", (size_t)size);
2964 }
2965
2966 size = zone_elem_adjust_size(name, size, flags, &redzone);
2967 /*
2968 * Allocate the zone slot, return early if we found an older match.
2969 */
2970 z = zone_create_find(name, size, flags, desired_zid);
2971 if (__improbable(z->z_self)) {
2972 /* We found a zone to reuse */
2973 return z;
2974 }
2975
2976 /*
2977 * Initialize the zone properly.
2978 */
2979
2980 /*
2981 * If the kernel is post lockdown, copy the zone name passed in.
2982 * Else simply maintain a pointer to the name string as it can only
2983 * be a core XNU zone (no unloadable kext exists before lockdown).
2984 */
2985 if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2986 size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
2987 char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
2988 strlcpy(buf, name, nsz);
2989 z->z_name = buf;
2990 } else {
2991 z->z_name = name;
2992 }
2993 /*
2994 * If zone_init() hasn't run yet, the permanent zones do not exist.
2995 * We can limp along without properly initialized stats for a while,
2996 * zone_init() will rebuild the missing stats when it runs.
2997 */
2998 if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
2999 z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
3000 }
3001
3002 alloc = zone_get_min_alloc_granule(size, flags);
3003
3004 if (flags & ZC_KALLOC_HEAP) {
3005 size_t rem = (alloc % size) / (alloc / size);
3006
3007 /*
3008 * Try to grow the element size and spread the elements more if the remaining
3009 * space is large enough.
3010 */
3011 size += rem & ~(KALLOC_MINALIGN - 1);
3012 }
3013
3014 z->pcpu_elem_size = z->z_elem_size = (uint16_t)size;
3015 z->alloc_pages = (uint16_t)atop(alloc);
3016 #if KASAN_ZALLOC
3017 z->kasan_redzone = redzone;
3018 if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
3019 z->kasan_fakestacks = true;
3020 }
3021 #endif
3022
3023 /*
3024 * Handle KPI flags
3025 */
3026 #if __LP64__
3027 if (flags & ZC_SEQUESTER) {
3028 z->va_sequester = true;
3029 }
3030 #endif
3031 /* ZC_CACHING applied after all configuration is done */
3032
3033 if (flags & ZC_PERCPU) {
3034 /*
3035 * ZC_CACHING is disallowed because it uses per-cpu zones for its
3036 * implementation and it would be circular. These allocations are
3037 * also quite expensive, so caching feels dangerous memory-wise too.
3038 *
3039 * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for
3040 * pointer-sized allocations which poisoning doesn't support.
3041 */
3042 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_CACHING);
3043 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN);
3044 z->percpu = true;
3045 z->gzalloc_exempt = true;
3046 z->zfree_clear_mem = true;
3047 z->pcpu_elem_size *= zpercpu_count();
3048 }
3049 if (flags & ZC_ZFREE_CLEARMEM) {
3050 z->zfree_clear_mem = true;
3051 }
3052 if (flags & ZC_NOGC) {
3053 z->collectable = false;
3054 }
3055 if (flags & ZC_NOENCRYPT) {
3056 z->noencrypt = true;
3057 }
3058 if (flags & ZC_ALIGNMENT_REQUIRED) {
3059 z->alignment_required = true;
3060 }
3061 if (flags & ZC_NOGZALLOC) {
3062 z->gzalloc_exempt = true;
3063 }
3064 if (flags & ZC_NOCALLOUT) {
3065 z->no_callout = true;
3066 }
3067 if (flags & ZC_DESTRUCTIBLE) {
3068 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_CACHING);
3069 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN);
3070 z->destructible = true;
3071 }
3072
3073 /*
3074 * Handle Internal flags
3075 */
3076 if (flags & ZC_ALLOW_FOREIGN) {
3077 z->allows_foreign = true;
3078 }
3079 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3080 (flags & ZC_DATA_BUFFERS)) {
3081 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3082 }
3083 if (flags & ZC_KASAN_NOQUARANTINE) {
3084 z->kasan_noquarantine = true;
3085 }
3086 /* ZC_KASAN_NOREDZONE already handled */
3087
3088 /*
3089 * Then if there's extra tuning, do it
3090 */
3091 if (extra_setup) {
3092 extra_setup(z);
3093 }
3094
3095 /*
3096 * Configure debugging features
3097 */
3098 #if CONFIG_GZALLOC
3099 gzalloc_zone_init(z); /* might set z->gzalloc_tracked */
3100 #endif
3101 #if ZONE_ENABLE_LOGGING
3102 if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) {
3103 /*
3104 * Check for and set up zone leak detection if requested via boot-args.
3105 * might set z->zone_logging
3106 */
3107 zone_setup_logging(z);
3108 }
3109 #endif /* ZONE_ENABLE_LOGGING */
3110 #if VM_MAX_TAG_ZONES
3111 if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) {
3112 static int tag_zone_index;
3113 vm_offset_t esize = zone_elem_size(z);
3114 z->tags = true;
3115 z->tags_inline = (((page_size + esize - 1) / esize) <=
3116 (sizeof(uint32_t) / sizeof(uint16_t)));
3117 z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed);
3118 assert(z->tag_zone_index < VM_MAX_TAG_ZONES);
3119 }
3120 #endif
3121
3122 /*
3123 * Finally, fixup properties based on security policies, boot-args, ...
3124 */
3125 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3126 z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) {
3127 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3128 }
3129 #if __LP64__
3130 if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) &&
3131 (flags & ZC_NOSEQUESTER) == 0 &&
3132 z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP) {
3133 z->va_sequester = true;
3134 }
3135 #endif
3136 /*
3137 * Always clear zone elements smaller than a cacheline,
3138 * because it's pretty close to free.
3139 */
3140 if (size <= zp_min_size) {
3141 z->zfree_clear_mem = true;
3142 }
3143 if (zp_factor != 0 && !z->zfree_clear_mem) {
3144 z->zp_count = zone_poison_count_init(z);
3145 }
3146
3147 #if CONFIG_ZCACHE
3148 if ((flags & ZC_NOCACHING) == 0) {
3149 /*
3150 * Append kalloc heap name to zone name (if zone is used by kalloc)
3151 */
3152 char temp_zone_name[MAX_ZONE_NAME] = "";
3153 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
3154
3155 /* Check if boot-arg specified it should have a cache */
3156 if (track_this_zone(temp_zone_name, cache_zone_name)) {
3157 flags |= ZC_CACHING;
3158 } else if (zcc_kalloc && z->kalloc_heap) {
3159 flags |= ZC_CACHING;
3160 }
3161 }
3162 if ((flags & ZC_CACHING) &&
3163 !z->tags && !z->zone_logging && !z->gzalloc_tracked) {
3164 zcache_init(z);
3165 }
3166 #endif /* CONFIG_ZCACHE */
3167
3168 lock_zone(z);
3169 z->z_self = z;
3170 unlock_zone(z);
3171
3172 return z;
3173 }
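/*
 * Illustrative call (a hypothetical client; the zone name, element type and
 * limit are made up):
 *
 *	zone_t example_zone = zone_create_ext("example.objects",
 *	    sizeof(struct example_object), ZC_ZFREE_CLEARMEM, ZONE_ID_ANY,
 *	    ^(zone_t z) {
 *		zone_set_exhaustible(z, 64 * 1024);
 *	});
 *
 * The extra_setup block runs while the zone is still initializing, which is
 * the only time tunables like zone_set_exhaustible() may be applied.
 */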
3174
3175 __startup_func
3176 void
3177 zone_create_startup(struct zone_create_startup_spec *spec)
3178 {
3179 *spec->z_var = zone_create_ext(spec->z_name, spec->z_size,
3180 spec->z_flags, spec->z_zid, spec->z_setup);
3181 }
3182
3183 /*
3184 * The first 4 fields of a zone_view and a zone alias, so that the zone_or_view_t
3185 * union works. Trust, but verify.
3186 */
3187 #define zalloc_check_zov_alias(f1, f2) \
3188 static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
3189 zalloc_check_zov_alias(z_self, zv_zone);
3190 zalloc_check_zov_alias(z_stats, zv_stats);
3191 zalloc_check_zov_alias(z_name, zv_name);
3192 zalloc_check_zov_alias(z_views, zv_next);
3193 #undef zalloc_check_zov_alias
3194
3195 __startup_func
3196 void
3197 zone_view_startup_init(struct zone_view_startup_spec *spec)
3198 {
3199 struct kalloc_heap *heap = NULL;
3200 zone_view_t zv = spec->zv_view;
3201 zone_t z;
3202
3203 switch (spec->zv_heapid) {
3204 case KHEAP_ID_DEFAULT:
3205 heap = KHEAP_DEFAULT;
3206 break;
3207 case KHEAP_ID_DATA_BUFFERS:
3208 heap = KHEAP_DATA_BUFFERS;
3209 break;
3210 case KHEAP_ID_KEXT:
3211 heap = KHEAP_KEXT;
3212 break;
3213 default:
3214 heap = NULL;
3215 }
3216
3217 if (heap) {
3218 z = kalloc_heap_zone_for_size(heap, spec->zv_size);
3219 assert(z);
3220 } else {
3221 z = spec->zv_zone;
3222 assert(spec->zv_size <= zone_elem_size(z));
3223 }
3224
3225 zv->zv_zone = z;
3226 zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
3227 zv->zv_next = z->z_views;
3228 if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) {
3229 /*
3230 * count the raw view for zones not in a heap;
3231 * kalloc_heap_init() already counts it for its members.
3232 */
3233 zone_view_count += 2;
3234 } else {
3235 zone_view_count += 1;
3236 }
3237 z->z_views = zv;
3238 }
3239
3240 zone_t
3241 zone_create(
3242 const char *name,
3243 vm_size_t size,
3244 zone_create_flags_t flags)
3245 {
3246 return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
3247 }
3248
3249 zone_t
3250 zinit(
3251 vm_size_t size, /* the size of an element */
3252 vm_size_t max, /* maximum memory to use */
3253 vm_size_t alloc __unused, /* allocation size */
3254 const char *name) /* a name for the zone */
3255 {
3256 zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE);
3257 zone_set_max(z, max);
3258 return z;
3259 }
3260
3261 void
3262 zdestroy(zone_t z)
3263 {
3264 unsigned int zindex = zone_index(z);
3265
3266 lock_zone(z);
3267
3268 if (!z->destructible || zone_caching_enabled(z) || z->allows_foreign) {
3269 panic("zdestroy: Zone %s%s isn't destructible",
3270 zone_heap_name(z), z->z_name);
3271 }
3272
3273 if (!z->z_self || z->expanding_no_vm_priv || z->expanding_vm_priv ||
3274 z->async_pending || z->waiting) {
3275 panic("zdestroy: Zone %s%s in an invalid state for destruction",
3276 zone_heap_name(z), z->z_name);
3277 }
3278
3279 #if !KASAN_ZALLOC
3280 /*
3281 * Unset the valid bit. We'll hit an assert failure on further operations
3282 * on this zone, until zinit() is called again.
3283 *
3284 * Leave the zone valid for KASan as we will see zfree's on quarantined free
3285 * elements even after the zone is destroyed.
3286 */
3287 z->z_self = NULL;
3288 #endif
3289 z->destroyed = true;
3290 unlock_zone(z);
3291
3292 /* Dump all the free elements */
3293 zone_drop_free_elements(z);
3294
3295 #if CONFIG_GZALLOC
3296 if (__improbable(z->gzalloc_tracked)) {
3297 /* If the zone is gzalloc managed dump all the elements in the free cache */
3298 gzalloc_empty_free_cache(z);
3299 }
3300 #endif
3301
3302 lock_zone(z);
3303
3304 while (!zone_pva_is_null(z->pages_sequester)) {
3305 struct zone_page_metadata *page_meta;
3306 vm_offset_t free_addr;
3307
3308 page_meta = zone_sequestered_page_get(z, &free_addr);
3309 unlock_zone(z);
3310 kmem_free(submap_for_zone(z), free_addr, ptoa(z->alloc_pages));
3311 lock_zone(z);
3312 }
3313
3314 #if !KASAN_ZALLOC
3315 /* Assert that all counts are zero */
3316 if (z->countavail || z->countfree || zone_size_wired(z) ||
3317 z->allfree_page_count || z->sequester_page_count) {
3318 panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
3319 zone_heap_name(z), z->z_name);
3320 }
3321
3322 /* consistency check: make sure everything is indeed empty */
3323 assert(zone_pva_is_null(z->pages_any_free_foreign));
3324 assert(zone_pva_is_null(z->pages_all_used_foreign));
3325 assert(zone_pva_is_null(z->pages_all_free));
3326 assert(zone_pva_is_null(z->pages_intermediate));
3327 assert(zone_pva_is_null(z->pages_all_used));
3328 assert(zone_pva_is_null(z->pages_sequester));
3329 #endif
3330
3331 unlock_zone(z);
3332
3333 simple_lock(&all_zones_lock, &zone_locks_grp);
3334
3335 assert(!bitmap_test(zone_destroyed_bitmap, zindex));
3336 /* Mark the zone as empty in the bitmap */
3337 bitmap_set(zone_destroyed_bitmap, zindex);
3338 num_zones_in_use--;
3339 assert(num_zones_in_use > 0);
3340
3341 simple_unlock(&all_zones_lock);
3342 }
3343
3344 #pragma mark zone (re)fill, jetsam
3345
3346 /*
3347 * Dealing with zone allocations from the mach VM code.
3348 *
3349 * The implementation of the mach VM itself uses the zone allocator
3350 * for things like the vm_map_entry data structure. In order to prevent
3351 * an infinite recursion problem when adding more pages to a zone, zalloc
3352 * uses a replenish thread to refill the VM layer's zones before they have
3353 * too few remaining free entries. The reserved remaining free entries
3354 * guarantee that the VM routines can get entries from already mapped pages.
3355 *
3356 * In order for that to work, the number of allocations in the nested
3357 * case has to be bounded. There are currently 2 replenish zones, and
3358 * if each needs 1 element of each zone to add a new page to itself, that
3359 * gives us a minimum reserve of 2 elements.
3360 *
3361 * There is also a deadlock issue with the zone garbage collection thread,
3362 * or any thread that is trying to free zone pages. While holding
3363 * the kernel's map lock they may need to allocate new VM map entries, hence
3364 * we need enough reserve to allow them to get past the point of holding the
3365 * map lock. After freeing that page, the GC thread will wait in drop_free_elements()
3366 * until the replenish threads can finish. Since there's only 1 GC thread at a time,
3367 * that adds a minimum of 1 to the reserve size.
3368 *
3369 * Since the minimum amount you can add to a zone is 1 page, we'll use 16K (from ARM)
3370 * as the refill size on all platforms.
3371 *
3372 * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2,
3373 * zalloc_ext() will wake the replenish thread. The replenish thread runs
3374 * until at least REFILL_SIZE worth of free elements exist, before sleeping again.
3375 * In the meantime threads may continue to use the reserve until there are only REFILL_SIZE / 4
3376 * elements left. Below that point only the replenish threads themselves and the GC
3377 * thread may continue to use from the reserve.
3378 */
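/*
 * Worked example (the element size is hypothetical): for a replenished zone
 * with 128-byte elements, prio_refill_count is ZONE_REPLENISH_TARGET / 128 =
 * 128 elements. Allocations wake the replenish thread once the free count
 * falls to about half of that (64), and only the replenish and GC threads may
 * keep drawing from the reserve below a quarter (32).
 */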
3379 static unsigned zone_replenish_loops;
3380 static unsigned zone_replenish_wakeups;
3381 static unsigned zone_replenish_wakeups_initiated;
3382 static unsigned zone_replenish_throttle_count;
3383
3384 #define ZONE_REPLENISH_TARGET (16 * 1024)
3385 static unsigned zone_replenish_active = 0; /* count of zones currently replenishing */
3386 static unsigned zone_replenish_max_threads = 0;
3387
3388 LCK_GRP_DECLARE(zone_replenish_lock_grp, "zone_replenish_lock");
3389 LCK_SPIN_DECLARE(zone_replenish_lock, &zone_replenish_lock_grp);
3390
3391 __abortlike
3392 static void
3393 zone_replenish_panic(zone_t zone, kern_return_t kr)
3394 {
3395 panic_include_zprint = TRUE;
3396 #if CONFIG_ZLEAKS
3397 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3398 panic_include_ztrace = TRUE;
3399 }
3400 #endif /* CONFIG_ZLEAKS */
3401 if (kr == KERN_NO_SPACE) {
3402 zone_t zone_largest = zone_find_largest();
3403 panic("zalloc: zone map exhausted while allocating from zone %s%s, "
3404 "likely due to memory leak in zone %s%s "
3405 "(%lu total bytes, %d elements allocated)",
3406 zone_heap_name(zone), zone->z_name,
3407 zone_heap_name(zone_largest), zone_largest->z_name,
3408 (unsigned long)zone_size_wired(zone_largest),
3409 zone_count_allocated(zone_largest));
3410 }
3411 panic("zalloc: %s%s (%d elements) retry fail %d",
3412 zone_heap_name(zone), zone->z_name,
3413 zone_count_allocated(zone), kr);
3414 }
3415
3416 static void
3417 zone_replenish_locked(zone_t z, zalloc_flags_t flags, bool asynchronously)
3418 {
3419 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3420 vm_offset_t space, alloc_size;
3421 uint32_t retry = 0;
3422 kern_return_t kr;
3423
3424 if (z->noencrypt) {
3425 kmaflags |= KMA_NOENCRYPT;
3426 }
3427 if (flags & Z_NOPAGEWAIT) {
3428 kmaflags |= KMA_NOPAGEWAIT;
3429 }
3430 if (z->permanent) {
3431 kmaflags |= KMA_PERMANENT;
3432 }
3433
3434 for (;;) {
3435 struct zone_page_metadata *page_meta = NULL;
3436
3437 /*
3438 * Try to allocate our regular chunk of pages,
3439 * unless the system is under massive pressure
3440 * and we're looking for more than 2 pages.
3441 */
3442 if (!z->percpu && z->alloc_pages > 2 && (vm_pool_low() || retry > 0)) {
3443 alloc_size = round_page(zone_elem_size(z));
3444 } else {
3445 alloc_size = ptoa(z->alloc_pages);
3446 page_meta = zone_sequestered_page_get(z, &space);
3447 }
3448
3449 unlock_zone(z);
3450
3451 #if CONFIG_ZLEAKS
3452 /*
3453 * Do the zone leak activation here because zleak_activate()
3454 * may block, and can't be done on the way out.
3455 */
3456 if (__improbable(zleak_state & ZLEAK_STATE_ENABLED)) {
3457 if (!(zleak_state & ZLEAK_STATE_ACTIVE) &&
3458 zone_submaps_approx_size() >= zleak_global_tracking_threshold) {
3459 kr = zleak_activate();
3460 if (kr != KERN_SUCCESS) {
3461 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3462 }
3463 }
3464 }
3465 #endif /* CONFIG_ZLEAKS */
3466
3467 /*
3468 * Trigger jetsams via the vm_pageout_garbage_collect thread if
3469 * we're running out of zone memory
3470 */
3471 if (is_zone_map_nearing_exhaustion()) {
3472 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3473 }
3474
3475 if (page_meta) {
3476 kr = zone_sequestered_page_populate(z, page_meta, space,
3477 alloc_size, kmaflags);
3478 } else {
3479 if (z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP && z->kalloc_heap != KHEAP_ID_NONE) {
3480 kmaflags |= KMA_KHEAP;
3481 }
3482 kr = kernel_memory_allocate(submap_for_zone(z),
3483 &space, alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3484 }
3485
3486 #if !__LP64__
3487 if (kr == KERN_NO_SPACE && z->allows_foreign) {
3488 /*
3489 * For zones allowing foreign pages, fall back to the kernel map
3490 */
3491 kr = kernel_memory_allocate(kernel_map, &space,
3492 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3493 }
3494 #endif
3495
3496 if (kr == KERN_SUCCESS) {
3497 break;
3498 }
3499
3500 if (flags & Z_NOPAGEWAIT) {
3501 lock_zone(z);
3502 return;
3503 }
3504
3505 if (asynchronously) {
3506 assert_wait_timeout(&z->prio_refill_count,
3507 THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
3508 thread_block(THREAD_CONTINUE_NULL);
3509 } else if (++retry == 3) {
3510 zone_replenish_panic(z, kr);
3511 }
3512
3513 lock_zone(z);
3514 }
3515
3516 zcram_and_lock(z, space, alloc_size);
3517
3518 #if CONFIG_ZLEAKS
3519 if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) {
3520 if (!z->zleak_on &&
3521 zone_size_wired(z) >= zleak_per_zone_tracking_threshold) {
3522 z->zleak_on = true;
3523 }
3524 }
3525 #endif /* CONFIG_ZLEAKS */
3526 }
3527
3528 /*
3529 * High priority VM privileged thread used to asynchronously refill a given zone.
3530 * These are needed for data structures used by the lower level VM itself. The
3531 * replenish thread maintains a reserve of elements, so that the VM will never
3532 * block in the zone allocator.
3533 */
3534 __dead2
3535 static void
3536 zone_replenish_thread(void *_z, wait_result_t __unused wr)
3537 {
3538 zone_t z = _z;
3539
3540 current_thread()->options |= (TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV);
3541
3542 for (;;) {
3543 lock_zone(z);
3544 assert(z->z_self == z);
3545 assert(z->zone_replenishing);
3546 assert(z->prio_refill_count != 0);
3547
3548 while (z->countfree < z->prio_refill_count) {
3549 assert(!z->expanding_no_vm_priv);
3550 assert(!z->expanding_vm_priv);
3551
3552 zone_replenish_locked(z, Z_WAITOK, true);
3553
3554 assert(z->z_self == z);
3555 zone_replenish_loops++;
3556 }
3557
3558 /* Wakeup any potentially throttled allocations. */
3559 thread_wakeup(z);
3560
3561 assert_wait(&z->prio_refill_count, THREAD_UNINT);
3562
3563 /*
3564 * We finished refilling the zone, so decrement the active count
3565 * and wake up any waiting GC threads.
3566 */
3567 lck_spin_lock(&zone_replenish_lock);
3568 assert(zone_replenish_active > 0);
3569 if (--zone_replenish_active == 0) {
3570 thread_wakeup((event_t)&zone_replenish_active);
3571 }
3572 lck_spin_unlock(&zone_replenish_lock);
3573
3574 z->zone_replenishing = false;
3575 unlock_zone(z);
3576
3577 thread_block(THREAD_CONTINUE_NULL);
3578 zone_replenish_wakeups++;
3579 }
3580 }
3581
3582 void
3583 zone_prio_refill_configure(zone_t z)
3584 {
3585 thread_t th;
3586 kern_return_t tres;
3587
3588 lock_zone(z);
3589 assert(!z->prio_refill_count && !z->destructible);
3590 z->prio_refill_count = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z));
3591 z->zone_replenishing = true;
3592 unlock_zone(z);
3593
3594 lck_spin_lock(&zone_replenish_lock);
3595 ++zone_replenish_max_threads;
3596 ++zone_replenish_active;
3597 lck_spin_unlock(&zone_replenish_lock);
3598 OSMemoryBarrier();
3599
3600 tres = kernel_thread_start_priority(zone_replenish_thread, z,
3601 MAXPRI_KERNEL, &th);
3602 if (tres != KERN_SUCCESS) {
3603 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
3604 }
3605
3606 thread_deallocate(th);
3607 }
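
/*
 * Illustrative sketch (not part of the build): how a VM-internal zone
 * might opt into asynchronous priority refill. The zone name and element
 * type below are hypothetical; real users are zones such as the VM map
 * entry zone, whose elements are needed to implement zalloc() itself.
 */
#if 0 /* example only */
struct vm_critical_elem {                       /* hypothetical element */
	uint64_t payload[8];
};

static zone_t vm_critical_zone;

static void
vm_critical_zone_startup(void)
{
	/* Create the zone, then give it a dedicated replenish thread. */
	vm_critical_zone = zone_create("vm.critical.example",
	    sizeof(struct vm_critical_elem), ZC_NOENCRYPT);
	zone_prio_refill_configure(vm_critical_zone);
}
#endif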
3608
3609 static void
3610 zone_randomize_freelist(zone_t zone, struct zone_page_metadata *meta,
3611 vm_offset_t size, zone_addr_kind_t kind, unsigned int *entropy_buffer)
3612 {
3613 const vm_size_t elem_size = zone_elem_size(zone);
3614 vm_offset_t left, right, head, base;
3615 vm_offset_t element;
3616
3617 left = ZONE_PAGE_FIRST_OFFSET(kind);
3618 right = size - ((size - left) % elem_size);
3619 head = 0;
3620 base = zone_meta_to_addr(meta, kind);
3621
3622 while (left < right) {
3623 if (zone_leaks_scan_enable || __improbable(zone->tags) ||
3624 random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
3625 element = base + left;
3626 left += elem_size;
3627 } else {
3628 right -= elem_size;
3629 element = base + right;
3630 }
3631
3632 vm_offset_t *primary = (vm_offset_t *)element;
3633 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
3634
3635 *primary = *backup = head ^ zp_nopoison_cookie;
3636 head = element;
3637 }
3638
3639 meta->zm_freelist_offs = (uint16_t)(head - base);
3640 }
3641
3642 /*
3643 * Cram the given memory into the specified zone. Update the zone page count accordingly.
3644 */
3645 static void
3646 zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size)
3647 {
3648 unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
3649 struct zone_page_metadata *meta;
3650 zone_addr_kind_t kind;
3651 uint32_t pg_count = (uint32_t)atop(size);
3652 uint32_t zindex = zone_index(zone);
3653 uint32_t free_count;
3654 uint16_t empty_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
3655
3656 /* Basic sanity checks */
3657 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
3658 assert((newmem & PAGE_MASK) == 0);
3659 assert((size & PAGE_MASK) == 0);
3660
3661 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START,
3662 zindex, size);
3663
3664 kind = zone_addr_kind(newmem, size);
3665 #if DEBUG || DEVELOPMENT
3666 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
3667 kprintf("zcram(%p[%s%s], 0x%lx%s, 0x%lx)\n", zone,
3668 zone_heap_name(zone), zone->z_name, (uintptr_t)newmem,
3669 kind == ZONE_ADDR_FOREIGN ? "[F]" : "", (uintptr_t)size);
3670 }
3671 #endif /* DEBUG || DEVELOPMENT */
3672
3673 /*
3674 * Initialize the metadata for all pages. We don't need the zone lock
3675 * here because we are not manipulating any zone-related state yet.
3676 *
3677 * This includes randomizing the freelists as the metadata isn't
3678 * published yet.
3679 */
3680
3681 if (kind == ZONE_ADDR_NATIVE) {
3682 /*
3683 * We're being called by zfill,
3684 * zone_replenish_thread or vm_page_more_fictitious,
3685 *
3686 * each of which allocates either a single page or `alloc_pages`
3687 * pages at a time.
3688 */
3689 assert(pg_count <= zone->alloc_pages);
3690
3691 /*
3692 * Make sure the range of metadata entries we're about to init
3693 * have proper physical backing, then initialize them.
3694 */
3695 meta = zone_meta_from_addr(newmem, kind);
3696 zone_meta_populate(meta, meta + pg_count);
3697
3698 if (zone->permanent) {
3699 empty_freelist_offs = 0;
3700 }
3701
3702 meta[0] = (struct zone_page_metadata){
3703 .zm_index = zindex,
3704 .zm_page_count = pg_count,
3705 .zm_percpu = zone->percpu,
3706 .zm_freelist_offs = empty_freelist_offs,
3707 };
3708
3709 for (uint32_t i = 1; i < pg_count; i++) {
3710 meta[i] = (struct zone_page_metadata){
3711 .zm_index = zindex,
3712 .zm_page_count = i,
3713 .zm_percpu = zone->percpu,
3714 .zm_secondary_page = true,
3715 .zm_freelist_offs = empty_freelist_offs,
3716 };
3717 }
3718
3719 if (!zone->permanent) {
3720 zone_randomize_freelist(zone, meta,
3721 zone->percpu ? PAGE_SIZE : size, kind, entropy_buffer);
3722 }
3723 } else {
3724 if (!zone->allows_foreign || !from_foreign_range(newmem, size)) {
3725 panic("zcram_and_lock: foreign memory [%lx] being crammed is "
3726 "outside of foreign range", (uintptr_t)newmem);
3727 }
3728
3729 /*
3730 * We cannot support elements larger than page size for foreign
3731 * memory because we put metadata on the page itself for each
3732 * page of foreign memory.
3733 *
3734 * We need to do this in order to be able to reach the metadata
3735 * when any element is freed.
3736 */
3737 assert(!zone->percpu && !zone->permanent);
3738 assert(zone_elem_size(zone) <= PAGE_SIZE - sizeof(struct zone_page_metadata));
3739
3740 bzero((void *)newmem, size);
3741
3742 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3743 meta = (struct zone_page_metadata *)(newmem + offs);
3744 *meta = (struct zone_page_metadata){
3745 .zm_index = zindex,
3746 .zm_page_count = 1,
3747 .zm_freelist_offs = empty_freelist_offs,
3748 };
3749 meta->zm_foreign_cookie[0] = ZONE_FOREIGN_COOKIE;
3750 zone_randomize_freelist(zone, meta, PAGE_SIZE, kind,
3751 entropy_buffer);
3752 }
3753 }
3754
3755 #if VM_MAX_TAG_ZONES
3756 if (__improbable(zone->tags)) {
3757 assert(kind == ZONE_ADDR_NATIVE && !zone->percpu);
3758 ztMemoryAdd(zone, newmem, size);
3759 }
3760 #endif /* VM_MAX_TAG_ZONES */
3761
3762 /*
3763 * Insert the initialized pages / metadatas into the right lists.
3764 */
3765
3766 lock_zone(zone);
3767 assert(zone->z_self == zone);
3768
3769 zone->page_count += pg_count;
3770 if (zone->page_count_hwm < zone->page_count) {
3771 zone->page_count_hwm = zone->page_count;
3772 }
3773 os_atomic_add(&zones_phys_page_count, pg_count, relaxed);
3774
3775 if (kind == ZONE_ADDR_NATIVE) {
3776 os_atomic_add(&zones_phys_page_mapped_count, pg_count, relaxed);
3777 if (zone->permanent) {
3778 zone_meta_queue_push(zone, &zone->pages_intermediate, meta, kind);
3779 } else {
3780 zone_meta_queue_push(zone, &zone->pages_all_free, meta, kind);
3781 zone->allfree_page_count += meta->zm_page_count;
3782 }
3783 free_count = zone_elem_count(zone, size, kind);
3784 zone->countfree += free_count;
3785 zone->countavail += free_count;
3786 } else {
3787 free_count = zone_elem_count(zone, PAGE_SIZE, kind);
3788 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3789 meta = (struct zone_page_metadata *)(newmem + offs);
3790 zone_meta_queue_push(zone, &zone->pages_any_free_foreign, meta, kind);
3791 zone->countfree += free_count;
3792 zone->countavail += free_count;
3793 }
3794 }
3795
3796 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zindex);
3797 }
3798
3799 void
3800 zcram(zone_t zone, vm_offset_t newmem, vm_size_t size)
3801 {
3802 zcram_and_lock(zone, newmem, size);
3803 unlock_zone(zone);
3804 }
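
/*
 * Illustrative sketch (not part of the build): zfill() below demonstrates
 * the native path; this shows the foreign path, where early-boot code
 * crams page-aligned memory carved out of the range registered by
 * zone_foreign_mem_init() into a zone that allows foreign memory.
 * The zone and the base address are hypothetical.
 */
#if 0 /* example only */
static void
example_seed_foreign_zone(zone_t z, vm_offset_t foreign_base)
{
	/* Must be page aligned and lie inside the registered foreign range. */
	zcram(z, foreign_base, PAGE_SIZE);
}
#endif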
3805
3806 /*
3807 * Fill a zone with enough memory to contain at least nelem elements.
3808 * Return the number of elements actually put into the zone, which may
3809 * be more than the caller asked for since the memory allocation is
3810 * rounded up to the next zone allocation size.
3811 */
3812 int
3813 zfill(
3814 zone_t zone,
3815 int nelem)
3816 {
3817 kern_return_t kr;
3818 vm_offset_t memory;
3819
3820 vm_size_t alloc_size = ptoa(zone->alloc_pages);
3821 vm_size_t nalloc_inc = zone_elem_count(zone, alloc_size, ZONE_ADDR_NATIVE);
3822 vm_size_t nalloc = 0, goal = MAX(0, nelem);
3823 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3824
3825 if (zone->noencrypt) {
3826 kmaflags |= KMA_NOENCRYPT;
3827 }
3828
3829 assert(!zone->allows_foreign && !zone->permanent);
3830
3831 /*
3832 * Trigger jetsams via the vm_pageout_garbage_collect thread if we're
3833 * running out of zone memory
3834 */
3835 if (is_zone_map_nearing_exhaustion()) {
3836 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3837 }
3838
3839 if (zone->va_sequester) {
3840 lock_zone(zone);
3841
3842 do {
3843 struct zone_page_metadata *page_meta;
3844 page_meta = zone_sequestered_page_get(zone, &memory);
3845 if (NULL == page_meta) {
3846 break;
3847 }
3848 unlock_zone(zone);
3849
3850 kr = zone_sequestered_page_populate(zone, page_meta,
3851 memory, alloc_size, kmaflags);
3852 if (KERN_SUCCESS != kr) {
3853 goto out_nolock;
3854 }
3855
3856 zcram_and_lock(zone, memory, alloc_size);
3857 nalloc += nalloc_inc;
3858 } while (nalloc < goal);
3859
3860 unlock_zone(zone);
3861 }
3862
3863 out_nolock:
3864 while (nalloc < goal) {
3865 kr = kernel_memory_allocate(submap_for_zone(zone), &memory,
3866 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3867 if (kr != KERN_SUCCESS) {
3868 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
3869 __func__, (unsigned long)alloc_size);
3870 break;
3871 }
3872
3873 zcram(zone, memory, alloc_size);
3874 nalloc += nalloc_inc;
3875 }
3876
3877 return (int)nalloc;
3878 }
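
/*
 * Illustrative sketch (not part of the build): pre-filling a zone at
 * subsystem initialization so that early allocations never have to call
 * into the VM. The zone and count are hypothetical; note that zfill()
 * rounds the request up to whole allocation chunks and returns the
 * number of elements actually added.
 */
#if 0 /* example only */
static void
example_prefill(zone_t z)
{
	int added = zfill(z, 128);

	if (added < 128) {
		printf("example: only prefilled %d of 128 elements\n", added);
	}
}
#endif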
3879
3880 /*
3881 * We're being very conservative here and picking a value of 95%. We might need to lower this if
3882 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
3883 */
3884 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
3885
3886 /*
3887 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
3888 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
3889 */
3890 TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
3891 ZONE_MAP_JETSAM_LIMIT_DEFAULT);
3892
3893 void
3894 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
3895 {
3896 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3897 *current_size = ptoa_64(phys_pages);
3898 *capacity = zone_phys_mapped_max;
3899 }
3900
3901 void
3902 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
3903 {
3904 zone_t largest_zone = zone_find_largest();
3905
3906 /*
3907 * Append kalloc heap name to zone name (if zone is used by kalloc)
3908 */
3909 snprintf(zone_name, zone_name_len, "%s%s",
3910 zone_heap_name(largest_zone), largest_zone->z_name);
3911
3912 *zone_size = zone_size_wired(largest_zone);
3913 }
3914
3915 boolean_t
3916 is_zone_map_nearing_exhaustion(void)
3917 {
3918 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3919 return ptoa_64(phys_pages) > (zone_phys_mapped_max * zone_map_jetsam_limit) / 100;
3920 }
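
/*
 * Worked example (not part of the build) of the check above, with made-up
 * numbers: with a 2 GiB physical budget and the default 95% limit,
 * jetsams are triggered once more than roughly 1.9 GiB of zone memory is
 * mapped.
 */
#if 0 /* example only */
static bool
example_exhaustion_check(void)
{
	uint64_t budget_bytes  = 2ULL << 30;     /* zone_phys_mapped_max  */
	uint64_t mapped_bytes  = 1950ULL << 20;  /* ~1.90 GiB mapped      */
	unsigned limit_percent = 95;             /* zone_map_jetsam_limit */

	/* Same comparison as is_zone_map_nearing_exhaustion(): true here. */
	return mapped_bytes > (budget_bytes * limit_percent) / 100;
}
#endif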
3921
3922
3923 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
3924
3925 /*
3926 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
3927 * to walk through the jetsam priority bands and kill processes.
3928 */
3929 static void
3930 kill_process_in_largest_zone(void)
3931 {
3932 pid_t pid = -1;
3933 zone_t largest_zone = zone_find_largest();
3934
3935 printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, map size %lld, capacity %lld [jetsam limit %d%%]\n",
3936 ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), ptoa_64(zone_phys_mapped_max),
3937 ptoa_64(os_atomic_load(&zones_phys_page_count, relaxed)),
3938 (uint64_t)zone_submaps_approx_size(),
3939 (uint64_t)zone_range_size(&zone_info.zi_map_range),
3940 zone_map_jetsam_limit);
3941 printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
3942 largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone));
3943
3944 /*
3945 * We want to make sure we don't call this function from userspace;
3946 * otherwise we could end up trying to synchronously kill the process
3947 * whose context we're in, causing the system to hang.
3948 */
3949 assert(current_task() == kernel_task);
3950
3951 /*
3952 * If vm_object_zone is the largest, check to see if the number of
3953 * elements in vm_map_entry_zone is comparable.
3954 *
3955 * If so, consider vm_map_entry_zone as the largest. This lets us target
3956 * a specific process to jetsam to quickly recover from the zone map
3957 * bloat.
3958 */
3959 if (largest_zone == vm_object_zone) {
3960 unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
3961 unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
3962 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3963 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
3964 largest_zone = vm_map_entry_zone;
3965 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
3966 (uintptr_t)zone_size_wired(largest_zone));
3967 }
3968 }
3969
3970 /* TODO: Extend this to check for the largest process in other zones as well. */
3971 if (largest_zone == vm_map_entry_zone) {
3972 pid = find_largest_process_vm_map_entries();
3973 } else {
3974 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
3975 "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
3976 largest_zone->z_name);
3977 }
3978 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
3979 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
3980 }
3981 }
3982
3983 #pragma mark zalloc module init
3984
3985 /*
3986 * Initialize the "zone of zones" which uses fixed memory allocated
3987 * earlier in memory initialization. zone_bootstrap is called
3988 * before zone_init.
3989 */
3990 __startup_func
3991 void
3992 zone_bootstrap(void)
3993 {
3994 /* Validate struct zone_page_metadata expectations */
3995 if ((1U << ZONE_PAGECOUNT_BITS) <
3996 atop(ZONE_MAX_ALLOC_SIZE) * sizeof(struct zone_page_metadata)) {
3997 panic("ZONE_PAGECOUNT_BITS is not large enough to hold page counts");
3998 }
3999
4000 /* Validate struct zone_packed_virtual_address expectations */
4001 static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
4002 if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
4003 panic("zone_pva_t can't pack a kernel page address in 31 bits");
4004 }
4005
4006 zpercpu_early_count = ml_early_cpu_max_number() + 1;
4007
4008 /* Set up zone element poisoning */
4009 zp_bootstrap();
4010
4011 random_bool_init(&zone_bool_gen);
4012
4013 /*
4014 * The KASAN quarantine for kalloc doesn't understand heaps
4015 * and trips the heap-confusion panics. At the end of the day,
4016 * these security measures duplicate work KASAN already does.
4017 *
4018 * On 32-bit kernels, these protections are simply too expensive.
4019 */
4020 #if !defined(__LP64__) || KASAN_ZALLOC
4021 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER;
4022 zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA;
4023 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC;
4024 #endif
4025
4026 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
4027
4028 #if CONFIG_ZCACHE
4029 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
4030 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
4031 printf("zcache: caching enabled for zone %s\n", cache_zone_name);
4032 }
4033 #endif /* CONFIG_ZCACHE */
4034 }
4035
4036 #if __LP64__
4037 #if CONFIG_EMBEDDED
4038 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024)
4039 #else
4040 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024)
4041 #endif
4042 #endif /* __LP64__ */
4043
4044 #define SINGLE_GUARD 16384
4045 #define MULTI_GUARD (3 * SINGLE_GUARD)
4046
4047 #if __LP64__
4048 static inline vm_offset_t
4049 zone_restricted_va_max(void)
4050 {
4051 vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
4052 vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
4053
4054 return trunc_page(MIN(compressor_max, vm_page_max));
4055 }
4056 #endif
4057
4058 __startup_func
4059 static void
4060 zone_tunables_fixup(void)
4061 {
4062 if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
4063 zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
4064 }
4065 }
4066 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
4067
4068 __startup_func
4069 static vm_size_t
4070 zone_phys_size_max(void)
4071 {
4072 mach_vm_size_t zsize;
4073 vm_size_t zsizearg;
4074
4075 if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
4076 zsize = zsizearg * (1024ULL * 1024);
4077 } else {
4078 zsize = sane_size >> 2; /* Set target zone size as 1/4 of physical memory */
4079 #if defined(__LP64__)
4080 zsize += zsize >> 1;
4081 #endif /* __LP64__ */
4082 }
4083
4084 if (zsize < CONFIG_ZONE_MAP_MIN) {
4085 zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */
4086 }
4087 if (zsize > sane_size >> 1) {
4088 zsize = sane_size >> 1; /* Clamp to half of RAM max */
4089 }
4090 if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
4091 /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
4092 vm_size_t orig_zsize = zsize;
4093 zsize = ZONE_MAP_MAX;
4094 printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
4095 (uintptr_t)orig_zsize, (uintptr_t)zsize);
4096 }
4097
4098 assert((vm_size_t) zsize == zsize);
4099 return (vm_size_t)trunc_page(zsize);
4100 }
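
/*
 * Worked example (not part of the build) of the sizing policy above, with
 * made-up numbers and the CONFIG_ZONE_MAP_MIN / ZONE_MAP_MAX clamps
 * omitted: on an LP64 machine with 8 GiB of usable memory and no "zsize"
 * boot-arg, the target is 1/4 of RAM plus half of that again, i.e.
 * 2 GiB + 1 GiB = 3 GiB, which is below the half-of-RAM cap of 4 GiB.
 */
#if 0 /* example only */
static uint64_t
example_zone_phys_budget(uint64_t ram_bytes)
{
	uint64_t zsize = ram_bytes >> 2;        /* 1/4 of physical memory */

	zsize += zsize >> 1;                    /* +50% on LP64 */
	if (zsize > ram_bytes >> 1) {
		zsize = ram_bytes >> 1;         /* never more than half of RAM */
	}
	return zsize;
}
#endif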
4101
4102 __startup_func
4103 static struct zone_map_range
4104 zone_init_allocate_va(vm_offset_t *submap_min, vm_size_t size, bool guard)
4105 {
4106 struct zone_map_range r;
4107 kern_return_t kr;
4108
4109 if (guard) {
4110 vm_map_offset_t addr = *submap_min;
4111 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4112
4113 vmk_flags.vmkf_permanent = TRUE;
4114 kr = vm_map_enter(kernel_map, &addr, size, 0,
4115 VM_FLAGS_FIXED, vmk_flags, VM_KERN_MEMORY_ZONE, kernel_object,
4116 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
4117 *submap_min = (vm_offset_t)addr;
4118 } else {
4119 kr = kernel_memory_allocate(kernel_map, submap_min, size,
4120 0, KMA_KOBJECT | KMA_PAGEABLE | KMA_VAONLY, VM_KERN_MEMORY_ZONE);
4121 }
4122 if (kr != KERN_SUCCESS) {
4123 panic("zone_init_allocate_va(0x%lx:0x%zx) failed: %d",
4124 (uintptr_t)*submap_min, (size_t)size, kr);
4125 }
4126
4127 r.min_address = *submap_min;
4128 *submap_min += size;
4129 r.max_address = *submap_min;
4130
4131 return r;
4132 }
4133
4134 __startup_func
4135 static void
4136 zone_submap_init(
4137 vm_offset_t *submap_min,
4138 unsigned idx,
4139 uint64_t zone_sub_map_numer,
4140 uint64_t *remaining_denom,
4141 vm_offset_t *remaining_size,
4142 vm_size_t guard_size)
4143 {
4144 vm_offset_t submap_start, submap_end;
4145 vm_size_t submap_size;
4146 vm_map_t submap;
4147 kern_return_t kr;
4148
4149 submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
4150 *remaining_denom);
4151 submap_start = *submap_min;
4152 submap_end = submap_start + submap_size;
4153
4154 #if defined(__LP64__)
4155 if (idx == Z_SUBMAP_IDX_VA_RESTRICTED_MAP) {
4156 vm_offset_t restricted_va_max = zone_restricted_va_max();
4157 if (submap_end > restricted_va_max) {
4158 #if DEBUG || DEVELOPMENT
4159 printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx,
4160 (size_t)(restricted_va_max - submap_start) >> 20,
4161 (size_t)submap_size >> 20);
4162 #endif /* DEBUG || DEVELOPMENT */
4163 guard_size += submap_end - restricted_va_max;
4164 *remaining_size -= submap_end - restricted_va_max;
4165 submap_end = restricted_va_max;
4166 submap_size = restricted_va_max - submap_start;
4167 }
4168
4169 vm_packing_verify_range("vm_compressor",
4170 submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
4171 vm_packing_verify_range("vm_page",
4172 submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
4173 }
4174 #endif /* defined(__LP64__) */
4175
4176 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4177 vmk_flags.vmkf_permanent = TRUE;
4178 kr = kmem_suballoc(kernel_map, submap_min, submap_size,
4179 FALSE, VM_FLAGS_FIXED, vmk_flags,
4180 VM_KERN_MEMORY_ZONE, &submap);
4181 if (kr != KERN_SUCCESS) {
4182 panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d",
4183 idx, (void *)submap_start, (void *)submap_end, kr);
4184 }
4185
4186 #if DEBUG || DEVELOPMENT
4187 printf("zone_init: submap[%d] %p:%p (%zuM)\n",
4188 idx, (void *)submap_start, (void *)submap_end,
4189 (size_t)submap_size >> 20);
4190 #endif /* DEBUG || DEVELOPMENT */
4191
4192 zone_submaps[idx] = submap;
4193 *submap_min = submap_end;
4194 *remaining_size -= submap_size;
4195 *remaining_denom -= zone_sub_map_numer;
4196
4197 zone_init_allocate_va(submap_min, guard_size, true);
4198 }
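
/*
 * Worked example (not part of the build) of the proportional carving done
 * above, using the default 20/40/40 weights and a 128 GiB virtual budget:
 * the VA-restricted submap gets 20/100 of the budget (25.6 GiB), the
 * general submap then gets 40/80 of what remains (51.2 GiB), and the
 * bag-of-bytes submap gets the rest (51.2 GiB). Each step re-derives its
 * share from the remaining size and the remaining denominator.
 */
#if 0 /* example only */
static void
example_submap_split(void)
{
	uint64_t remaining_size  = 128ULL << 30;
	uint64_t remaining_denom = 20 + 40 + 40;
	const uint64_t weights[] = { 20, 40, 40 };

	for (unsigned i = 0; i < 3; i++) {
		uint64_t share = weights[i] * remaining_size / remaining_denom;

		remaining_size  -= share;
		remaining_denom -= weights[i];
	}
}
#endif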
4199
4200 /*
4201 * Global initialization of the zone allocator. Runs after zone_bootstrap.
4202 */
4203 __startup_func
4204 static void
4205 zone_init(void)
4206 {
4207 vm_size_t zone_meta_size;
4208 vm_size_t zone_map_size;
4209 vm_size_t remaining_size;
4210 vm_offset_t submap_min = 0;
4211
4212 if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) {
4213 zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
4214 } else {
4215 zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
4216 }
4217 zone_phys_mapped_max = zone_phys_size_max();
4218
4219 #if __LP64__
4220 zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64;
4221 #else
4222 zone_map_size = zone_phys_mapped_max;
4223 #endif
4224 zone_meta_size = round_page(atop(zone_map_size) *
4225 sizeof(struct zone_page_metadata));
4226
4227 /*
4228 * Zone "map" setup:
4229 *
4230 * [ VA_RESTRICTED ] <-- LP64 only
4231 * [ SINGLE_GUARD ] <-- LP64 only
4232 * [ meta ]
4233 * [ SINGLE_GUARD ]
4234 * [ map<i> ] \ for each extra map
4235 * [ MULTI_GUARD ] /
4236 */
4237 remaining_size = zone_map_size;
4238 #if defined(__LP64__)
4239 remaining_size -= SINGLE_GUARD;
4240 #endif
4241 remaining_size -= zone_meta_size + SINGLE_GUARD;
4242 remaining_size -= MULTI_GUARD * (zone_last_submap_idx -
4243 Z_SUBMAP_IDX_GENERAL_MAP + 1);
4244
4245 #if VM_MAX_TAG_ZONES
4246 if (zone_tagging_on) {
4247 zone_tagging_init(zone_map_size);
4248 }
4249 #endif
4250
4251 uint64_t remaining_denom = 0;
4252 uint64_t zone_sub_map_numer[Z_SUBMAP_IDX_COUNT] = {
4253 #ifdef __LP64__
4254 [Z_SUBMAP_IDX_VA_RESTRICTED_MAP] = 20,
4255 #endif /* defined(__LP64__) */
4256 [Z_SUBMAP_IDX_GENERAL_MAP] = 40,
4257 [Z_SUBMAP_IDX_BAG_OF_BYTES_MAP] = 40,
4258 };
4259
4260 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
4261 #if DEBUG || DEVELOPMENT
4262 char submap_name[MAX_SUBMAP_NAME];
4263 snprintf(submap_name, MAX_SUBMAP_NAME, "submap%d", idx);
4264 PE_parse_boot_argn(submap_name, &zone_sub_map_numer[idx], sizeof(uint64_t));
4265 #endif
4266 remaining_denom += zone_sub_map_numer[idx];
4267 }
4268
4269 /*
4270 * And now allocate the various pieces of VA and submaps.
4271 *
4272 * Make a first allocation of contiguous VA that we'll deallocate,
4273 * then carve out memory in that range again linearly.
4274 * The kernel is still single-threaded at this stage.
4275 */
4276
4277 struct zone_map_range *map_range = &zone_info.zi_map_range;
4278
4279 *map_range = zone_init_allocate_va(&submap_min, zone_map_size, false);
4280 submap_min = map_range->min_address;
4281 kmem_free(kernel_map, submap_min, zone_map_size);
4282
4283 #if defined(__LP64__)
4284 /*
4285 * Allocate `Z_SUBMAP_IDX_VA_RESTRICTED_MAP` first because its VA range
4286 * can't go beyond RESTRICTED_VA_MAX for the vm_page_t packing to work.
4287 */
4288 zone_submap_init(&submap_min, Z_SUBMAP_IDX_VA_RESTRICTED_MAP,
4289 zone_sub_map_numer[Z_SUBMAP_IDX_VA_RESTRICTED_MAP], &remaining_denom,
4290 &remaining_size, SINGLE_GUARD);
4291 #endif /* defined(__LP64__) */
4292
4293 /*
4294 * Allocate metadata array
4295 */
4296 zone_info.zi_meta_range =
4297 zone_init_allocate_va(&submap_min, zone_meta_size, true);
4298 zone_init_allocate_va(&submap_min, SINGLE_GUARD, true);
4299
4300 zone_info.zi_array_base =
4301 (struct zone_page_metadata *)zone_info.zi_meta_range.min_address -
4302 zone_pva_from_addr(map_range->min_address).packed_address;
4303
4304 /*
4305 * Allocate other submaps
4306 */
4307 for (unsigned idx = Z_SUBMAP_IDX_GENERAL_MAP; idx <= zone_last_submap_idx; idx++) {
4308 zone_submap_init(&submap_min, idx, zone_sub_map_numer[idx],
4309 &remaining_denom, &remaining_size, MULTI_GUARD);
4310 }
4311
4312 vm_map_t general_map = zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP];
4313 zone_info.zi_general_range.min_address = vm_map_min(general_map);
4314 zone_info.zi_general_range.max_address = vm_map_max(general_map);
4315
4316 assert(submap_min == map_range->max_address);
4317
4318 #if CONFIG_GZALLOC
4319 gzalloc_init(zone_map_size);
4320 #endif
4321
4322 zone_create_flags_t kma_flags = ZC_NOCACHING |
4323 ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT |
4324 ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
4325
4326 (void)zone_create_ext("vm.permanent", 1, kma_flags,
4327 ZONE_ID_PERMANENT, ^(zone_t z){
4328 z->permanent = true;
4329 z->z_elem_size = 1;
4330 z->pcpu_elem_size = 1;
4331 #if defined(__LP64__)
4332 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4333 #endif
4334 });
4335 (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU,
4336 ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){
4337 z->permanent = true;
4338 z->z_elem_size = 1;
4339 z->pcpu_elem_size = zpercpu_count();
4340 #if defined(__LP64__)
4341 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4342 #endif
4343 });
4344
4345 /*
4346 * Now fix the zones that are missing their zone stats.
4347 * We don't really know if zfree()s happened, so our stats
4348 * are slightly off for early boot. ¯\_(ツ)_/¯
4349 */
4350 zone_index_foreach(idx) {
4351 zone_t tz = &zone_array[idx];
4352
4353 if (tz->z_self) {
4354 zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
4355
4356 zpercpu_get_cpu(zs, 0)->zs_mem_allocated +=
4357 (tz->countavail - tz->countfree) *
4358 zone_elem_size(tz);
4359 assert(tz->z_stats == NULL);
4360 tz->z_stats = zs;
4361 #if ZONE_ENABLE_LOGGING
4362 if (tz->zone_logging && !tz->zlog_btlog) {
4363 zone_enable_logging(tz);
4364 }
4365 #endif
4366 }
4367 }
4368
4369 #if CONFIG_ZLEAKS
4370 /*
4371 * Initialize the zone leak monitor
4372 */
4373 zleak_init(zone_map_size);
4374 #endif /* CONFIG_ZLEAKS */
4375
4376 #if VM_MAX_TAG_ZONES
4377 if (zone_tagging_on) {
4378 vm_allocation_zones_init();
4379 }
4380 #endif
4381 }
4382 STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
4383
4384 __startup_func
4385 static void
4386 zone_set_foreign_range(
4387 vm_offset_t range_min,
4388 vm_offset_t range_max)
4389 {
4390 zone_info.zi_foreign_range.min_address = range_min;
4391 zone_info.zi_foreign_range.max_address = range_max;
4392 }
4393
4394 __startup_func
4395 vm_offset_t
4396 zone_foreign_mem_init(vm_size_t size)
4397 {
4398 vm_offset_t mem = (vm_offset_t) pmap_steal_memory(size);
4399 zone_set_foreign_range(mem, mem + size);
4400 return mem;
4401 }
4402
4403 #pragma mark zalloc
4404
4405 #if KASAN_ZALLOC
4406 /*
4407 * Called from zfree() to add the element being freed to the KASan quarantine.
4408 *
4409 * Returns true if the newly-freed element made it into the quarantine without
4410 * displacing another, false otherwise. In the latter case, addrp points to the
4411 * address of the displaced element, which will be freed by the zone.
4412 */
4413 static bool
4414 kasan_quarantine_freed_element(
4415 zone_t *zonep, /* the zone the element is being freed to */
4416 void **addrp) /* address of the element being freed */
4417 {
4418 zone_t zone = *zonep;
4419 void *addr = *addrp;
4420
4421 /*
4422 * Resize back to the real allocation size and hand off to the KASan
4423 * quarantine. `addr` may then point to a different allocation, if the
4424 * current element replaced another in the quarantine. The zone then
4425 * takes ownership of the swapped out free element.
4426 */
4427 vm_size_t usersz = zone_elem_size(zone) - 2 * zone->kasan_redzone;
4428 vm_size_t sz = usersz;
4429
4430 if (addr && zone->kasan_redzone) {
4431 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
4432 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
4433 assert(sz == zone_elem_size(zone));
4434 }
4435 if (addr && !zone->kasan_noquarantine) {
4436 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
4437 if (!addr) {
4438 return TRUE;
4439 }
4440 }
4441 if (addr && zone->kasan_noquarantine) {
4442 kasan_unpoison(addr, zone_elem_size(zone));
4443 }
4444 *addrp = addr;
4445 return FALSE;
4446 }
4447
4448 #endif /* KASAN_ZALLOC */
4449
4450 static inline bool
4451 zone_needs_async_refill(zone_t zone)
4452 {
4453 if (zone->countfree != 0 || zone->async_pending || zone->no_callout) {
4454 return false;
4455 }
4456
4457 return zone->expandable || zone->page_count < zone->page_count_max;
4458 }
4459
4460 __attribute__((noinline))
4461 static void
4462 zone_refill_synchronously_locked(
4463 zone_t zone,
4464 zalloc_flags_t flags)
4465 {
4466 thread_t thr = current_thread();
4467 bool set_expanding_vm_priv = false;
4468 zone_pva_t orig = zone->pages_intermediate;
4469
4470 while ((flags & Z_NOWAIT) == 0 && (zone->permanent
4471 ? zone_pva_is_equal(zone->pages_intermediate, orig)
4472 : zone->countfree == 0)) {
4473 /*
4474 * zone is empty, try to expand it
4475 *
4476 * Note that we now allow up to 2 threads (1 vm_privileged and
4477 * 1 non-vm_privileged) to expand the zone concurrently...
4478 *
4479 * This is necessary to keep vm_privileged threads
4480 * running critical code needed to continue
4481 * compressing/swapping pages (i.e. making new free pages) from
4482 * stalling behind non-vm_privileged threads waiting to acquire
4483 * free pages when the vm_page_free_count is below the
4484 * vm_page_free_reserved limit.
4485 */
4486 if ((zone->expanding_no_vm_priv || zone->expanding_vm_priv) &&
4487 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->expanding_vm_priv)) {
4488 /*
4489 * This is a non-vm_privileged thread and a non-vm_privileged or
4490 * a vm_privileged thread is already expanding the zone...
4491 * OR
4492 * this is a vm_privileged thread and a vm_privileged thread is
4493 * already expanding the zone...
4494 *
4495 * In either case wait for a thread to finish, then try again.
4496 */
4497 zone->waiting = true;
4498 assert_wait(zone, THREAD_UNINT);
4499 unlock_zone(zone);
4500 thread_block(THREAD_CONTINUE_NULL);
4501 lock_zone(zone);
4502 continue;
4503 }
4504
4505 if (zone->page_count >= zone->page_count_max) {
4506 if (zone->exhaustible) {
4507 break;
4508 }
4509 if (zone->expandable) {
4510 /*
4511 * If we're expandable, just don't go through this again.
4512 */
4513 zone->page_count_max = ~0u;
4514 } else {
4515 unlock_zone(zone);
4516
4517 panic_include_zprint = true;
4518 #if CONFIG_ZLEAKS
4519 if (zleak_state & ZLEAK_STATE_ACTIVE) {
4520 panic_include_ztrace = true;
4521 }
4522 #endif /* CONFIG_ZLEAKS */
4523 panic("zalloc: zone \"%s\" empty.", zone->z_name);
4524 }
4525 }
4526
4527 /*
4528 * It is possible that a BG thread is refilling/expanding the zone
4529 * and gets pre-empted during that operation. That blocks all other
4530 * threads from making progress, leading to a watchdog timeout. To
4531 * avoid that, boost the thread priority using the rwlock boost.
4532 */
4533 set_thread_rwlock_boost();
4534
4535 if ((thr->options & TH_OPT_VMPRIV)) {
4536 zone->expanding_vm_priv = true;
4537 set_expanding_vm_priv = true;
4538 } else {
4539 zone->expanding_no_vm_priv = true;
4540 }
4541
4542 zone_replenish_locked(zone, flags, false);
4543
4544 if (set_expanding_vm_priv == true) {
4545 zone->expanding_vm_priv = false;
4546 } else {
4547 zone->expanding_no_vm_priv = false;
4548 }
4549
4550 if (zone->waiting) {
4551 zone->waiting = false;
4552 thread_wakeup(zone);
4553 }
4554 clear_thread_rwlock_boost();
4555
4556 if (zone->countfree == 0) {
4557 assert(flags & Z_NOPAGEWAIT);
4558 break;
4559 }
4560 }
4561
4562 if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) &&
4563 zone_needs_async_refill(zone) && !vm_pool_low()) {
4564 zone->async_pending = true;
4565 unlock_zone(zone);
4566 thread_call_enter(&call_async_alloc);
4567 lock_zone(zone);
4568 assert(zone->z_self == zone);
4569 }
4570 }
4571
4572 __attribute__((noinline))
4573 static void
4574 zone_refill_asynchronously_locked(zone_t zone)
4575 {
4576 uint32_t min_free = zone->prio_refill_count / 2;
4577 uint32_t resv_free = zone->prio_refill_count / 4;
4578 thread_t thr = current_thread();
4579
4580 /*
4581 * Nothing to do if there are plenty of elements.
4582 */
4583 while (zone->countfree <= min_free) {
4584 /*
4585 * Wakeup the replenish thread if not running.
4586 */
4587 if (!zone->zone_replenishing) {
4588 lck_spin_lock(&zone_replenish_lock);
4589 assert(zone_replenish_active < zone_replenish_max_threads);
4590 ++zone_replenish_active;
4591 lck_spin_unlock(&zone_replenish_lock);
4592 zone->zone_replenishing = true;
4593 zone_replenish_wakeups_initiated++;
4594 thread_wakeup(&zone->prio_refill_count);
4595 }
4596
4597 /*
4598 * We'll let VM_PRIV threads continue to allocate until the
4599 * reserve drops to 25%. After that only TH_OPT_ZONE_PRIV threads
4600 * may continue.
4601 *
4602 * TH_OPT_ZONE_PRIV threads are the GC thread and the replenish threads themselves.
4603 * Replenish threads *need* to use the reserve. GC threads need to
4604 * get through the current allocation, but then will wait at a higher
4605 * level after they've dropped any locks which would deadlock the
4606 * replenish thread.
4607 */
4608 if ((zone->countfree > resv_free && (thr->options & TH_OPT_VMPRIV)) ||
4609 (thr->options & TH_OPT_ZONE_PRIV)) {
4610 break;
4611 }
4612
4613 /*
4614 * Wait for the replenish threads to add more elements for us to allocate from.
4615 */
4616 zone_replenish_throttle_count++;
4617 unlock_zone(zone);
4618 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
4619 thread_block(THREAD_CONTINUE_NULL);
4620 lock_zone(zone);
4621
4622 assert(zone->z_self == zone);
4623 }
4624
4625 /*
4626 * If we're here because of zone_gc(), we didn't wait for
4627 * zone_replenish_thread to finish. So we need to ensure that
4628 * we will successfully grab an element.
4629 *
4630 * This path only applies to zones that have a replenish thread configured;
4631 * the value of (refill_level / 2) in the previous bit of code should have
4632 * given us headroom even though this thread didn't wait.
4633 */
4634 if (thr->options & TH_OPT_ZONE_PRIV) {
4635 assert(zone->countfree != 0);
4636 }
4637 }
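
/*
 * Worked example (not part of the build) of the thresholds above, with a
 * made-up reserve: if prio_refill_count is 400 elements, the replenish
 * thread is woken once countfree drops to 200 (min_free = 400 / 2),
 * VM_PRIV threads keep allocating down to 100 elements (resv_free =
 * 400 / 4), and below that only TH_OPT_ZONE_PRIV threads (the GC and
 * replenish threads themselves) may dip into the reserve while everyone
 * else throttles on the 1 ms wait above.
 */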
4638
4639 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
4640 __attribute__((noinline))
4641 static void
4642 zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr)
4643 {
4644 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
4645 unsigned int numsaved = 0;
4646
4647 #if ZONE_ENABLE_LOGGING
4648 if (DO_LOGGING(zone)) {
4649 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4650 __builtin_frame_address(0), NULL);
4651 btlog_add_entry(zone->zlog_btlog, (void *)addr,
4652 ZOP_ALLOC, (void **)zbt, numsaved);
4653 }
4654 #endif
4655
4656 #if CONFIG_ZLEAKS
4657 /*
4658 * Zone leak detection: capture a backtrace every zleak_sample_factor
4659 * allocations in this zone.
4660 */
4661 if (__improbable(zone->zleak_on)) {
4662 if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
4663 /* Avoid backtracing twice if zone logging is on */
4664 if (numsaved == 0) {
4665 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4666 __builtin_frame_address(1), NULL);
4667 }
4668 /* Sampling can fail if another sample is happening at the same time in a different zone. */
4669 if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) {
4670 /* If it failed, roll back the counter so we sample the next allocation instead. */
4671 zone->zleak_capture = zleak_sample_factor;
4672 }
4673 }
4674 }
4675
4676 if (__improbable(zone_leaks_scan_enable &&
4677 !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) {
4678 unsigned int count, idx;
4679 /* Fill element, from tail, with backtrace in reverse order */
4680 if (numsaved == 0) {
4681 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4682 __builtin_frame_address(1), NULL);
4683 }
4684 count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t));
4685 if (count >= numsaved) {
4686 count = numsaved - 1;
4687 }
4688 for (idx = 0; idx < count; idx++) {
4689 ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
4690 }
4691 }
4692 #endif /* CONFIG_ZLEAKS */
4693 }
4694
4695 static inline bool
4696 zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size)
4697 {
4698 #if ZONE_ENABLE_LOGGING
4699 if (DO_LOGGING(zone)) {
4700 return true;
4701 }
4702 #endif
4703 #if CONFIG_ZLEAKS
4704 /*
4705 * Zone leak detection: capture a backtrace every zleak_sample_factor
4706 * allocations in this zone.
4707 */
4708 if (zone->zleak_on) {
4709 return true;
4710 }
4711 if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) {
4712 return true;
4713 }
4714 #endif /* CONFIG_ZLEAKS */
4715 return false;
4716 }
4717 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
4718 #if ZONE_ENABLE_LOGGING
4719
4720 __attribute__((noinline))
4721 static void
4722 zfree_log_trace(zone_t zone, vm_offset_t addr)
4723 {
4724 /*
4725 * See if we're doing logging on this zone.
4726 *
4727 * There are two styles of logging used depending on
4728 * whether we're trying to catch a leak or corruption.
4729 */
4730 if (__improbable(DO_LOGGING(zone))) {
4731 if (corruption_debug_flag) {
4732 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4733 unsigned int numsaved;
4734 /*
4735 * We're logging to catch a corruption.
4736 *
4737 * Add a record of this zfree operation to log.
4738 */
4739 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4740 __builtin_frame_address(1), NULL);
4741 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE,
4742 (void **)zbt, numsaved);
4743 } else {
4744 /*
4745 * We're logging to catch a leak.
4746 *
4747 * Remove any record we might have for this element
4748 * since it's being freed. Note that we may not find it
4749 * if the buffer overflowed and that's OK.
4750 *
4751 * Since the log is of a limited size, old records get
4752 * overwritten if there are more zallocs than zfrees.
4753 */
4754 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
4755 }
4756 }
4757 }
4758 #endif /* ZONE_ENABLE_LOGGING */
4759
4760 /*
4761 * Removes an element from the zone's free list; the caller must ensure the zone has free elements.
4762 * Verifies that the next-pointer and backup next-pointer are intact,
4763 * and verifies that a poisoned element hasn't been modified.
4764 */
4765 vm_offset_t
4766 zalloc_direct_locked(
4767 zone_t zone,
4768 zalloc_flags_t flags __unused,
4769 vm_size_t waste __unused)
4770 {
4771 struct zone_page_metadata *page_meta;
4772 zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
4773 vm_offset_t element, page, validate_bit = 0;
4774
4775 /* pick the page queue to allocate from; the zone must not be empty */
4776 if (!zone_pva_is_null(zone->pages_any_free_foreign)) {
4777 kind = ZONE_ADDR_FOREIGN;
4778 page_meta = zone_pva_to_meta(zone->pages_any_free_foreign, kind);
4779 page = (vm_offset_t)page_meta;
4780 } else if (!zone_pva_is_null(zone->pages_intermediate)) {
4781 page_meta = zone_pva_to_meta(zone->pages_intermediate, kind);
4782 page = zone_pva_to_addr(zone->pages_intermediate);
4783 } else if (!zone_pva_is_null(zone->pages_all_free)) {
4784 page_meta = zone_pva_to_meta(zone->pages_all_free, kind);
4785 page = zone_pva_to_addr(zone->pages_all_free);
4786 if (os_sub_overflow(zone->allfree_page_count,
4787 page_meta->zm_page_count, &zone->allfree_page_count)) {
4788 zone_accounting_panic(zone, "allfree_page_count wrap-around");
4789 }
4790 } else {
4791 zone_accounting_panic(zone, "countfree corruption");
4792 }
4793
4794 if (!zone_has_index(zone, page_meta->zm_index)) {
4795 zone_page_metadata_index_confusion_panic(zone, page, page_meta);
4796 }
4797
4798 element = zone_page_meta_get_freelist(zone, page_meta, page);
4799
4800 vm_offset_t *primary = (vm_offset_t *) element;
4801 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
4802
4803 /*
4804 * Since the primary next pointer is xor'ed with zp_nopoison_cookie
4805 * for obfuscation, recover the original value.
4806 */
4807 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
4808 vm_offset_t next_element_primary = *primary;
4809 vm_offset_t next_element_backup = *backup;
4810
4811 /*
4812 * backup_ptr_mismatch_panic will determine what next_element
4813 * should have been, and print it appropriately
4814 */
4815 if (!zone_page_meta_is_sane_element(zone, page_meta, page, next_element, kind)) {
4816 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4817 }
4818
4819 /* Check the backup pointer for the regular cookie */
4820 if (__improbable(next_element_primary != next_element_backup)) {
4821 /* Check for the poisoned cookie instead */
4822 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
4823 /* Neither cookie is valid, corruption has occurred */
4824 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4825 }
4826
4827 /*
4828 * Element was marked as poisoned, so check its integrity before using it.
4829 */
4830 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4831 } else if (zone->zfree_clear_mem) {
4832 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4833 }
4834
4835 /* Remove this element from the free list */
4836 zone_page_meta_set_freelist(page_meta, page, next_element);
4837
4838 if (kind == ZONE_ADDR_FOREIGN) {
4839 if (next_element == 0) {
4840 /* last foreign element allocated on page, move to all_used_foreign */
4841 zone_meta_requeue(zone, &zone->pages_all_used_foreign, page_meta, kind);
4842 }
4843 } else if (next_element == 0) {
4844 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
4845 } else if (page_meta->zm_alloc_count == 0) {
4846 /* remove from free, move to intermediate */
4847 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
4848 }
4849
4850 if (os_add_overflow(page_meta->zm_alloc_count, 1,
4851 &page_meta->zm_alloc_count)) {
4852 /*
4853 * This will not catch a lot of errors; the proper check
4854 * would be against the number of elements this run should
4855 * have, which is expensive to count.
4856 *
4857 * But zm_alloc_count is a 16-bit counter that could
4858 * theoretically be valuable to wrap around,
4859 * so catch this.
4860 */
4861 zone_page_meta_accounting_panic(zone, page_meta,
4862 "zm_alloc_count overflow");
4863 }
4864 if (os_sub_overflow(zone->countfree, 1, &zone->countfree)) {
4865 zone_accounting_panic(zone, "countfree wrap-around");
4866 }
4867
4868 #if VM_MAX_TAG_ZONES
4869 if (__improbable(zone->tags)) {
4870 vm_tag_t tag = zalloc_flags_get_tag(flags);
4871 // set the tag with b0 clear so the block remains in use
4872 ZTAG(zone, element)[0] = (vm_tag_t)(tag << 1);
4873 vm_tag_update_zone_size(tag, zone->tag_zone_index,
4874 zone_elem_size(zone), waste);
4875 }
4876 #endif /* VM_MAX_TAG_ZONES */
4877 #if KASAN_ZALLOC
4878 if (zone->percpu) {
4879 zpercpu_foreach_cpu(i) {
4880 kasan_poison_range(element + ptoa(i),
4881 zone_elem_size(zone), ASAN_VALID);
4882 }
4883 } else {
4884 kasan_poison_range(element, zone_elem_size(zone), ASAN_VALID);
4885 }
4886 #endif
4887
4888 return element | validate_bit;
4889 }
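
/*
 * Illustrative sketch (not part of the build) of the freelist-pointer
 * checks performed above: a free element stores the next pointer twice,
 * once at its start (xor'ed with zp_nopoison_cookie) and once at its end
 * (xor'ed with zp_nopoison_cookie, or with zp_poisoned_cookie when the
 * element body was poisoned). The helper below is a simplified
 * restatement of that validation, not a drop-in replacement.
 */
#if 0 /* example only */
static bool
example_check_free_element(const vm_offset_t *primary,
    const vm_offset_t *backup, vm_offset_t *out_next, bool *out_poisoned)
{
	vm_offset_t next = *primary ^ zp_nopoison_cookie;

	if (*primary == *backup) {
		/* Backup carries the plain cookie: element was not poisoned. */
		*out_poisoned = false;
	} else if (next == (*backup ^ zp_poisoned_cookie)) {
		/* Backup carries the poisoned cookie: validate the body. */
		*out_poisoned = true;
	} else {
		/* Neither cookie fits: the freelist was corrupted. */
		return false;
	}
	*out_next = next;
	return true;
}
#endif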
4890
4891 /*
4892 * zalloc returns an element from the specified zone.
4893 *
4894 * The function is noinline when zlog can be used so that the backtracing can
4895 * reliably skip the zalloc_ext() and zalloc_log_or_trace_leaks()
4896 * boring frames.
4897 */
4898 #if ZONE_ENABLE_LOGGING
4899 __attribute__((noinline))
4900 #endif
4901 void *
4902 zalloc_ext(
4903 zone_t zone,
4904 zone_stats_t zstats,
4905 zalloc_flags_t flags,
4906 vm_size_t waste)
4907 {
4908 vm_offset_t addr = 0;
4909 vm_size_t elem_size = zone_elem_size(zone);
4910
4911 /*
4912 * KASan uses zalloc() for fakestack, which can be called anywhere.
4913 * However, we make sure these calls can never block.
4914 */
4915 assert(zone->kasan_fakestacks ||
4916 ml_get_interrupts_enabled() ||
4917 ml_is_quiescing() ||
4918 debug_mode_active() ||
4919 startup_phase < STARTUP_SUB_EARLY_BOOT);
4920
4921 /*
4922 * Make sure Z_NOFAIL was not obviously misused
4923 */
4924 if ((flags & Z_NOFAIL) && !zone->prio_refill_count) {
4925 assert(!zone->exhaustible && (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
4926 }
4927
4928 #if CONFIG_ZCACHE
4929 /*
4930 * Note: if zone caching is on, gzalloc and tags aren't used
4931 * so we can always check this first
4932 */
4933 if (zone_caching_enabled(zone)) {
4934 addr = zcache_alloc_from_cpu_cache(zone, zstats, waste);
4935 if (__probable(addr)) {
4936 goto allocated_from_cache;
4937 }
4938 }
4939 #endif /* CONFIG_ZCACHE */
4940
4941 #if CONFIG_GZALLOC
4942 if (__improbable(zone->gzalloc_tracked)) {
4943 addr = gzalloc_alloc(zone, zstats, flags);
4944 goto allocated_from_gzalloc;
4945 }
4946 #endif /* CONFIG_GZALLOC */
4947 #if VM_MAX_TAG_ZONES
4948 if (__improbable(zone->tags)) {
4949 vm_tag_t tag = zalloc_flags_get_tag(flags);
4950 if (tag == VM_KERN_MEMORY_NONE) {
4951 /*
4952 * zone views into heaps can lead to a site-less call
4953 * and we fall back to KALLOC as a tag for those.
4954 */
4955 tag = VM_KERN_MEMORY_KALLOC;
4956 flags |= Z_VM_TAG(tag);
4957 }
4958 vm_tag_will_update_zone(tag, zone->tag_zone_index);
4959 }
4960 #endif /* VM_MAX_TAG_ZONES */
4961
4962 lock_zone(zone);
4963 assert(zone->z_self == zone);
4964
4965 /*
4966 * Check if we need another thread to replenish the zone or
4967 * if we have to wait for a replenish thread to finish.
4968 * This is used for elements, like vm_map_entry, which are
4969 * needed themselves to implement zalloc().
4970 */
4971 if (__improbable(zone->prio_refill_count &&
4972 zone->countfree <= zone->prio_refill_count / 2)) {
4973 zone_refill_asynchronously_locked(zone);
4974 } else if (__improbable(zone->countfree == 0)) {
4975 zone_refill_synchronously_locked(zone, flags);
4976 if (__improbable(zone->countfree == 0)) {
4977 unlock_zone(zone);
4978 if (__improbable(flags & Z_NOFAIL)) {
4979 zone_nofail_panic(zone);
4980 }
4981 goto out_nomem;
4982 }
4983 }
4984
4985 addr = zalloc_direct_locked(zone, flags, waste);
4986 if (__probable(zstats != NULL)) {
4987 /*
4988 * The few vm zones used before zone_init() runs do not have
4989 * per-cpu stats yet
4990 */
4991 int cpu = cpu_number();
4992 zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size;
4993 #if ZALLOC_DETAILED_STATS
4994 if (waste) {
4995 zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste;
4996 }
4997 #endif /* ZALLOC_DETAILED_STATS */
4998 }
4999
5000 unlock_zone(zone);
5001
5002 #if ZALLOC_ENABLE_POISONING
5003 bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
5004 #endif
5005 addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
5006 zone_clear_freelist_pointers(zone, addr);
5007 #if ZALLOC_ENABLE_POISONING
5008 /*
5009 * Note: percpu zones do not respect ZONE_MIN_ELEM_SIZE,
5010 * so we will check the first word even if we just
5011 * cleared it.
5012 */
5013 zalloc_validate_element(zone, addr, elem_size - sizeof(vm_offset_t),
5014 validate);
5015 #endif /* ZALLOC_ENABLE_POISONING */
5016
5017 allocated_from_cache:
5018 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
5019 if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) {
5020 zalloc_log_or_trace_leaks(zone, addr);
5021 }
5022 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
5023
5024 #if CONFIG_GZALLOC
5025 allocated_from_gzalloc:
5026 #endif
5027 #if KASAN_ZALLOC
5028 if (zone->kasan_redzone) {
5029 addr = kasan_alloc(addr, elem_size,
5030 elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
5031 elem_size -= 2 * zone->kasan_redzone;
5032 }
5033 /*
5034 * Initialize buffer with unique pattern only if memory
5035 * wasn't expected to be zeroed.
5036 */
5037 if (!zone->zfree_clear_mem && !(flags & Z_ZERO)) {
5038 kasan_leak_init(addr, elem_size);
5039 }
5040 #endif /* KASAN_ZALLOC */
5041 if ((flags & Z_ZERO) && !zone->zfree_clear_mem) {
5042 bzero((void *)addr, elem_size);
5043 }
5044
5045 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr);
5046
5047 out_nomem:
5048 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5049 return (void *)addr;
5050 }
5051
5052 void *
5053 zalloc(union zone_or_view zov)
5054 {
5055 return zalloc_flags(zov, Z_WAITOK);
5056 }
5057
5058 void *
5059 zalloc_noblock(union zone_or_view zov)
5060 {
5061 return zalloc_flags(zov, Z_NOWAIT);
5062 }
5063
5064 void *
5065 zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
5066 {
5067 zone_t zone = zov.zov_view->zv_zone;
5068 zone_stats_t zstats = zov.zov_view->zv_stats;
5069 assert(!zone->percpu);
5070 return zalloc_ext(zone, zstats, flags, 0);
5071 }
5072
5073 void *
5074 zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
5075 {
5076 zone_t zone = zov.zov_view->zv_zone;
5077 zone_stats_t zstats = zov.zov_view->zv_stats;
5078 assert(zone->percpu);
5079 return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, 0));
5080 }
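
/*
 * Illustrative sketch (not part of the build) of the typical allocation
 * pattern used by kernel subsystems: create a zone once at startup, then
 * allocate and free fixed-size elements from it. The element type and
 * zone name are hypothetical.
 */
#if 0 /* example only */
struct example_record {                 /* hypothetical element */
	uint64_t id;
	uint64_t flags;
};

static zone_t example_zone;

static void
example_zone_setup(void)
{
	example_zone = zone_create("example.record",
	    sizeof(struct example_record), ZC_ZFREE_CLEARMEM);
}

static struct example_record *
example_record_alloc(void)
{
	/* Z_WAITOK may block; Z_ZERO hands back zeroed memory. */
	return zalloc_flags(example_zone, Z_WAITOK | Z_ZERO);
}

static void
example_record_free(struct example_record *r)
{
	zfree(example_zone, r);
}
#endif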
5081
5082 static void *
5083 _zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
5084 {
5085 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5086 struct zone_page_metadata *page_meta;
5087 vm_offset_t offs, addr;
5088 zone_pva_t pva;
5089
5090 assert(ml_get_interrupts_enabled() ||
5091 ml_is_quiescing() ||
5092 debug_mode_active() ||
5093 startup_phase < STARTUP_SUB_EARLY_BOOT);
5094
5095 size = (size + mask) & ~mask;
5096 assert(size <= PAGE_SIZE);
5097
5098 lock_zone(zone);
5099 assert(zone->z_self == zone);
5100
5101 for (;;) {
5102 pva = zone->pages_intermediate;
5103 while (!zone_pva_is_null(pva)) {
5104 page_meta = zone_pva_to_meta(pva, kind);
5105 if (page_meta->zm_freelist_offs + size <= PAGE_SIZE) {
5106 goto found;
5107 }
5108 pva = page_meta->zm_page_next;
5109 }
5110
5111 zone_refill_synchronously_locked(zone, Z_WAITOK);
5112 }
5113
5114 found:
5115 offs = (page_meta->zm_freelist_offs + mask) & ~mask;
5116 page_meta->zm_freelist_offs = offs + size;
5117 page_meta->zm_alloc_count += size;
5118 zone->countfree -= size;
5119 if (__probable(zone->z_stats)) {
5120 zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
5121 }
5122
5123 if (page_meta->zm_alloc_count >= PAGE_SIZE - sizeof(vm_offset_t)) {
5124 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
5125 }
5126
5127 unlock_zone(zone);
5128
5129 addr = offs + zone_pva_to_addr(pva);
5130
5131 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5132 return (void *)addr;
5133 }
5134
5135 static void *
5136 _zalloc_permanent_large(size_t size, vm_offset_t mask)
5137 {
5138 kern_return_t kr;
5139 vm_offset_t addr;
5140
5141 kr = kernel_memory_allocate(kernel_map, &addr, size, mask,
5142 KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO,
5143 VM_KERN_MEMORY_KALLOC);
5144 if (kr != 0) {
5145 panic("zalloc_permanent: unable to allocate %zd bytes (%d)",
5146 size, kr);
5147 }
5148 return (void *)addr;
5149 }
5150
5151 void *
5152 zalloc_permanent(vm_size_t size, vm_offset_t mask)
5153 {
5154 if (size <= PAGE_SIZE) {
5155 zone_t zone = &zone_array[ZONE_ID_PERMANENT];
5156 return _zalloc_permanent(zone, size, mask);
5157 }
5158 return _zalloc_permanent_large(size, mask);
5159 }
5160
5161 void *
5162 zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
5163 {
5164 zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
5165 return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
5166 }
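
/*
 * Illustrative sketch (not part of the build): the permanent allocator is
 * for allocations that live for the lifetime of the system and are never
 * zfree()d, such as tables sized once at boot. The table type is
 * hypothetical; the mask argument is "alignment - 1" (8-byte alignment
 * here), as consumed by _zalloc_permanent() above.
 */
#if 0 /* example only */
struct example_table_entry {            /* hypothetical */
	uint64_t key;
	uint64_t value;
};

static struct example_table_entry *example_table;

static void
example_table_init(uint32_t nentries)
{
	/* Memory comes back zeroed and can never be freed. */
	example_table = zalloc_permanent(
	    nentries * sizeof(struct example_table_entry),
	    sizeof(uint64_t) - 1);
}
#endif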
5167
5168 void
5169 zalloc_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
5170 {
5171 zone_index_foreach(i) {
5172 zone_t z = &zone_array[i];
5173
5174 if (z->no_callout) {
5175 /* async_pending will never be set */
5176 continue;
5177 }
5178
5179 lock_zone(z);
5180 if (z->z_self && z->async_pending) {
5181 z->async_pending = false;
5182 zone_refill_synchronously_locked(z, Z_WAITOK);
5183 }
5184 unlock_zone(z);
5185 }
5186 }
5187
5188 /*
5189 * Adds the element to the head of the zone's free list.
5190 * Keeps a backup next-pointer at the end of the element.
5191 */
5192 void
5193 zfree_direct_locked(zone_t zone, vm_offset_t element, bool poison)
5194 {
5195 struct zone_page_metadata *page_meta;
5196 vm_offset_t page, old_head;
5197 zone_addr_kind_t kind;
5198 vm_size_t elem_size = zone_elem_size(zone);
5199
5200 vm_offset_t *primary = (vm_offset_t *) element;
5201 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
5202
5203 page_meta = zone_allocated_element_resolve(zone, element, &page, &kind);
5204 old_head = zone_page_meta_get_freelist(zone, page_meta, page);
5205
5206 if (__improbable(old_head == element)) {
5207 panic("zfree: double free of %p to zone %s%s\n",
5208 (void *) element, zone_heap_name(zone), zone->z_name);
5209 }
5210
5211 #if ZALLOC_ENABLE_POISONING
5212 if (poison && elem_size < ZONE_MIN_ELEM_SIZE) {
5213 assert(zone->percpu);
5214 poison = false;
5215 }
5216 #else
5217 poison = false;
5218 #endif
5219
5220 /*
5221 * Always write a redundant next pointer.
5222 * So that it is more difficult to forge, xor it with a random cookie.
5223 * A poisoned element is indicated by using zp_poisoned_cookie
5224 * instead of zp_nopoison_cookie.
5225 */
5226
5227 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
5228
5229 /*
5230 * Insert this element at the head of the free list. We also xor the
5231 * primary pointer with the zp_nopoison_cookie to make sure a free
5232 * element does not provide the location of the next free element directly.
5233 */
5234 *primary = old_head ^ zp_nopoison_cookie;
5235
5236 #if VM_MAX_TAG_ZONES
5237 if (__improbable(zone->tags)) {
5238 vm_tag_t tag = (ZTAG(zone, element)[0] >> 1);
5239 // set the tag with b0 clear so the block remains in use
5240 ZTAG(zone, element)[0] = 0xFFFE;
5241 vm_tag_update_zone_size(tag, zone->tag_zone_index,
5242 -((int64_t)elem_size), 0);
5243 }
5244 #endif /* VM_MAX_TAG_ZONES */
5245
5246 zone_page_meta_set_freelist(page_meta, page, element);
5247 if (os_sub_overflow(page_meta->zm_alloc_count, 1,
5248 &page_meta->zm_alloc_count)) {
5249 zone_page_meta_accounting_panic(zone, page_meta,
5250 "alloc_count wrap-around");
5251 }
5252 zone->countfree++;
5253
5254 if (kind == ZONE_ADDR_FOREIGN) {
5255 if (old_head == 0) {
5256 /* first foreign element freed on page, move from all_used_foreign */
5257 zone_meta_requeue(zone, &zone->pages_any_free_foreign, page_meta, kind);
5258 }
5259 } else if (page_meta->zm_alloc_count == 0) {
5260 /* whether the page was on the intermediate or all_used queue, move it to free */
5261 zone_meta_requeue(zone, &zone->pages_all_free, page_meta, kind);
5262 zone->allfree_page_count += page_meta->zm_page_count;
5263 } else if (old_head == 0) {
5264 /* first free element on page, move from all_used */
5265 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
5266 }
5267
5268 #if KASAN_ZALLOC
5269 if (zone->percpu) {
5270 zpercpu_foreach_cpu(i) {
5271 kasan_poison_range(element + ptoa(i), elem_size,
5272 ASAN_HEAP_FREED);
5273 }
5274 } else {
5275 kasan_poison_range(element, elem_size, ASAN_HEAP_FREED);
5276 }
5277 #endif
5278 }
5279
5280 /*
5281 * The function is noinline when zlog can be used so that the backtracing can
5282 * reliably skip the zfree_ext() and zfree_log_trace()
5283 * boring frames.
5284 */
5285 #if ZONE_ENABLE_LOGGING
5286 __attribute__((noinline))
5287 #endif
5288 void
5289 zfree_ext(zone_t zone, zone_stats_t zstats, void *addr)
5290 {
5291 vm_offset_t elem = (vm_offset_t)addr;
5292 vm_size_t elem_size = zone_elem_size(zone);
5293 bool poison = false;
5294
5295 DTRACE_VM2(zfree, zone_t, zone, void*, addr);
5296 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem);
5297
5298 #if KASAN_ZALLOC
5299 if (kasan_quarantine_freed_element(&zone, &addr)) {
5300 return;
5301 }
5302 /*
5303 * kasan_quarantine_freed_element() might return a different
5304 * {zone, addr} than the one being freed for kalloc heaps.
5305 *
5306 * Make sure we reload everything.
5307 */
5308 elem = (vm_offset_t)addr;
5309 elem_size = zone_elem_size(zone);
5310 #endif
5311
5312 #if CONFIG_ZLEAKS
5313 /*
5314 * Zone leak detection: un-track the allocation
5315 */
5316 if (__improbable(zone->zleak_on)) {
5317 zleak_free(elem, elem_size);
5318 }
5319 #endif /* CONFIG_ZLEAKS */
5320
5321 #if CONFIG_ZCACHE
5322 /*
5323 * Note: if zone caching is on, gzalloc and tags aren't used
5324 * so we can always check this first
5325 */
5326 if (zone_caching_enabled(zone)) {
5327 return zcache_free_to_cpu_cache(zone, zstats, (vm_offset_t)addr);
5328 }
5329 #endif /* CONFIG_ZCACHE */
5330
5331 #if CONFIG_GZALLOC
5332 if (__improbable(zone->gzalloc_tracked)) {
5333 return gzalloc_free(zone, zstats, addr);
5334 }
5335 #endif /* CONFIG_GZALLOC */
5336
5337 #if ZONE_ENABLE_LOGGING
5338 if (__improbable(DO_LOGGING(zone))) {
5339 zfree_log_trace(zone, elem);
5340 }
5341 #endif /* ZONE_ENABLE_LOGGING */
5342
5343 if (zone->zfree_clear_mem) {
5344 poison = zfree_clear(zone, elem, elem_size);
5345 }
5346
5347 lock_zone(zone);
5348 assert(zone->z_self == zone);
5349
5350 if (!poison) {
5351 poison = zfree_poison_element(zone, &zone->zp_count, elem);
5352 }
5353
5354 if (__probable(zstats != NULL)) {
5355 /*
5356 * The few vm zones used before zone_init() runs do not have
5357 * per-cpu stats yet
5358 */
5359 zpercpu_get(zstats)->zs_mem_freed += elem_size;
5360 }
5361
5362 zfree_direct_locked(zone, elem, poison);
5363
5364 unlock_zone(zone);
5365 }
5366
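/*
* External entry point for non per-CPU zones. The parentheses around the
* name below keep a function-like zfree() macro in the headers, if one is
* defined, from expanding at the definition.
*
* A minimal caller sketch, using a hypothetical "widget_zone" created via
* zone_create() (run_zone_test() below shows a live example):
*
*	widget_zone = zone_create("widgets", sizeof(struct widget), ZC_NONE);
*	struct widget *w = zalloc(widget_zone);
*	...
*	zfree(widget_zone, w);
*/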
5367 void
5368 (zfree)(union zone_or_view zov, void *addr)
5369 {
5370 zone_t zone = zov.zov_view->zv_zone;
5371 zone_stats_t zstats = zov.zov_view->zv_stats;
5372 assert(!zone->percpu);
5373 zfree_ext(zone, zstats, addr);
5374 }
5375
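/*
* Per-CPU variant: the pointer handed back by the per-CPU allocator is
* mangled, so it is demangled here before going through zfree_ext().
* A sketch, assuming a hypothetical per-CPU zone view "pcpu_counter_zone"
* and the zalloc_percpu()/zpercpu_foreach() helpers from the zalloc and
* percpu headers:
*
*	uint64_t total = 0;
*	uint64_t *ctr = zalloc_percpu(pcpu_counter_zone, Z_WAITOK | Z_ZERO);
*	...
*	zpercpu_foreach(cpu_ctr, ctr) {
*		total += *cpu_ctr;
*	}
*	zfree_percpu(pcpu_counter_zone, ctr);
*/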
5376 void
5377 zfree_percpu(union zone_or_view zov, void *addr)
5378 {
5379 zone_t zone = zov.zov_view->zv_zone;
5380 zone_stats_t zstats = zov.zov_view->zv_stats;
5381 assert(zone->percpu);
5382 zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr));
5383 }
5384
5385 #pragma mark vm integration, MIG routines
5386
5387 /*
5388 * Drops (i.e. frees) the elements in the all free pages queue of a zone.
5389 * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
5390 */
5391 static void
5392 zone_drop_free_elements(zone_t z)
5393 {
5394 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5395 unsigned int total_freed_pages = 0;
5396 struct zone_page_metadata *page_meta, *seq_meta;
5397 vm_address_t page_addr;
5398 vm_size_t size_to_free;
5399 vm_size_t free_count;
5400 uint32_t page_count;
5401
5402 current_thread()->options |= TH_OPT_ZONE_PRIV;
5403 lock_zone(z);
5404
5405 while (!zone_pva_is_null(z->pages_all_free)) {
5406 /*
5407 * If any replenishment threads are running, defer to them,
5408 * so that we don't deplete reserved zones.
5409 *
5410 * The timing of the check isn't super important, as there are
5411 * enough reserves to allow freeing an extra page_meta.
5412 *
5413 * Hence, we can check without grabbing the lock every time
5414 * through the loop. We do need the lock however to avoid
5415 * missing a wakeup when we decide to block.
5416 */
5417 if (zone_replenish_active > 0) {
5418 lck_spin_lock(&zone_replenish_lock);
5419 if (zone_replenish_active > 0) {
5420 assert_wait(&zone_replenish_active, THREAD_UNINT);
5421 lck_spin_unlock(&zone_replenish_lock);
5422 unlock_zone(z);
5423 thread_block(THREAD_CONTINUE_NULL);
5424 lock_zone(z);
5425 continue;
5426 }
5427 lck_spin_unlock(&zone_replenish_lock);
5428 }
5429
5430 page_meta = zone_pva_to_meta(z->pages_all_free, kind);
5431 page_count = page_meta->zm_page_count;
5432 free_count = zone_elem_count(z, ptoa(page_count), kind);
5433
5434 /*
5435 * Don't drain zones with async refill to below the refill
5436 * threshold, as they need some reserve to function properly.
5437 */
5438 if (!z->destroyed && z->prio_refill_count &&
5439 (vm_size_t)(z->countfree - free_count) < z->prio_refill_count) {
5440 break;
5441 }
5442
5443 zone_meta_queue_pop(z, &z->pages_all_free, kind, &page_addr);
5444
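/*
* Pull this chunk's elements and pages out of the zone accounting;
* an underflow here means the counters were already inconsistent.
*/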
5445 if (os_sub_overflow(z->countfree, free_count, &z->countfree)) {
5446 zone_accounting_panic(z, "countfree wrap-around");
5447 }
5448 if (os_sub_overflow(z->countavail, free_count, &z->countavail)) {
5449 zone_accounting_panic(z, "countavail wrap-around");
5450 }
5451 if (os_sub_overflow(z->allfree_page_count, page_count,
5452 &z->allfree_page_count)) {
5453 zone_accounting_panic(z, "allfree_page_count wrap-around");
5454 }
5455 if (os_sub_overflow(z->page_count, page_count, &z->page_count)) {
5456 zone_accounting_panic(z, "page_count wrap-around");
5457 }
5458
5459 os_atomic_sub(&zones_phys_page_count, page_count, relaxed);
5460 os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed);
5461
5462 bzero(page_meta, sizeof(*page_meta) * page_count);
5463 seq_meta = page_meta;
5464 page_meta = NULL; /* page_meta fields are zeroed, prevent reuse */
5465
5466 unlock_zone(z);
5467
5468 /* Free this chunk's pages and account for them */
5469 total_freed_pages += page_count;
5470 size_to_free = ptoa(page_count);
5471 #if KASAN_ZALLOC
5472 kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
5473 #endif
5474 #if VM_MAX_TAG_ZONES
5475 if (z->tags) {
5476 ztMemoryRemove(z, page_addr, size_to_free);
5477 }
5478 #endif /* VM_MAX_TAG_ZONES */
5479
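/*
* With VA sequestering, chunks spanning a full allocation only have their
* physical pages depopulated and keep their virtual range: the metadata is
* pushed back on pages_sequester below so the zone can reuse the VA later.
* Otherwise both the pages and the mapping are released with kmem_free().
*/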
5480 if (z->va_sequester && z->alloc_pages == page_count) {
5481 kernel_memory_depopulate(submap_for_zone(z), page_addr,
5482 size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
5483 } else {
5484 kmem_free(submap_for_zone(z), page_addr, size_to_free);
5485 seq_meta = NULL;
5486 }
5487 thread_yield_to_preemption();
5488
5489 lock_zone(z);
5490
5491 if (seq_meta) {
5492 zone_meta_queue_push(z, &z->pages_sequester, seq_meta, kind);
5493 z->sequester_page_count += page_count;
5494 }
5495 }
5496 if (z->destroyed) {
5497 assert(zone_pva_is_null(z->pages_all_free));
5498 assert(z->allfree_page_count == 0);
5499 }
5500 unlock_zone(z);
5501 current_thread()->options &= ~TH_OPT_ZONE_PRIV;
5502
5503 #if DEBUG || DEVELOPMENT
5504 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
5505 kprintf("zone_gc() of zone %s%s freed %lu elements, %d pages\n",
5506 zone_heap_name(z), z->z_name,
5507 (unsigned long)(ptoa(total_freed_pages) / z->pcpu_elem_size),
5508 total_freed_pages);
5509 }
5510 #endif /* DEBUG || DEVELOPMENT */
5511 }
5512
5513 /* Zone garbage collection
5514 *
5515 * zone_gc will walk through all the free elements in all the
5516 * zones that are marked collectable looking for reclaimable
5517 * pages. zone_gc is called by consider_zone_gc when the system
5518 * begins to run out of memory.
5519 *
5520 * We should ensure that zone_gc never blocks.
5521 */
5522 void
5523 zone_gc(boolean_t consider_jetsams)
5524 {
5525 if (consider_jetsams) {
5526 kill_process_in_largest_zone();
5527 /*
5528 * If we do end up jetsamming something, we need to do a zone_gc so that
5529 * we can reclaim free zone elements and update the zone map size.
5530 * Fall through.
5531 */
5532 }
5533
5534 lck_mtx_lock(&zone_gc_lock);
5535
5536 #if DEBUG || DEVELOPMENT
5537 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
5538 kprintf("zone_gc() starting...\n");
5539 }
5540 #endif /* DEBUG || DEVELOPMENT */
5541
5542 zone_index_foreach(i) {
5543 zone_t z = &zone_array[i];
5544
5545 if (!z->collectable) {
5546 continue;
5547 }
5548 #if CONFIG_ZCACHE
5549 if (zone_caching_enabled(z)) {
5550 zcache_drain_depot(z);
5551 }
5552 #endif /* CONFIG_ZCACHE */
5553 if (zone_pva_is_null(z->pages_all_free)) {
5554 continue;
5555 }
5556
5557 zone_drop_free_elements(z);
5558 }
5559
5560 lck_mtx_unlock(&zone_gc_lock);
5561 }
5562
5563 /*
5564 * consider_zone_gc:
5565 *
5566 * Called by the pageout daemon when the system needs more free pages.
5567 */
5568
5569 void
5570 consider_zone_gc(boolean_t consider_jetsams)
5571 {
5572 /*
5573 * One-time reclaim of kernel_map resources we allocated in
5574 * early boot.
5575 *
5576 * Use atomic exchange in case multiple threads race into here.
5577 */
5578 vm_offset_t deallocate_kaddr;
5579 if (kmapoff_kaddr != 0 &&
5580 (deallocate_kaddr = os_atomic_xchg(&kmapoff_kaddr, 0, relaxed)) != 0) {
5581 vm_deallocate(kernel_map, deallocate_kaddr, ptoa_64(kmapoff_pgcnt));
5582 }
5583
5584 zone_gc(consider_jetsams);
5585 }
5586
5587 /*
5588 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
5589 * requesting zone information.
5590 * Frees unused pages towards the end of the region, and zeroes out unused
5591 * space on the last page.
5592 */
5593 static vm_map_copy_t
5594 create_vm_map_copy(
5595 vm_offset_t start_addr,
5596 vm_size_t total_size,
5597 vm_size_t used_size)
5598 {
5599 kern_return_t kr;
5600 vm_offset_t end_addr;
5601 vm_size_t free_size;
5602 vm_map_copy_t copy;
5603
5604 if (used_size != total_size) {
5605 end_addr = start_addr + used_size;
5606 free_size = total_size - (round_page(end_addr) - start_addr);
5607
5608 if (free_size >= PAGE_SIZE) {
5609 kmem_free(ipc_kernel_map,
5610 round_page(end_addr), free_size);
5611 }
5612 bzero((char *) end_addr, round_page(end_addr) - end_addr);
5613 }
5614
5615 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
5616 (vm_map_size_t)used_size, TRUE, &copy);
5617 assert(kr == KERN_SUCCESS);
5618
5619 return copy;
5620 }
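/*
* Typical caller pattern, sketched with placeholder names (addr,
* alloc_size, used_size, outp); see mach_memory_info() below for the
* real thing:
*
*	kmem_alloc_pageable(ipc_kernel_map, &addr, alloc_size, VM_KERN_MEMORY_IPC);
*	... fill the first used_size bytes at addr ...
*	*outp = (some_type_t *)create_vm_map_copy(addr, alloc_size, used_size);
*
* The vm_map_copy_t is handed back out-of-line through the MIG reply,
* which is expected to consume it.
*/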
5621
5622 static boolean_t
5623 get_zone_info(
5624 zone_t z,
5625 mach_zone_name_t *zn,
5626 mach_zone_info_t *zi)
5627 {
5628 struct zone zcopy;
5629
5630 assert(z != ZONE_NULL);
5631 lock_zone(z);
5632 if (!z->z_self) {
5633 unlock_zone(z);
5634 return FALSE;
5635 }
5636 zcopy = *z;
5637 unlock_zone(z);
5638
5639 if (zn != NULL) {
5640 /*
5641 * Append kalloc heap name to zone name (if zone is used by kalloc)
5642 */
5643 char temp_zone_name[MAX_ZONE_NAME] = "";
5644 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5645 zone_heap_name(z), z->z_name);
5646
5647 /* assuming here the name data is static */
5648 (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
5649 strlen(temp_zone_name) + 1);
5650 }
5651
5652 if (zi != NULL) {
5653 *zi = (mach_zone_info_t) {
5654 .mzi_count = zone_count_allocated(&zcopy),
5655 .mzi_cur_size = ptoa_64(zcopy.page_count),
5656 // max_size for zprint is now high-watermark of pages used
5657 .mzi_max_size = ptoa_64(zcopy.page_count_hwm),
5658 .mzi_elem_size = zcopy.pcpu_elem_size,
5659 .mzi_alloc_size = ptoa_64(zcopy.alloc_pages),
5660 .mzi_exhaustible = (uint64_t)zcopy.exhaustible,
5661 };
5662 zpercpu_foreach(zs, zcopy.z_stats) {
5663 zi->mzi_sum_size += zs->zs_mem_allocated;
5664 }
5665 if (zcopy.collectable) {
5666 SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
5667 ptoa_64(zcopy.allfree_page_count));
5668 SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
5669 }
5670 }
5671
5672 return TRUE;
5673 }
5674
5675 kern_return_t
5676 task_zone_info(
5677 __unused task_t task,
5678 __unused mach_zone_name_array_t *namesp,
5679 __unused mach_msg_type_number_t *namesCntp,
5680 __unused task_zone_info_array_t *infop,
5681 __unused mach_msg_type_number_t *infoCntp)
5682 {
5683 return KERN_FAILURE;
5684 }
5685
5686 kern_return_t
5687 mach_zone_info(
5688 host_priv_t host,
5689 mach_zone_name_array_t *namesp,
5690 mach_msg_type_number_t *namesCntp,
5691 mach_zone_info_array_t *infop,
5692 mach_msg_type_number_t *infoCntp)
5693 {
5694 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
5695 }
5696
5698 kern_return_t
5699 mach_memory_info(
5700 host_priv_t host,
5701 mach_zone_name_array_t *namesp,
5702 mach_msg_type_number_t *namesCntp,
5703 mach_zone_info_array_t *infop,
5704 mach_msg_type_number_t *infoCntp,
5705 mach_memory_info_array_t *memoryInfop,
5706 mach_msg_type_number_t *memoryInfoCntp)
5707 {
5708 mach_zone_name_t *names;
5709 vm_offset_t names_addr;
5710 vm_size_t names_size;
5711
5712 mach_zone_info_t *info;
5713 vm_offset_t info_addr;
5714 vm_size_t info_size;
5715
5716 mach_memory_info_t *memory_info;
5717 vm_offset_t memory_info_addr;
5718 vm_size_t memory_info_size;
5719 vm_size_t memory_info_vmsize;
5720 unsigned int num_info;
5721
5722 unsigned int max_zones, used_zones, i;
5723 mach_zone_name_t *zn;
5724 mach_zone_info_t *zi;
5725 kern_return_t kr;
5726
5727 uint64_t zones_collectable_bytes = 0;
5728
5729 if (host == HOST_NULL) {
5730 return KERN_INVALID_HOST;
5731 }
5732 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5733 if (!PE_i_can_has_debugger(NULL)) {
5734 return KERN_INVALID_HOST;
5735 }
5736 #endif
5737
5738 /*
5739 * We assume that zones aren't freed once allocated.
5740 * We won't pick up any zones that are allocated later.
5741 */
5742
5743 max_zones = os_atomic_load(&num_zones, relaxed);
5744
5745 names_size = round_page(max_zones * sizeof *names);
5746 kr = kmem_alloc_pageable(ipc_kernel_map,
5747 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5748 if (kr != KERN_SUCCESS) {
5749 return kr;
5750 }
5751 names = (mach_zone_name_t *) names_addr;
5752
5753 info_size = round_page(max_zones * sizeof *info);
5754 kr = kmem_alloc_pageable(ipc_kernel_map,
5755 &info_addr, info_size, VM_KERN_MEMORY_IPC);
5756 if (kr != KERN_SUCCESS) {
5757 kmem_free(ipc_kernel_map,
5758 names_addr, names_size);
5759 return kr;
5760 }
5761 info = (mach_zone_info_t *) info_addr;
5762
5763 zn = &names[0];
5764 zi = &info[0];
5765
5766 used_zones = max_zones;
5767 for (i = 0; i < max_zones; i++) {
5768 if (!get_zone_info(&(zone_array[i]), zn, zi)) {
5769 used_zones--;
5770 continue;
5771 }
5772 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
5773 zn++;
5774 zi++;
5775 }
5776
5777 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
5778 *namesCntp = used_zones;
5779
5780 *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
5781 *infoCntp = used_zones;
5782
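/*
* The per-site wired memory breakdown from vm_page_diagnose() is only
* produced when the caller supplied buffers for it, i.e. for
* mach_memory_info() proper; mach_zone_info() passes NULL and skips it.
*/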
5783 num_info = 0;
5784 memory_info_addr = 0;
5785
5786 if (memoryInfop && memoryInfoCntp) {
5787 vm_map_copy_t copy;
5788 num_info = vm_page_diagnose_estimate();
5789 memory_info_size = num_info * sizeof(*memory_info);
5790 memory_info_vmsize = round_page(memory_info_size);
5791 kr = kmem_alloc_pageable(ipc_kernel_map,
5792 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
5793 if (kr != KERN_SUCCESS) {
5794 return kr;
5795 }
5796
5797 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
5798 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
5799 assert(kr == KERN_SUCCESS);
5800
5801 memory_info = (mach_memory_info_t *) memory_info_addr;
5802 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
5803
5804 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
5805 assert(kr == KERN_SUCCESS);
5806
5807 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
5808 (vm_map_size_t)memory_info_size, TRUE, &copy);
5809 assert(kr == KERN_SUCCESS);
5810
5811 *memoryInfop = (mach_memory_info_t *) copy;
5812 *memoryInfoCntp = num_info;
5813 }
5814
5815 return KERN_SUCCESS;
5816 }
5817
5818 kern_return_t
5819 mach_zone_info_for_zone(
5820 host_priv_t host,
5821 mach_zone_name_t name,
5822 mach_zone_info_t *infop)
5823 {
5824 zone_t zone_ptr;
5825
5826 if (host == HOST_NULL) {
5827 return KERN_INVALID_HOST;
5828 }
5829 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5830 if (!PE_i_can_has_debugger(NULL)) {
5831 return KERN_INVALID_HOST;
5832 }
5833 #endif
5834
5835 if (infop == NULL) {
5836 return KERN_INVALID_ARGUMENT;
5837 }
5838
5839 zone_ptr = ZONE_NULL;
5840 zone_index_foreach(i) {
5841 zone_t z = &(zone_array[i]);
5842 assert(z != ZONE_NULL);
5843
5844 /*
5845 * Append kalloc heap name to zone name (if zone is used by kalloc)
5846 */
5847 char temp_zone_name[MAX_ZONE_NAME] = "";
5848 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5849 zone_heap_name(z), z->z_name);
5850
5851 /* Find the requested zone by name */
5852 if (track_this_zone(temp_zone_name, name.mzn_name)) {
5853 zone_ptr = z;
5854 break;
5855 }
5856 }
5857
5858 /* No zones found with the requested zone name */
5859 if (zone_ptr == ZONE_NULL) {
5860 return KERN_INVALID_ARGUMENT;
5861 }
5862
5863 if (get_zone_info(zone_ptr, NULL, infop)) {
5864 return KERN_SUCCESS;
5865 }
5866 return KERN_FAILURE;
5867 }
5868
5869 kern_return_t
5870 mach_zone_info_for_largest_zone(
5871 host_priv_t host,
5872 mach_zone_name_t *namep,
5873 mach_zone_info_t *infop)
5874 {
5875 if (host == HOST_NULL) {
5876 return KERN_INVALID_HOST;
5877 }
5878 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5879 if (!PE_i_can_has_debugger(NULL)) {
5880 return KERN_INVALID_HOST;
5881 }
5882 #endif
5883
5884 if (namep == NULL || infop == NULL) {
5885 return KERN_INVALID_ARGUMENT;
5886 }
5887
5888 if (get_zone_info(zone_find_largest(), namep, infop)) {
5889 return KERN_SUCCESS;
5890 }
5891 return KERN_FAILURE;
5892 }
5893
5894 uint64_t
5895 get_zones_collectable_bytes(void)
5896 {
5897 uint64_t zones_collectable_bytes = 0;
5898 mach_zone_info_t zi;
5899
5900 zone_index_foreach(i) {
5901 if (get_zone_info(&zone_array[i], NULL, &zi)) {
5902 zones_collectable_bytes +=
5903 GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
5904 }
5905 }
5906
5907 return zones_collectable_bytes;
5908 }
5909
5910 kern_return_t
5911 mach_zone_get_zlog_zones(
5912 host_priv_t host,
5913 mach_zone_name_array_t *namesp,
5914 mach_msg_type_number_t *namesCntp)
5915 {
5916 #if ZONE_ENABLE_LOGGING
5917 unsigned int max_zones, logged_zones, i;
5918 kern_return_t kr;
5919 zone_t zone_ptr;
5920 mach_zone_name_t *names;
5921 vm_offset_t names_addr;
5922 vm_size_t names_size;
5923
5924 if (host == HOST_NULL) {
5925 return KERN_INVALID_HOST;
5926 }
5927
5928 if (namesp == NULL || namesCntp == NULL) {
5929 return KERN_INVALID_ARGUMENT;
5930 }
5931
5932 max_zones = os_atomic_load(&num_zones, relaxed);
5933
5934 names_size = round_page(max_zones * sizeof *names);
5935 kr = kmem_alloc_pageable(ipc_kernel_map,
5936 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5937 if (kr != KERN_SUCCESS) {
5938 return kr;
5939 }
5940 names = (mach_zone_name_t *) names_addr;
5941
5942 zone_ptr = ZONE_NULL;
5943 logged_zones = 0;
5944 for (i = 0; i < max_zones; i++) {
5945 zone_t z = &(zone_array[i]);
5946 assert(z != ZONE_NULL);
5947
5948 /* Copy out the zone name if zone logging is enabled */
5949 if (z->zlog_btlog) {
5950 get_zone_info(z, &names[logged_zones], NULL);
5951 logged_zones++;
5952 }
5953 }
5954
5955 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
5956 *namesCntp = logged_zones;
5957
5958 return KERN_SUCCESS;
5959
5960 #else /* ZONE_ENABLE_LOGGING */
5961 #pragma unused(host, namesp, namesCntp)
5962 return KERN_FAILURE;
5963 #endif /* ZONE_ENABLE_LOGGING */
5964 }
5965
5966 kern_return_t
5967 mach_zone_get_btlog_records(
5968 host_priv_t host,
5969 mach_zone_name_t name,
5970 zone_btrecord_array_t *recsp,
5971 mach_msg_type_number_t *recsCntp)
5972 {
5973 #if DEBUG || DEVELOPMENT
5974 unsigned int numrecs = 0;
5975 zone_btrecord_t *recs;
5976 kern_return_t kr;
5977 zone_t zone_ptr;
5978 vm_offset_t recs_addr;
5979 vm_size_t recs_size;
5980
5981 if (host == HOST_NULL) {
5982 return KERN_INVALID_HOST;
5983 }
5984
5985 if (recsp == NULL || recsCntp == NULL) {
5986 return KERN_INVALID_ARGUMENT;
5987 }
5988
5989 zone_ptr = ZONE_NULL;
5990 zone_index_foreach(i) {
5991 zone_t z = &zone_array[i];
5992
5993 /*
5994 * Append kalloc heap name to zone name (if zone is used by kalloc)
5995 */
5996 char temp_zone_name[MAX_ZONE_NAME] = "";
5997 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5998 zone_heap_name(z), z->z_name);
5999
6000 /* Find the requested zone by name */
6001 if (track_this_zone(temp_zone_name, name.mzn_name)) {
6002 zone_ptr = z;
6003 break;
6004 }
6005 }
6006
6007 /* No zones found with the requested zone name */
6008 if (zone_ptr == ZONE_NULL) {
6009 return KERN_INVALID_ARGUMENT;
6010 }
6011
6012 /* Logging not turned on for the requested zone */
6013 if (!DO_LOGGING(zone_ptr)) {
6014 return KERN_FAILURE;
6015 }
6016
6017 /* Allocate memory for btlog records */
6018 numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
6019 recs_size = round_page(numrecs * sizeof *recs);
6020
6021 kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
6022 if (kr != KERN_SUCCESS) {
6023 return kr;
6024 }
6025
6026 /*
6027 * get_btlog_records() below populates this region while holding a spinlock
6028 * (the btlog lock), so these pages need to be wired.
6029 */
6030 kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
6031 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
6032 assert(kr == KERN_SUCCESS);
6033
6034 recs = (zone_btrecord_t *)recs_addr;
6035 get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
6036
6037 kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
6038 assert(kr == KERN_SUCCESS);
6039
6040 *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
6041 *recsCntp = numrecs;
6042
6043 return KERN_SUCCESS;
6044
6045 #else /* DEBUG || DEVELOPMENT */
6046 #pragma unused(host, name, recsp, recsCntp)
6047 return KERN_FAILURE;
6048 #endif /* DEBUG || DEVELOPMENT */
6049 }
6050
6051
6052 #if DEBUG || DEVELOPMENT
6053
6054 kern_return_t
6055 mach_memory_info_check(void)
6056 {
6057 mach_memory_info_t * memory_info;
6058 mach_memory_info_t * info;
6059 unsigned int num_info;
6060 vm_offset_t memory_info_addr;
6061 kern_return_t kr;
6062 size_t memory_info_size, memory_info_vmsize;
6063 uint64_t top_wired, zonestotal, total;
6064
6065 num_info = vm_page_diagnose_estimate();
6066 memory_info_size = num_info * sizeof(*memory_info);
6067 memory_info_vmsize = round_page(memory_info_size);
6068 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
6069 assert(kr == KERN_SUCCESS);
6070
6071 memory_info = (mach_memory_info_t *) memory_info_addr;
6072 vm_page_diagnose(memory_info, num_info, 0);
6073
6074 top_wired = total = zonestotal = 0;
6075 zone_index_foreach(idx) {
6076 zonestotal += zone_size_wired(&zone_array[idx]);
6077 }
6078
6079 for (uint32_t idx = 0; idx < num_info; idx++) {
6080 info = &memory_info[idx];
6081 if (!info->size) {
6082 continue;
6083 }
6084 if (VM_KERN_COUNT_WIRED == info->site) {
6085 top_wired = info->size;
6086 }
6087 if (VM_KERN_SITE_HIDE & info->flags) {
6088 continue;
6089 }
6090 if (!(VM_KERN_SITE_WIRED & info->flags)) {
6091 continue;
6092 }
6093 total += info->size;
6094 }
6095 total += zonestotal;
6096
6097 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n",
6098 total, top_wired, zonestotal, top_wired - total);
6099
6100 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
6101
6102 return kr;
6103 }
6104
6105 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
6106
6107 #endif /* DEBUG || DEVELOPMENT */
6108
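/*
* MIG routine: force a zone garbage collection. Only DEBUG and
* DEVELOPMENT kernels actually invoke the buffer cache collect callout
* and consider_zone_gc(); on other configurations the call is accepted
* but does nothing.
*/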
6109 kern_return_t
6110 mach_zone_force_gc(
6111 host_t host)
6112 {
6113 if (host == HOST_NULL) {
6114 return KERN_INVALID_HOST;
6115 }
6116
6117 #if DEBUG || DEVELOPMENT
6118 /* Callout to buffer cache GC to drop elements in the apfs zones */
6119 if (consider_buffer_cache_collect != NULL) {
6120 (void)(*consider_buffer_cache_collect)(0);
6121 }
6122 consider_zone_gc(FALSE);
6123 #endif /* DEBUG || DEVELOPMENT */
6124 return KERN_SUCCESS;
6125 }
6126
6127 zone_t
6128 zone_find_largest(void)
6129 {
6130 uint32_t largest_idx = 0;
6131 vm_offset_t largest_size = zone_size_wired(&zone_array[0]);
6132
6133 zone_index_foreach(i) {
6134 vm_offset_t size = zone_size_wired(&zone_array[i]);
6135 if (size > largest_size) {
6136 largest_idx = i;
6137 largest_size = size;
6138 }
6139 }
6140
6141 return &zone_array[largest_idx];
6142 }
6143
6144 #pragma mark - tests
6145 #if DEBUG || DEVELOPMENT
6146
6147 /*
6148 * Used for the sysctl kern.run_zone_test, which is not thread-safe. Ensure only
6149 * one thread goes through at a time; otherwise we can end up with multiple test
6150 * zones (if a second zinit() comes through before zdestroy()), which could lead
6151 * us to run out of zones.
6152 */
6153 SIMPLE_LOCK_DECLARE(zone_test_lock, 0);
6154 static boolean_t zone_test_running = FALSE;
6155 static zone_t test_zone_ptr = NULL;
6156
6157 static uintptr_t *
6158 zone_copy_allocations(zone_t z, uintptr_t *elems, bitmap_t *bits,
6159 zone_pva_t page_index, zone_addr_kind_t kind)
6160 {
6161 vm_offset_t free, first, end, page;
6162 struct zone_page_metadata *meta;
6163
6164 while (!zone_pva_is_null(page_index)) {
6165 page = zone_pva_to_addr(page_index);
6166 meta = zone_pva_to_meta(page_index, kind);
6167 end = page + ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
6168 first = page + ZONE_PAGE_FIRST_OFFSET(kind);
6169
6170 bitmap_clear(bits, (uint32_t)((end - first) / zone_elem_size(z)));
6171
6172 // construct bitmap of all freed elements
6173 free = zone_page_meta_get_freelist(z, meta, page);
6174 while (free) {
6175 bitmap_set(bits, (uint32_t)((free - first) / zone_elem_size(z)));
6176
6177 // next free element
6178 free = *(vm_offset_t *)free ^ zp_nopoison_cookie;
6179 }
6180
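// elements whose bit is still clear are live allocations; record them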
6181 for (unsigned i = 0; first < end; i++, first += zone_elem_size(z)) {
6182 if (!bitmap_test(bits, i)) {
6183 *elems++ = INSTANCE_PUT(first);
6184 }
6185 }
6186
6187 page_index = meta->zm_page_next;
6188 }
6189 return elems;
6190 }
6191
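/*
* Best-effort accounting of outstanding allocations in the named zone:
* snapshot every allocated element, run a conservative reference scan
* over them (zone_leaks_scan), then report allocation sites through
* "proc", using the zone's btlog when one is attached and falling back
* to whatever backtrace words are still left inside the elements;
* anything without a recoverable backtrace is lumped under a single
* synthetic &zalloc frame.
*/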
6192 kern_return_t
6193 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
6194 {
6195 uintptr_t zbt[MAX_ZTRACE_DEPTH];
6196 zone_t zone = NULL;
6197 uintptr_t * array;
6198 uintptr_t * next;
6199 uintptr_t element, bt;
6200 uint32_t idx, count, found;
6201 uint32_t btidx, btcount, nobtcount, btfound;
6202 uint32_t elemSize;
6203 uint64_t maxElems;
6204 kern_return_t kr;
6205 bitmap_t *bits;
6206
6207 zone_index_foreach(i) {
6208 if (!strncmp(zoneName, zone_array[i].z_name, nameLen)) {
6209 zone = &zone_array[i];
6210 break;
6211 }
6212 }
6213 if (zone == NULL) {
6214 return KERN_INVALID_NAME;
6215 }
6216
6217 elemSize = zone_elem_size(zone);
6218 maxElems = (zone->countavail + 1) & ~1ul;
6219
6220 if ((ptoa(zone->percpu ? 1 : zone->alloc_pages) % elemSize) &&
6221 !zone_leaks_scan_enable) {
6222 return KERN_INVALID_CAPABILITY;
6223 }
6224
6225 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
6226 maxElems * sizeof(uintptr_t) + BITMAP_LEN(ZONE_CHUNK_MAXELEMENTS),
6227 VM_KERN_MEMORY_DIAG);
6228 if (KERN_SUCCESS != kr) {
6229 return kr;
6230 }
6231
6232 /* maxElems is a multiple of 2 so we're always aligned */
6233 bits = CAST_DOWN_EXPLICIT(bitmap_t *, array + maxElems);
6234
6235 lock_zone(zone);
6236
6237 next = array;
6238 next = zone_copy_allocations(zone, next, bits,
6239 zone->pages_any_free_foreign, ZONE_ADDR_FOREIGN);
6240 next = zone_copy_allocations(zone, next, bits,
6241 zone->pages_all_used_foreign, ZONE_ADDR_FOREIGN);
6242 next = zone_copy_allocations(zone, next, bits,
6243 zone->pages_intermediate, ZONE_ADDR_NATIVE);
6244 next = zone_copy_allocations(zone, next, bits,
6245 zone->pages_all_used, ZONE_ADDR_NATIVE);
6246 count = (uint32_t)(next - array);
6247
6248 unlock_zone(zone);
6249
6250 zone_leaks_scan(array, count, zone_elem_size(zone), &found);
6251 assert(found <= count);
6252
6253 for (idx = 0; idx < count; idx++) {
6254 element = array[idx];
6255 if (kInstanceFlagReferenced & element) {
6256 continue;
6257 }
6258 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6259 }
6260
6261 #if ZONE_ENABLE_LOGGING
6262 if (zone->zlog_btlog && !corruption_debug_flag) {
6263 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
6264 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
6265 }
6266 #endif /* ZONE_ENABLE_LOGGING */
6267
6268 for (nobtcount = idx = 0; idx < count; idx++) {
6269 element = array[idx];
6270 if (!element) {
6271 continue;
6272 }
6273 if (kInstanceFlagReferenced & element) {
6274 continue;
6275 }
6276 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6277
6278 // see if we can find any backtrace left in the element
6279 btcount = (typeof(btcount))(zone_elem_size(zone) / sizeof(uintptr_t));
6280 if (btcount >= MAX_ZTRACE_DEPTH) {
6281 btcount = MAX_ZTRACE_DEPTH - 1;
6282 }
6283 for (btfound = btidx = 0; btidx < btcount; btidx++) {
6284 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
6285 if (!VM_KERNEL_IS_SLID(bt)) {
6286 break;
6287 }
6288 zbt[btfound++] = bt;
6289 }
6290 if (btfound) {
6291 (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
6292 } else {
6293 nobtcount++;
6294 }
6295 }
6296 if (nobtcount) {
6297 // fake backtrace when we found nothing
6298 zbt[0] = (uintptr_t) &zalloc;
6299 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
6300 }
6301
6302 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
6303
6304 return KERN_SUCCESS;
6305 }
6306
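/*
* Backs the kern.run_zone_test sysctl: repeatedly zinit()s and
* zdestroy()s a throwaway zone (expecting the same zone to be handed
* back each time), exercises zalloc()/zfree() on it, and, when
* ZSECURITY_OPTIONS_SEQUESTER is enabled, checks that a fully freed
* zone ends up depopulated with its VA parked on pages_sequester and
* is repopulated by the next allocations.
*/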
6307 boolean_t
6308 run_zone_test(void)
6309 {
6310 unsigned int i = 0, max_iter = 5;
6311 void * test_ptr;
6312 zone_t test_zone;
6313
6314 simple_lock(&zone_test_lock, &zone_locks_grp);
6315 if (!zone_test_running) {
6316 zone_test_running = TRUE;
6317 } else {
6318 simple_unlock(&zone_test_lock);
6319 printf("run_zone_test: Test already running.\n");
6320 return FALSE;
6321 }
6322 simple_unlock(&zone_test_lock);
6323
6324 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
6325
6326 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
6327 do {
6328 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
6329 if (test_zone == NULL) {
6330 printf("run_zone_test: zinit() failed\n");
6331 return FALSE;
6332 }
6333
6334 #if KASAN_ZALLOC
6335 if (test_zone_ptr == NULL && test_zone->countfree != 0) {
6336 #else
6337 if (test_zone->countfree != 0) {
6338 #endif
6339 printf("run_zone_test: free count is not zero\n");
6340 return FALSE;
6341 }
6342
6343 if (test_zone_ptr == NULL) {
6344 /* Stash the zone pointer returned on the first zinit */
6345 printf("run_zone_test: zone created for the first time\n");
6346 test_zone_ptr = test_zone;
6347 } else if (test_zone != test_zone_ptr) {
6348 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
6349 return FALSE;
6350 }
6351
6352 test_ptr = zalloc(test_zone);
6353 if (test_ptr == NULL) {
6354 printf("run_zone_test: zalloc() failed\n");
6355 return FALSE;
6356 }
6357 zfree(test_zone, test_ptr);
6358
6359 zdestroy(test_zone);
6360 i++;
6361
6362 printf("run_zone_test: Iteration %d successful\n", i);
6363 } while (i < max_iter);
6364
6365 /* test Z_VA_SEQUESTER */
6366 if (zsecurity_options & ZSECURITY_OPTIONS_SEQUESTER) {
6367 int idx, num_allocs = 8;
6368 vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
6369 void *allocs[num_allocs];
6370 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_count, relaxed);
6371 vm_size_t zone_map_size = zone_range_size(&zone_info.zi_map_range);
6372
6373 test_zone = zone_create("test_zone_sysctl", elem_size,
6374 ZC_DESTRUCTIBLE | ZC_SEQUESTER);
6375 if (test_zone == NULL) {
6376 printf("run_zone_test: zinit() failed\n");
6377 return FALSE;
6378 }
6379
6380 for (idx = 0; idx < num_allocs; idx++) {
6381 allocs[idx] = zalloc(test_zone);
6382 assert(NULL != allocs[idx]);
6383 printf("alloc[%d] %p\n", idx, allocs[idx]);
6384 }
6385 for (idx = 0; idx < num_allocs; idx++) {
6386 zfree(test_zone, allocs[idx]);
6387 }
6388 assert(!zone_pva_is_null(test_zone->pages_all_free));
6389
6390 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6391 vm_page_wire_count, vm_page_free_count,
6392 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6393 zone_gc(FALSE);
6394 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6395 vm_page_wire_count, vm_page_free_count,
6396 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6397 unsigned int allva = 0;
6398 zone_index_foreach(zidx) {
6399 zone_t z = &zone_array[zidx];
6400 lock_zone(z);
6401 allva += z->page_count;
6402 if (!z->sequester_page_count) {
6403 unlock_zone(z);
6404 continue;
6405 }
6406 unsigned count = 0;
6407 uint64_t size;
6408 zone_pva_t pg = z->pages_sequester;
6409 struct zone_page_metadata *page_meta;
6410 while (pg.packed_address) {
6411 page_meta = zone_pva_to_meta(pg, ZONE_ADDR_NATIVE);
6412 count += z->alloc_pages;
6413 pg = page_meta->zm_page_next;
6414 }
6415 assert(count == z->sequester_page_count);
6416 size = zone_size_wired(z);
6417 if (!size) {
6418 size = 1;
6419 }
6420 printf("%s%s: seq %d, res %d, %qd %%\n",
6421 zone_heap_name(z), z->z_name, z->sequester_page_count,
6422 z->page_count, zone_size_allocated(z) * 100ULL / size);
6423 unlock_zone(z);
6424 }
6425
6426 printf("total va: %d\n", allva);
6427
6428 assert(zone_pva_is_null(test_zone->pages_all_free));
6429 assert(!zone_pva_is_null(test_zone->pages_sequester));
6430 assert(2 == test_zone->sequester_page_count);
6431 for (idx = 0; idx < num_allocs; idx++) {
6432 assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
6433 }
6434 for (idx = 0; idx < num_allocs; idx++) {
6435 allocs[idx] = zalloc(test_zone);
6436 assert(allocs[idx]);
6437 printf("alloc[%d] %p\n", idx, allocs[idx]);
6438 }
6439 assert(zone_pva_is_null(test_zone->pages_sequester));
6440 assert(0 == test_zone->sequester_page_count);
6441 for (idx = 0; idx < num_allocs; idx++) {
6442 zfree(test_zone, allocs[idx]);
6443 }
6444 zdestroy(test_zone);
6445 } else {
6446 printf("run_zone_test: skipping sequester test (not enabled)\n");
6447 }
6448
6449 printf("run_zone_test: Test passed\n");
6450
6451 simple_lock(&zone_test_lock, &zone_locks_grp);
6452 zone_test_running = FALSE;
6453 simple_unlock(&zone_test_lock);
6454
6455 return TRUE;
6456 }
6457
6458 /*
6459 * Routines to test that zone garbage collection and zone replenish threads
6460 * running at the same time don't cause problems.
6461 */
6462
6463 void
6464 zone_gc_replenish_test(void)
6465 {
6466 zone_gc(FALSE);
6467 }
6468
6469
6470 void
6471 zone_alloc_replenish_test(void)
6472 {
6473 zone_t z = NULL;
6474 struct data { struct data *next; } *node, *list = NULL;
6475
6476 /*
6477 * Find a zone that has a replenish thread
6478 */
6479 zone_index_foreach(i) {
6480 z = &zone_array[i];
6481 if (z->prio_refill_count &&
6482 zone_elem_size(z) >= sizeof(struct data)) {
6483 z = &zone_array[i];
6484 break;
6485 }
6486 }
6487 if (z == NULL) {
6488 printf("Couldn't find a replenish zone\n");
6489 return;
6490 }
6491
6492 for (uint32_t i = 0; i < 2000; ++i) { /* something big enough to go past replenishment */
6493 node = zalloc(z);
6494 node->next = list;
6495 list = node;
6496 }
6497
6498 /*
6499 * release the memory we allocated
6500 */
6501 while (list != NULL) {
6502 node = list;
6503 list = list->next;
6504 zfree(z, node);
6505 }
6506 }
6507
6508 #endif /* DEBUG || DEVELOPMENT */