1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65 #include <zone_debug.h>
66
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach/vm_map.h>
74 #include <mach/sdt.h>
75
76 #include <kern/bits.h>
77 #include <kern/kern_types.h>
78 #include <kern/assert.h>
79 #include <kern/backtrace.h>
80 #include <kern/host.h>
81 #include <kern/macro_help.h>
82 #include <kern/sched.h>
83 #include <kern/locks.h>
84 #include <kern/sched_prim.h>
85 #include <kern/misc_protos.h>
86 #include <kern/thread_call.h>
87 #include <kern/zalloc.h>
88 #include <kern/kalloc.h>
89
90 #include <prng/random.h>
91
92 #include <vm/pmap.h>
93 #include <vm/vm_map.h>
94 #include <vm/vm_kern.h>
95 #include <vm/vm_page.h>
96
97 #include <pexpert/pexpert.h>
98
99 #include <machine/machparam.h>
100 #include <machine/machine_routines.h> /* ml_cpu_get_info */
101
102 #include <libkern/OSDebug.h>
103 #include <libkern/OSAtomic.h>
104 #include <libkern/section_keywords.h>
105 #include <sys/kdebug.h>
106
107 #include <san/kasan.h>
108
109 /*
110 * The zone_locks_grp allows for collecting lock statistics.
111 * All locks are associated with this group in zinit.
112 * Look at tools/lockstat for debugging lock contention.
113 */
114
115 lck_grp_t zone_locks_grp;
116 lck_grp_attr_t zone_locks_grp_attr;
117
118 /*
119 * ZONE_ALIAS_ADDR (deprecated)
120 */
121
122 #define from_zone_map(addr, size) \
123 ((vm_offset_t)(addr) >= zone_map_min_address && \
124 ((vm_offset_t)(addr) + size - 1) < zone_map_max_address )
125
126 /*
127 * Zone Corruption Debugging
128 *
129 * We use three techniques to detect modification of a zone element
130 * after it's been freed.
131 *
132 * (1) Check the freelist next pointer for sanity.
133 * (2) Store a backup of the next pointer at the end of the element,
134 * and compare it to the primary next pointer when the element is allocated
135 * to detect corruption of the freelist due to use-after-free bugs.
136 * The backup pointer is also XORed with a per-boot random cookie.
137 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
138 * and check for that value when the element is being reused to make sure
139 * no part of the element has been modified while it was on the freelist.
140 * This will also help catch read-after-frees, as code will now dereference
141 * 0xdeadbeef instead of a valid but freed pointer.
142 *
143 * (1) and (2) occur for every allocation and free to a zone.
144 * This is done to make it slightly more difficult for an attacker to
145 * manipulate the freelist to behave in a specific way.
146 *
147 * Poisoning (3) occurs periodically for every N frees (counted per-zone)
148 * and on every free for zones smaller than a cacheline. If -zp
149 * is passed as a boot arg, poisoning occurs for every free.
150 *
151 * Performance slowdown is inversely proportional to the frequency of poisoning,
152 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
153 * and higher. You can expect to find a 100% reproducible bug in an average of
154 * N tries, with a standard deviation of about N, but you will want to set
155 * "-zp" to always poison every free if you are attempting to reproduce
156 * a known bug.
157 *
158 * For a more heavyweight, but finer-grained method of detecting misuse
159 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
160 *
161 * Zone Corruption Logging
162 *
163 * You can also track where corruptions come from by using the boot-arguments
164 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
165 * in this document for more implementation and usage information.
166 *
167 * Zone Leak Detection
168 *
169 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
170 * found later in this file via the showtopztrace and showz* macros in kgmacros,
171 * or use zlog without the -zc argument.
172 *
173 */
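/*
 * For example, combining the options described above in boot-args (the zone
 * name is illustrative):
 *
 *	-zp zlog=kalloc.32 -zc
 *
 * poisons every free and logs allocations and frees of the kalloc.32 zone
 * for corruption debugging.
 */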
174
175 /* Returns TRUE if we rolled over the counter at factor */
176 static inline boolean_t
177 sample_counter(volatile uint32_t * count_p, uint32_t factor)
178 {
179 uint32_t old_count, new_count;
180 boolean_t rolled_over;
181
182 do {
183 new_count = old_count = *count_p;
184
185 if (++new_count >= factor) {
186 rolled_over = TRUE;
187 new_count = 0;
188 } else {
189 rolled_over = FALSE;
190 }
191 } while (!OSCompareAndSwap(old_count, new_count, count_p));
192
193 return rolled_over;
194 }
195
196 #if defined(__LP64__)
197 #define ZP_POISON 0xdeadbeefdeadbeef
198 #else
199 #define ZP_POISON 0xdeadbeef
200 #endif
201
202 boolean_t zfree_poison_element(zone_t zone, vm_offset_t elem);
203 void zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr);
204
205 #define ZP_DEFAULT_SAMPLING_FACTOR 16
206 #define ZP_DEFAULT_SCALE_FACTOR 4
207
208 /*
209 * A zp_factor of 0 indicates zone poisoning is disabled;
210 * however, we still poison zones smaller than zp_tiny_zone_limit (a cacheline).
211 * Passing the -no-zp boot-arg disables even this behavior.
212 * In all cases, we record and check the integrity of a backup pointer.
213 */
214
215 /* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */
216 #if DEBUG
217 #define DEFAULT_ZP_FACTOR (1)
218 #else
219 #define DEFAULT_ZP_FACTOR (0)
220 #endif
221 uint32_t zp_factor = DEFAULT_ZP_FACTOR;
222
223 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
224 uint32_t zp_scale = 0;
225
226 /* set in zp_init, zero indicates -no-zp boot-arg */
227 vm_size_t zp_tiny_zone_limit = 0;
228
229 /* initialized to a per-boot random value in zp_init */
230 uintptr_t zp_poisoned_cookie = 0;
231 uintptr_t zp_nopoison_cookie = 0;
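/*
 * A minimal sketch of how sample_counter() and the knobs above combine to
 * gate periodic poisoning. The helper below is hypothetical (the real
 * decision is made in zfree_poison_element() further down), and zp_count is
 * assumed to be the zone's per-zone sample counter.
 */
#if 0 /* sketch only */
static boolean_t
should_poison_on_free(zone_t zone)
{
	if (zone->elem_size <= zp_tiny_zone_limit) {
		return TRUE;    /* tiny zones are poisoned on every free */
	}
	if (zp_factor != 0 && sample_counter(&zone->zp_count, zp_factor)) {
		return TRUE;    /* counter rolled over: poison this free */
	}
	return FALSE;
}
#endif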
232
233 #if VM_MAX_TAG_ZONES
234 boolean_t zone_tagging_on;
235 #endif /* VM_MAX_TAG_ZONES */
236
237 SECURITY_READ_ONLY_LATE(boolean_t) copyio_zalloc_check = TRUE;
238 static struct bool_gen zone_bool_gen;
239
240 /*
241 * initialize zone poisoning
242 * called from zone_bootstrap before any allocations are made from zalloc
243 */
244 static inline void
245 zp_init(void)
246 {
247 char temp_buf[16];
248
249 /*
250 * Initialize backup pointer random cookie for poisoned elements
251 * Try not to call early_random() back to back, it may return
252 * the same value if mach_absolute_time doesn't have sufficient time
253 * to tick over between calls. <rdar://problem/11597395>
254 * (This is only a problem on embedded devices)
255 */
256 zp_poisoned_cookie = (uintptr_t) early_random();
257
258 /*
259 * Always poison zones smaller than a cacheline,
260 * because it's pretty close to free
261 */
262 ml_cpu_info_t cpu_info;
263 ml_cpu_get_info(&cpu_info);
264 zp_tiny_zone_limit = (vm_size_t) cpu_info.cache_line_size;
265
266 zp_factor = ZP_DEFAULT_SAMPLING_FACTOR;
267 zp_scale = ZP_DEFAULT_SCALE_FACTOR;
268
269 //TODO: Bigger permutation?
270 /*
271 * Permute the default factor +/- 1 to make it less predictable
272 * This adds or subtracts ~4 poisoned objects per 1000 frees.
273 */
274 if (zp_factor != 0) {
275 uint32_t rand_bits = early_random() & 0x3;
276
277 if (rand_bits == 0x1) {
278 zp_factor += 1;
279 } else if (rand_bits == 0x2) {
280 zp_factor -= 1;
281 }
282 /* if 0x0 or 0x3, leave it alone */
283 }
284
285 /* -zp: enable poisoning for every alloc and free */
286 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
287 zp_factor = 1;
288 }
289
290 /* -no-zp: disable poisoning completely even for tiny zones */
291 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
292 zp_factor = 0;
293 zp_tiny_zone_limit = 0;
294 printf("Zone poisoning disabled\n");
295 }
296
297 /* zp-factor=XXXX: override how often to poison freed zone elements */
298 if (PE_parse_boot_argn("zp-factor", &zp_factor, sizeof(zp_factor))) {
299 printf("Zone poisoning factor override: %u\n", zp_factor);
300 }
301
302 /* zp-scale=XXXX: override how much zone size scales zp-factor by */
303 if (PE_parse_boot_argn("zp-scale", &zp_scale, sizeof(zp_scale))) {
304 printf("Zone poisoning scale factor override: %u\n", zp_scale);
305 }
306
307 /* Initialize backup pointer random cookie for unpoisoned elements */
308 zp_nopoison_cookie = (uintptr_t) early_random();
309
310 #if MACH_ASSERT
311 if (zp_poisoned_cookie == zp_nopoison_cookie) {
312 panic("early_random() is broken: %p and %p are not random\n",
313 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
314 }
315 #endif
316
317 /*
318 * Use the last bit in the backup pointer to hint poisoning state
319 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
320 * the low bits are zero.
321 */
322 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
323 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
324
325 #if defined(__LP64__)
326 /*
327 * Make backup pointers more obvious in GDB for 64 bit
328 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
329 * (0xFACADE = 0xFFFFFF ^ 0x053521)
330 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
331 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
332 * by the sanity check, so it's OK for that part of the cookie to be predictable.
333 *
334 * TODO: Use #defines, xors, and shifts
335 */
336
337 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
338 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
339
340 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
341 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
342 #endif
343 }
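/*
 * Worked example of the 64-bit cookie tagging above: for an unpoisoned free
 * element whose next free element is at 0xFFFFFF80012345C0 (the high 3 bytes
 * of a zone pointer are always 0xFFFFFF), the stored backup pointer is
 *
 *	next ^ zp_nopoison_cookie = 0xC0FFEExxxxxxxxxx	(0xFFFFFF ^ 0x3f0011)
 *
 * while a poisoned element's backup pointer starts with 0xFACADE
 * (0xFFFFFF ^ 0x053521). The 'x' digits depend on the per-boot random low
 * bits of the cookie.
 */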
344
345 /*
346 * These macros are used to keep track of the number
347 * of pages being used by the zone currently. The
348 * z->page_count is not protected by the zone lock.
349 */
350 #define ZONE_PAGE_COUNT_INCR(z, count) \
351 { \
352 OSAddAtomic64(count, &(z->page_count)); \
353 }
354
355 #define ZONE_PAGE_COUNT_DECR(z, count) \
356 { \
357 OSAddAtomic64(-count, &(z->page_count)); \
358 }
359
360 vm_map_t zone_map = VM_MAP_NULL;
361
362 /* for is_sane_zone_element and garbage collection */
363
364 vm_offset_t zone_map_min_address = 0; /* initialized in zone_init */
365 vm_offset_t zone_map_max_address = 0;
366
367 /* Globals for random boolean generator for elements in free list */
368 #define MAX_ENTROPY_PER_ZCRAM 4
369
370 /* VM region for all metadata structures */
371 vm_offset_t zone_metadata_region_min = 0;
372 vm_offset_t zone_metadata_region_max = 0;
373 decl_lck_mtx_data(static, zone_metadata_region_lck);
374 lck_attr_t zone_metadata_lock_attr;
375 lck_mtx_ext_t zone_metadata_region_lck_ext;
376
377 /* Helpful for walking through a zone's free element list. */
378 struct zone_free_element {
379 struct zone_free_element *next;
380 /* ... */
381 /* void *backup_ptr; */
382 };
383
384 #if CONFIG_ZCACHE
385
386 /*
387 * Decides whether per-cpu zone caching is to be enabled for all zones.
388 * Can be set to TRUE via the boot-arg '-zcache_all'.
389 */
390 bool cache_all_zones = FALSE;
391
392 /*
393 * Specifies a single zone to enable CPU caching for.
394 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
395 */
396 static char cache_zone_name[MAX_ZONE_NAME];
397
398 static inline bool
399 zone_caching_enabled(zone_t z)
400 {
401 return z->cpu_cache_enabled && !z->tags && !z->zleak_on;
402 }
403
404 #endif /* CONFIG_ZCACHE */
405
406 /*
407 * Protects zone_array, num_zones, num_zones_in_use, and zone_empty_bitmap
408 */
409 decl_simple_lock_data(, all_zones_lock);
410 unsigned int num_zones_in_use;
411 unsigned int num_zones;
412
413 #if KASAN
414 #define MAX_ZONES 512
415 #else /* !KASAN */
416 #define MAX_ZONES 320
417 #endif/* !KASAN */
418 struct zone zone_array[MAX_ZONES];
419
420 /* Used to keep track of empty slots in the zone_array */
421 bitmap_t zone_empty_bitmap[BITMAP_LEN(MAX_ZONES)];
422
423 #if DEBUG || DEVELOPMENT
424 /*
425 * Used for sysctl kern.run_zone_test which is not thread-safe. Ensure only one thread goes through at a time.
426 * Otherwise we can end up with multiple test zones (if a second zinit() comes through before zdestroy()), which could lead us to
427 * run out of zones.
428 */
429 decl_simple_lock_data(, zone_test_lock);
430 static boolean_t zone_test_running = FALSE;
431 static zone_t test_zone_ptr = NULL;
432 #endif /* DEBUG || DEVELOPMENT */
433
434 #define PAGE_METADATA_GET_ZINDEX(page_meta) \
435 (page_meta->zindex)
436
437 #define PAGE_METADATA_GET_ZONE(page_meta) \
438 (&(zone_array[page_meta->zindex]))
439
440 #define PAGE_METADATA_SET_ZINDEX(page_meta, index) \
441 page_meta->zindex = (index);
442
443 struct zone_page_metadata {
444 queue_chain_t pages; /* linkage pointer for metadata lists */
445
446 /* Union for maintaining start of element free list and real metadata (for multipage allocations) */
447 union {
448 /*
449 * The start of the freelist can be maintained as a 32-bit offset instead of a pointer because
450 * the free elements would be at max ZONE_MAX_ALLOC_SIZE bytes away from the metadata. Offset
451 * from start of the allocation chunk to free element list head.
452 */
453 uint32_t freelist_offset;
454 /*
455 * This field is used to look up the real metadata for multipage allocations, where we mark the
456 * metadata for all pages except the first as "fake" metadata using MULTIPAGE_METADATA_MAGIC.
457 * Offset from this fake metadata back to the real metadata of the allocation chunk (negative offset).
458 */
459 uint32_t real_metadata_offset;
460 };
461
462 /*
463 * For the first page in the allocation chunk, this represents the total number of free elements in
464 * the chunk.
465 */
466 uint16_t free_count;
467 unsigned zindex : ZINDEX_BITS; /* Zone index within the zone_array */
468 unsigned page_count : PAGECOUNT_BITS; /* Count of pages within the allocation chunk */
469 };
470
471 /* Macro to get page index (within zone_map) of page containing element */
472 #define PAGE_INDEX_FOR_ELEMENT(element) \
473 (((vm_offset_t)trunc_page(element) - zone_map_min_address) / PAGE_SIZE)
474
475 /* Macro to get metadata structure given a page index in zone_map */
476 #define PAGE_METADATA_FOR_PAGE_INDEX(index) \
477 (zone_metadata_region_min + ((index) * sizeof(struct zone_page_metadata)))
478
479 /* Macro to get index (within zone_map) for given metadata */
480 #define PAGE_INDEX_FOR_METADATA(page_meta) \
481 (((vm_offset_t)page_meta - zone_metadata_region_min) / sizeof(struct zone_page_metadata))
482
483 /* Macro to get page for given page index in zone_map */
484 #define PAGE_FOR_PAGE_INDEX(index) \
485 (zone_map_min_address + (PAGE_SIZE * (index)))
486
487 /* Macro to get the actual metadata for a given address */
488 #define PAGE_METADATA_FOR_ELEMENT(element) \
489 (struct zone_page_metadata *)(PAGE_METADATA_FOR_PAGE_INDEX(PAGE_INDEX_FOR_ELEMENT(element)))
490
491 /* Magic value to indicate empty element free list */
492 #define PAGE_METADATA_EMPTY_FREELIST ((uint32_t)(~0))
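/*
 * Putting the macros above together: for an element address 'e' inside the
 * zone map, its metadata is found purely arithmetically as
 *
 *	idx  = (trunc_page(e) - zone_map_min_address) / PAGE_SIZE;
 *	meta = (struct zone_page_metadata *)
 *	        (zone_metadata_region_min + idx * sizeof(struct zone_page_metadata));
 *
 * i.e. one metadata slot per zone_map page, laid out linearly in the
 * zone_metadata_region. Foreign and multipage cases are handled separately
 * below.
 */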
493
494 vm_map_copy_t create_vm_map_copy(vm_offset_t start_addr, vm_size_t total_size, vm_size_t used_size);
495 boolean_t get_zone_info(zone_t z, mach_zone_name_t *zn, mach_zone_info_t *zi);
496 boolean_t is_zone_map_nearing_exhaustion(void);
497 extern void vm_pageout_garbage_collect(int collect);
498
499 static inline void *
500 page_metadata_get_freelist(struct zone_page_metadata *page_meta)
501 {
502 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
503 if (page_meta->freelist_offset == PAGE_METADATA_EMPTY_FREELIST) {
504 return NULL;
505 } else {
506 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
507 return (void *)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)) + page_meta->freelist_offset);
508 } else {
509 return (void *)((vm_offset_t)page_meta + page_meta->freelist_offset);
510 }
511 }
512 }
513
514 static inline void
515 page_metadata_set_freelist(struct zone_page_metadata *page_meta, void *addr)
516 {
517 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
518 if (addr == NULL) {
519 page_meta->freelist_offset = PAGE_METADATA_EMPTY_FREELIST;
520 } else {
521 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
522 page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
523 } else {
524 page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - (vm_offset_t)page_meta);
525 }
526 }
527 }
528
529 static inline struct zone_page_metadata *
530 page_metadata_get_realmeta(struct zone_page_metadata *page_meta)
531 {
532 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
533 return (struct zone_page_metadata *)((vm_offset_t)page_meta - page_meta->real_metadata_offset);
534 }
535
536 static inline void
537 page_metadata_set_realmeta(struct zone_page_metadata *page_meta, struct zone_page_metadata *real_meta)
538 {
539 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
540 assert(PAGE_METADATA_GET_ZINDEX(real_meta) != MULTIPAGE_METADATA_MAGIC);
541 assert((vm_offset_t)page_meta > (vm_offset_t)real_meta);
542 vm_offset_t offset = (vm_offset_t)page_meta - (vm_offset_t)real_meta;
543 assert(offset <= UINT32_MAX);
544 page_meta->real_metadata_offset = (uint32_t)offset;
545 }
546
547 /* The backup pointer is stored in the last pointer-sized location in an element. */
548 static inline vm_offset_t *
549 get_backup_ptr(vm_size_t elem_size,
550 vm_offset_t *element)
551 {
552 return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
553 }
554
555 /*
556 * Routine to populate the pages backing a metadata structure in the zone_metadata_region.
557 * Must be called without the zone lock held as it might potentially block.
558 */
559 static inline void
560 zone_populate_metadata_page(struct zone_page_metadata *page_meta)
561 {
562 vm_offset_t page_metadata_begin = trunc_page(page_meta);
563 vm_offset_t page_metadata_end = trunc_page((vm_offset_t)page_meta + sizeof(struct zone_page_metadata));
564
565 for (; page_metadata_begin <= page_metadata_end; page_metadata_begin += PAGE_SIZE) {
566 #if !KASAN
567 /*
568 * This can race with another thread doing a populate on the same metadata
569 * page, where we see an updated pmap but unmapped KASan shadow, causing a
570 * fault in the shadow when we first access the metadata page. Avoid this
571 * by always synchronizing on the zone_metadata_region lock with KASan.
572 */
573 if (pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
574 continue;
575 }
576 #endif
577 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
578 lck_mtx_lock(&zone_metadata_region_lck);
579 if (0 == pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
580 kern_return_t __assert_only ret = kernel_memory_populate(zone_map,
581 page_metadata_begin,
582 PAGE_SIZE,
583 KMA_KOBJECT,
584 VM_KERN_MEMORY_OSFMK);
585
586 /* should not fail with the given arguments */
587 assert(ret == KERN_SUCCESS);
588 }
589 lck_mtx_unlock(&zone_metadata_region_lck);
590 }
591 return;
592 }
593
594 static inline uint16_t
595 get_metadata_alloc_count(struct zone_page_metadata *page_meta)
596 {
597 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
598 struct zone *z = PAGE_METADATA_GET_ZONE(page_meta);
599 return (page_meta->page_count * PAGE_SIZE) / z->elem_size;
600 }
601
602 /*
603 * Routine to look up the metadata for any given address.
604 * If init is marked as TRUE, this should be called without holding the zone lock
605 * since the initialization might block.
606 */
607 static inline struct zone_page_metadata *
608 get_zone_page_metadata(struct zone_free_element *element, boolean_t init)
609 {
610 struct zone_page_metadata *page_meta = 0;
611
612 if (from_zone_map(element, sizeof(struct zone_free_element))) {
613 page_meta = (struct zone_page_metadata *)(PAGE_METADATA_FOR_ELEMENT(element));
614 if (init) {
615 zone_populate_metadata_page(page_meta);
616 }
617 } else {
618 page_meta = (struct zone_page_metadata *)(trunc_page((vm_offset_t)element));
619 }
620 if (init) {
621 bzero((char *)page_meta, sizeof(struct zone_page_metadata));
622 }
623 return (PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC) ? page_meta : page_metadata_get_realmeta(page_meta);
624 }
625
626 /* Routine to get the page for a given metadata */
627 static inline vm_offset_t
628 get_zone_page(struct zone_page_metadata *page_meta)
629 {
630 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
631 return (vm_offset_t)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
632 } else {
633 return (vm_offset_t)(trunc_page(page_meta));
634 }
635 }
636
637 /*
638 * Routine to panic if a pointer is not mapped to an expected zone.
639 * This can be used as a means of pinning an object to the zone it is expected
640 * to be a part of. Causes a panic if the address does not belong to any
641 * specified zone, does not belong to any zone, has been freed and therefore
642 * unmapped from the zone, or the pointer contains an uninitialized value that
643 * does not belong to any zone.
644 */
645
646 void
647 zone_require(void *addr, zone_t expected_zone)
648 {
649 struct zone *src_zone = NULL;
650 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
651
652 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
653 if (__improbable(src_zone == NULL)) {
654 panic("Address not in a zone for zone_require check (addr: %p)", addr);
655 }
656
657 if (__improbable(src_zone != expected_zone)) {
658 panic("Address not in expected zone for zone_require check (addr: %p, zone: %s)", addr, src_zone->zone_name);
659 }
660 }
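/*
 * Usage sketch (hypothetical names): before trusting a pointer handed in
 * from elsewhere, a caller can pin it to the zone it is supposed to come
 * from:
 *
 *	zone_require(obj, my_object_zone);
 *
 * which panics unless 'obj' was allocated from my_object_zone.
 */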
661
662 /*
663 * ZTAGS
664 */
665
666 #if VM_MAX_TAG_ZONES
667
668 // for zones with tagging enabled:
669
670 // calculate a pointer to the tag base entry,
671 // holding either a uint32_t giving the first tag offset for a page in the zone map,
672 // or two uint16_t tags if the page can only hold one or two elements
673
674 #define ZTAGBASE(zone, element) \
675 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_map_min_address)])
676
677 // pointer to the tag for an element
678 #define ZTAG(zone, element) \
679 ({ \
680 vm_tag_t * result; \
681 if ((zone)->tags_inline) { \
682 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
683 if ((page_mask & element) >= (zone)->elem_size) result++; \
684 } else { \
685 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / (zone)->elem_size]; \
686 } \
687 result; \
688 })
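/*
 * The stored tag keeps bit 0 as allocation state, so callers shift by one
 * (see try_alloc_from_zone() and zone_element_info() below):
 *
 *	ZTAG(zone, element)[0] = (tag << 1);	// store: b0 clear means in use
 *	tag = ZTAG(zone, element)[0] >> 1;	// read the tag back
 */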
689
690
691 static vm_offset_t zone_tagbase_min;
692 static vm_offset_t zone_tagbase_max;
693 static vm_offset_t zone_tagbase_map_size;
694 static vm_map_t zone_tagbase_map;
695
696 static vm_offset_t zone_tags_min;
697 static vm_offset_t zone_tags_max;
698 static vm_offset_t zone_tags_map_size;
699 static vm_map_t zone_tags_map;
700
701 // simple heap allocator for allocating the tags for new memory
702
703 decl_lck_mtx_data(, ztLock); /* heap lock */
704 enum{
705 ztFreeIndexCount = 8,
706 ztFreeIndexMax = (ztFreeIndexCount - 1),
707 ztTagsPerBlock = 4
708 };
709
710 struct ztBlock {
711 #if __LITTLE_ENDIAN__
712 uint64_t free:1,
713 next:21,
714 prev:21,
715 size:21;
716 #else
717 // ztBlock needs free bit least significant
718 #error !__LITTLE_ENDIAN__
719 #endif
720 };
721 typedef struct ztBlock ztBlock;
722
723 static ztBlock * ztBlocks;
724 static uint32_t ztBlocksCount;
725 static uint32_t ztBlocksFree;
726
727 static uint32_t
728 ztLog2up(uint32_t size)
729 {
730 if (1 == size) {
731 size = 0;
732 } else {
733 size = 32 - __builtin_clz(size - 1);
734 }
735 return size;
736 }
737
738 static uint32_t
739 ztLog2down(uint32_t size)
740 {
741 size = 31 - __builtin_clz(size);
742 return size;
743 }
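/*
 * For example: ztLog2up(5) == 3 and ztLog2down(5) == 2 (ceiling and floor of
 * log2 respectively), and ztLog2up(1) == 0 by the special case above.
 */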
744
745 static void
746 ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
747 {
748 vm_map_offset_t addr = (vm_map_offset_t) address;
749 vm_map_offset_t page, end;
750
751 page = trunc_page(addr);
752 end = round_page(addr + size);
753
754 for (; page < end; page += page_size) {
755 if (!pmap_find_phys(kernel_pmap, page)) {
756 kern_return_t __unused
757 ret = kernel_memory_populate(map, page, PAGE_SIZE,
758 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
759 assert(ret == KERN_SUCCESS);
760 }
761 }
762 }
763
764 static boolean_t
765 ztPresent(const void * address, size_t size)
766 {
767 vm_map_offset_t addr = (vm_map_offset_t) address;
768 vm_map_offset_t page, end;
769 boolean_t result;
770
771 page = trunc_page(addr);
772 end = round_page(addr + size);
773 for (result = TRUE; (page < end); page += page_size) {
774 result = pmap_find_phys(kernel_pmap, page);
775 if (!result) {
776 break;
777 }
778 }
779 return result;
780 }
781
782
783 void __unused
784 ztDump(boolean_t sanity);
785 void __unused
786 ztDump(boolean_t sanity)
787 {
788 uint32_t q, cq, p;
789
790 for (q = 0; q <= ztFreeIndexMax; q++) {
791 p = q;
792 do{
793 if (sanity) {
794 cq = ztLog2down(ztBlocks[p].size);
795 if (cq > ztFreeIndexMax) {
796 cq = ztFreeIndexMax;
797 }
798 if (!ztBlocks[p].free
799 || ((p != q) && (q != cq))
800 || (ztBlocks[ztBlocks[p].next].prev != p)
801 || (ztBlocks[ztBlocks[p].prev].next != p)) {
802 kprintf("zterror at %d", p);
803 ztDump(FALSE);
804 kprintf("zterror at %d", p);
805 assert(FALSE);
806 }
807 continue;
808 }
809 kprintf("zt[%03d]%c %d, %d, %d\n",
810 p, ztBlocks[p].free ? 'F' : 'A',
811 ztBlocks[p].next, ztBlocks[p].prev,
812 ztBlocks[p].size);
813 p = ztBlocks[p].next;
814 if (p == q) {
815 break;
816 }
817 }while (p != q);
818 if (!sanity) {
819 printf("\n");
820 }
821 }
822 if (!sanity) {
823 printf("-----------------------\n");
824 }
825 }
826
827
828
829 #define ZTBDEQ(idx) \
830 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
831 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
832
833 static void
834 ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
835 {
836 uint32_t q, w, p, size, merge;
837
838 assert(count);
839 ztBlocksFree += count;
840
841 	// merge with following
842 merge = (index + count);
843 if ((merge < ztBlocksCount)
844 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
845 && ztBlocks[merge].free) {
846 ZTBDEQ(merge);
847 count += ztBlocks[merge].size;
848 }
849
850 	// merge with preceding
851 merge = (index - 1);
852 if ((merge > ztFreeIndexMax)
853 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
854 && ztBlocks[merge].free) {
855 size = ztBlocks[merge].size;
856 count += size;
857 index -= size;
858 ZTBDEQ(index);
859 }
860
861 q = ztLog2down(count);
862 if (q > ztFreeIndexMax) {
863 q = ztFreeIndexMax;
864 }
865 w = q;
866 // queue in order of size
867 while (TRUE) {
868 p = ztBlocks[w].next;
869 if (p == q) {
870 break;
871 }
872 if (ztBlocks[p].size >= count) {
873 break;
874 }
875 w = p;
876 }
877 ztBlocks[p].prev = index;
878 ztBlocks[w].next = index;
879
880 // fault in first
881 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
882
883 // mark first & last with free flag and size
884 ztBlocks[index].free = TRUE;
885 ztBlocks[index].size = count;
886 ztBlocks[index].prev = w;
887 ztBlocks[index].next = p;
888 if (count > 1) {
889 index += (count - 1);
890 // fault in last
891 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
892 ztBlocks[index].free = TRUE;
893 ztBlocks[index].size = count;
894 }
895 }
896
897 static uint32_t
898 ztAlloc(zone_t zone, uint32_t count)
899 {
900 uint32_t q, w, p, leftover;
901
902 assert(count);
903
904 q = ztLog2up(count);
905 if (q > ztFreeIndexMax) {
906 q = ztFreeIndexMax;
907 }
908 do{
909 w = q;
910 while (TRUE) {
911 p = ztBlocks[w].next;
912 if (p == q) {
913 break;
914 }
915 if (ztBlocks[p].size >= count) {
916 // dequeue, mark both ends allocated
917 ztBlocks[w].next = ztBlocks[p].next;
918 ztBlocks[ztBlocks[p].next].prev = w;
919 ztBlocks[p].free = FALSE;
920 ztBlocksFree -= ztBlocks[p].size;
921 if (ztBlocks[p].size > 1) {
922 ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
923 }
924
925 // fault all the allocation
926 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
927 // mark last as allocated
928 if (count > 1) {
929 ztBlocks[p + count - 1].free = FALSE;
930 }
931 // free remainder
932 leftover = ztBlocks[p].size - count;
933 if (leftover) {
934 ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
935 }
936
937 return p;
938 }
939 w = p;
940 }
941 q++;
942 }while (q <= ztFreeIndexMax);
943
944 return -1U;
945 }
946
947 static void
948 ztInit(vm_size_t max_zonemap_size, lck_grp_t * group)
949 {
950 kern_return_t ret;
951 vm_map_kernel_flags_t vmk_flags;
952 uint32_t idx;
953
954 lck_mtx_init(&ztLock, group, LCK_ATTR_NULL);
955
956 // allocate submaps VM_KERN_MEMORY_DIAG
957
958 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
959 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
960 vmk_flags.vmkf_permanent = TRUE;
961 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
962 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
963 &zone_tagbase_map);
964
965 if (ret != KERN_SUCCESS) {
966 panic("zone_init: kmem_suballoc failed");
967 }
968 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
969
970 zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
971 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
972 vmk_flags.vmkf_permanent = TRUE;
973 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
974 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
975 &zone_tags_map);
976
977 if (ret != KERN_SUCCESS) {
978 panic("zone_init: kmem_suballoc failed");
979 }
980 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
981
982 ztBlocks = (ztBlock *) zone_tags_min;
983 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
984
985 // initialize the qheads
986 lck_mtx_lock(&ztLock);
987
988 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
989 for (idx = 0; idx < ztFreeIndexCount; idx++) {
990 ztBlocks[idx].free = TRUE;
991 ztBlocks[idx].next = idx;
992 ztBlocks[idx].prev = idx;
993 ztBlocks[idx].size = 0;
994 }
995 // free remaining space
996 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
997
998 lck_mtx_unlock(&ztLock);
999 }
1000
1001 static void
1002 ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
1003 {
1004 uint32_t * tagbase;
1005 uint32_t count, block, blocks, idx;
1006 size_t pages;
1007
1008 pages = atop(size);
1009 tagbase = ZTAGBASE(zone, mem);
1010
1011 lck_mtx_lock(&ztLock);
1012
1013 // fault tagbase
1014 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
1015
1016 if (!zone->tags_inline) {
1017 // allocate tags
1018 count = (uint32_t)(size / zone->elem_size);
1019 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1020 block = ztAlloc(zone, blocks);
1021 if (-1U == block) {
1022 ztDump(false);
1023 }
1024 assert(-1U != block);
1025 }
1026
1027 lck_mtx_unlock(&ztLock);
1028
1029 if (!zone->tags_inline) {
1030 // set tag base for each page
1031 block *= ztTagsPerBlock;
1032 for (idx = 0; idx < pages; idx++) {
1033 tagbase[idx] = block + (uint32_t)((ptoa(idx) + (zone->elem_size - 1)) / zone->elem_size);
1034 }
1035 }
1036 }
1037
1038 static void
1039 ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
1040 {
1041 uint32_t * tagbase;
1042 uint32_t count, block, blocks, idx;
1043 size_t pages;
1044
1045 // set tag base for each page
1046 pages = atop(size);
1047 tagbase = ZTAGBASE(zone, mem);
1048 block = tagbase[0];
1049 for (idx = 0; idx < pages; idx++) {
1050 tagbase[idx] = 0xFFFFFFFF;
1051 }
1052
1053 lck_mtx_lock(&ztLock);
1054 if (!zone->tags_inline) {
1055 count = (uint32_t)(size / zone->elem_size);
1056 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1057 assert(block != 0xFFFFFFFF);
1058 block /= ztTagsPerBlock;
1059 ztFree(NULL /* zone is unlocked */, block, blocks);
1060 }
1061
1062 lck_mtx_unlock(&ztLock);
1063 }
1064
1065 uint32_t
1066 zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
1067 {
1068 zone_t z;
1069 uint32_t idx;
1070
1071 simple_lock(&all_zones_lock, &zone_locks_grp);
1072
1073 for (idx = 0; idx < num_zones; idx++) {
1074 z = &(zone_array[idx]);
1075 if (!z->tags) {
1076 continue;
1077 }
1078 if (tag_zone_index != z->tag_zone_index) {
1079 continue;
1080 }
1081 *elem_size = z->elem_size;
1082 break;
1083 }
1084
1085 simple_unlock(&all_zones_lock);
1086
1087 if (idx == num_zones) {
1088 idx = -1U;
1089 }
1090
1091 return idx;
1092 }
1093
1094 #endif /* VM_MAX_TAG_ZONES */
1095
1096 /* Routine to get the size of a zone-allocated address. If the address doesn't belong to the
1097 * zone_map, returns 0.
1098 */
1099 vm_size_t
1100 zone_element_size(void *addr, zone_t *z)
1101 {
1102 struct zone *src_zone;
1103 if (from_zone_map(addr, sizeof(void *))) {
1104 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
1105 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
1106 if (z) {
1107 *z = src_zone;
1108 }
1109 return src_zone->elem_size;
1110 } else {
1111 #if CONFIG_GZALLOC
1112 vm_size_t gzsize;
1113 if (gzalloc_element_size(addr, z, &gzsize)) {
1114 return gzsize;
1115 }
1116 #endif /* CONFIG_GZALLOC */
1117
1118 return 0;
1119 }
1120 }
1121
1122 #if DEBUG || DEVELOPMENT
1123
1124 vm_size_t
1125 zone_element_info(void *addr, vm_tag_t * ptag)
1126 {
1127 vm_size_t size = 0;
1128 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1129 struct zone * src_zone;
1130
1131 if (from_zone_map(addr, sizeof(void *))) {
1132 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
1133 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
1134 #if VM_MAX_TAG_ZONES
1135 if (__improbable(src_zone->tags)) {
1136 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1137 }
1138 #endif /* VM_MAX_TAG_ZONES */
1139 size = src_zone->elem_size;
1140 } else {
1141 #if CONFIG_GZALLOC
1142 gzalloc_element_size(addr, NULL, &size);
1143 #endif /* CONFIG_GZALLOC */
1144 }
1145 *ptag = tag;
1146 return size;
1147 }
1148
1149 #endif /* DEBUG || DEVELOPMENT */
1150
1151 /*
1152 * Zone checking helper function.
1153 * A pointer that satisfies these conditions is OK to be a freelist next pointer;
1154 * a pointer that doesn't satisfy these conditions indicates corruption.
1155 */
1156 static inline boolean_t
1157 is_sane_zone_ptr(zone_t zone,
1158 vm_offset_t addr,
1159 size_t obj_size)
1160 {
1161 /* Must be aligned to pointer boundary */
1162 if (__improbable((addr & (sizeof(vm_offset_t) - 1)) != 0)) {
1163 return FALSE;
1164 }
1165
1166 /* Must be a kernel address */
1167 if (__improbable(!pmap_kernel_va(addr))) {
1168 return FALSE;
1169 }
1170
1171 /* Must be from zone map if the zone only uses memory from the zone_map */
1172 /*
1173 * TODO: Remove the zone->collectable check when every
1174 * zone using foreign memory is properly tagged with allows_foreign
1175 */
1176 if (zone->collectable && !zone->allows_foreign) {
1177 /* check if addr is from zone map */
1178 if (addr >= zone_map_min_address &&
1179 (addr + obj_size - 1) < zone_map_max_address) {
1180 return TRUE;
1181 }
1182
1183 return FALSE;
1184 }
1185
1186 return TRUE;
1187 }
1188
1189 static inline boolean_t
1190 is_sane_zone_page_metadata(zone_t zone,
1191 vm_offset_t page_meta)
1192 {
1193 /* NULL page metadata structures are invalid */
1194 if (page_meta == 0) {
1195 return FALSE;
1196 }
1197 return is_sane_zone_ptr(zone, page_meta, sizeof(struct zone_page_metadata));
1198 }
1199
1200 static inline boolean_t
1201 is_sane_zone_element(zone_t zone,
1202 vm_offset_t addr)
1203 {
1204 /* NULL is OK because it indicates the tail of the list */
1205 if (addr == 0) {
1206 return TRUE;
1207 }
1208 return is_sane_zone_ptr(zone, addr, zone->elem_size);
1209 }
1210
1211 /* Someone wrote to freed memory. */
1212 __dead2
1213 static inline void
1214 zone_element_was_modified_panic(zone_t zone,
1215 vm_offset_t element,
1216 vm_offset_t found,
1217 vm_offset_t expected,
1218 vm_offset_t offset)
1219 {
1220 panic("a freed zone element has been modified in zone %s: expected %p but found %p, bits changed %p, at offset %d of %d in element %p, cookies %p %p",
1221 zone->zone_name,
1222 (void *) expected,
1223 (void *) found,
1224 (void *) (expected ^ found),
1225 (uint32_t) offset,
1226 (uint32_t) zone->elem_size,
1227 (void *) element,
1228 (void *) zp_nopoison_cookie,
1229 (void *) zp_poisoned_cookie);
1230 }
1231
1232 /*
1233 * The primary and backup pointers don't match.
1234 * Determine which one was likely the corrupted pointer, find out what it
1235 * probably should have been, and panic.
1236 */
1237 __dead2
1238 static void
1239 backup_ptr_mismatch_panic(zone_t zone,
1240 vm_offset_t element,
1241 vm_offset_t primary,
1242 vm_offset_t backup)
1243 {
1244 vm_offset_t likely_backup;
1245 vm_offset_t likely_primary;
1246
1247 likely_primary = primary ^ zp_nopoison_cookie;
1248 boolean_t sane_backup;
1249 boolean_t sane_primary = is_sane_zone_element(zone, likely_primary);
1250 boolean_t element_was_poisoned = (backup & 0x1) ? TRUE : FALSE;
1251
1252 #if defined(__LP64__)
1253 /* We can inspect the tag in the upper bits for additional confirmation */
1254 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
1255 element_was_poisoned = TRUE;
1256 } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
1257 element_was_poisoned = FALSE;
1258 }
1259 #endif
1260
1261 if (element_was_poisoned) {
1262 likely_backup = backup ^ zp_poisoned_cookie;
1263 sane_backup = is_sane_zone_element(zone, likely_backup);
1264 } else {
1265 likely_backup = backup ^ zp_nopoison_cookie;
1266 sane_backup = is_sane_zone_element(zone, likely_backup);
1267 }
1268
1269 /* The primary is definitely the corrupted one */
1270 if (!sane_primary && sane_backup) {
1271 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1272 }
1273
1274 /* The backup is definitely the corrupted one */
1275 if (sane_primary && !sane_backup) {
1276 zone_element_was_modified_panic(zone, element, backup,
1277 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
1278 zone->elem_size - sizeof(vm_offset_t));
1279 }
1280
1281 /*
1282 * Not sure which is the corrupted one.
1283 * It's less likely that the backup pointer was overwritten with
1284 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1285 * primary pointer has been overwritten with a sane but incorrect address.
1286 */
1287 if (sane_primary && sane_backup) {
1288 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1289 }
1290
1291 	/* Neither is sane, so just guess. */
1292 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1293 }
1294
1295 /*
1296 * Adds the element to the head of the zone's free list
1297 * Keeps a backup next-pointer at the end of the element
1298 */
1299 static inline void
1300 free_to_zone(zone_t zone,
1301 vm_offset_t element,
1302 boolean_t poison)
1303 {
1304 vm_offset_t old_head;
1305 struct zone_page_metadata *page_meta;
1306
1307 vm_offset_t *primary = (vm_offset_t *) element;
1308 vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary);
1309
1310 page_meta = get_zone_page_metadata((struct zone_free_element *)element, FALSE);
1311 assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
1312 old_head = (vm_offset_t)page_metadata_get_freelist(page_meta);
1313
1314 if (__improbable(!is_sane_zone_element(zone, old_head))) {
1315 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
1316 (void *) old_head, zone->zone_name);
1317 }
1318
1319 if (__improbable(!is_sane_zone_element(zone, element))) {
1320 panic("zfree: freeing invalid pointer %p to zone %s\n",
1321 (void *) element, zone->zone_name);
1322 }
1323
1324 if (__improbable(old_head == element)) {
1325 panic("zfree: double free of %p to zone %s\n",
1326 (void *) element, zone->zone_name);
1327 }
1328 /*
1329 * Always write a redundant next pointer
1330 * So that it is more difficult to forge, xor it with a random cookie
1331 * A poisoned element is indicated by using zp_poisoned_cookie
1332 * instead of zp_nopoison_cookie
1333 */
1334
1335 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
1336
1337 /*
1338 * Insert this element at the head of the free list. We also xor the
1339 * primary pointer with the zp_nopoison_cookie to make sure a free
1340 * element does not provide the location of the next free element directly.
1341 */
1342 *primary = old_head ^ zp_nopoison_cookie;
1343 page_metadata_set_freelist(page_meta, (struct zone_free_element *)element);
1344 page_meta->free_count++;
1345 if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
1346 if (page_meta->free_count == 1) {
1347 /* first foreign element freed on page, move from all_used */
1348 re_queue_tail(&zone->pages.any_free_foreign, &(page_meta->pages));
1349 } else {
1350 /* no other list transitions */
1351 }
1352 } else if (page_meta->free_count == get_metadata_alloc_count(page_meta)) {
1353 		/* whether the page was on the intermediate or all_used queue, move it to free */
1354 re_queue_tail(&zone->pages.all_free, &(page_meta->pages));
1355 zone->count_all_free_pages += page_meta->page_count;
1356 } else if (page_meta->free_count == 1) {
1357 /* first free element on page, move from all_used */
1358 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
1359 }
1360 zone->count--;
1361 zone->countfree++;
1362
1363 #if KASAN_ZALLOC
1364 kasan_poison_range(element, zone->elem_size, ASAN_HEAP_FREED);
1365 #endif
1366 }
1367
1368
1369 /*
1370 * Removes an element from the zone's free list, returning 0 if the free list is empty.
1371 * Verifies that the next-pointer and backup next-pointer are intact,
1372 * and verifies that a poisoned element hasn't been modified.
1373 */
1374 static inline vm_offset_t
1375 try_alloc_from_zone(zone_t zone,
1376 vm_tag_t tag __unused,
1377 boolean_t* check_poison)
1378 {
1379 vm_offset_t element;
1380 struct zone_page_metadata *page_meta;
1381
1382 *check_poison = FALSE;
1383
1384 /* if zone is empty, bail */
1385 if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign)) {
1386 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
1387 } else if (!queue_empty(&zone->pages.intermediate)) {
1388 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
1389 } else if (!queue_empty(&zone->pages.all_free)) {
1390 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
1391 assert(zone->count_all_free_pages >= page_meta->page_count);
1392 zone->count_all_free_pages -= page_meta->page_count;
1393 } else {
1394 return 0;
1395 }
1396 	/* Check if page_meta passes is_sane_zone_page_metadata */
1397 if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta))) {
1398 panic("zalloc: invalid metadata structure %p for freelist of zone %s\n",
1399 (void *) page_meta, zone->zone_name);
1400 }
1401 assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
1402 element = (vm_offset_t)page_metadata_get_freelist(page_meta);
1403
1404 if (__improbable(!is_sane_zone_ptr(zone, element, zone->elem_size))) {
1405 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
1406 (void *) element, zone->zone_name);
1407 }
1408
1409 vm_offset_t *primary = (vm_offset_t *) element;
1410 vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary);
1411
1412 /*
1413 * Since the primary next pointer is xor'ed with zp_nopoison_cookie
1414 * for obfuscation, retrieve the original value back
1415 */
1416 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
1417 vm_offset_t next_element_primary = *primary;
1418 vm_offset_t next_element_backup = *backup;
1419
1420 /*
1421 * backup_ptr_mismatch_panic will determine what next_element
1422 * should have been, and print it appropriately
1423 */
1424 if (__improbable(!is_sane_zone_element(zone, next_element))) {
1425 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
1426 }
1427
1428 /* Check the backup pointer for the regular cookie */
1429 if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) {
1430 /* Check for the poisoned cookie instead */
1431 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
1432 /* Neither cookie is valid, corruption has occurred */
1433 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
1434 }
1435
1436 /*
1437 * Element was marked as poisoned, so check its integrity before using it.
1438 */
1439 *check_poison = TRUE;
1440 }
1441
1442 /* Make sure the page_meta is at the correct offset from the start of page */
1443 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element, FALSE))) {
1444 panic("zalloc: Incorrect metadata %p found in zone %s page queue. Expected metadata: %p\n",
1445 page_meta, zone->zone_name, get_zone_page_metadata((struct zone_free_element *)element, FALSE));
1446 }
1447
1448 /* Make sure next_element belongs to the same page as page_meta */
1449 if (next_element) {
1450 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element, FALSE))) {
1451 panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n",
1452 (void *)next_element, (void *)element, zone->zone_name);
1453 }
1454 }
1455
1456 /* Remove this element from the free list */
1457 page_metadata_set_freelist(page_meta, (struct zone_free_element *)next_element);
1458 page_meta->free_count--;
1459
1460 if (page_meta->free_count == 0) {
1461 /* move to all used */
1462 re_queue_tail(&zone->pages.all_used, &(page_meta->pages));
1463 } else {
1464 if (!zone->allows_foreign || from_zone_map(element, zone->elem_size)) {
1465 if (get_metadata_alloc_count(page_meta) == page_meta->free_count + 1) {
1466 /* remove from free, move to intermediate */
1467 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
1468 }
1469 }
1470 }
1471 zone->countfree--;
1472 zone->count++;
1473 zone->sum_count++;
1474
1475 #if VM_MAX_TAG_ZONES
1476 if (__improbable(zone->tags)) {
1477 		// set the tag with b0 clear so the block remains in use
1478 ZTAG(zone, element)[0] = (tag << 1);
1479 }
1480 #endif /* VM_MAX_TAG_ZONES */
1481
1482
1483 #if KASAN_ZALLOC
1484 kasan_poison_range(element, zone->elem_size, ASAN_VALID);
1485 #endif
1486
1487 return element;
1488 }
1489
1490 /*
1491 * End of zone poisoning
1492 */
1493
1494 /*
1495 * Zone info options
1496 */
1497 #define ZINFO_SLOTS MAX_ZONES /* for now */
1498
1499 zone_t zone_find_largest(void);
1500
1501 /*
1502 * Async allocation of zones
1503 * This mechanism allows for bootstrapping an empty zone which is set up with
1504 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
1505 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
1506 * This will prime the zone for the next use.
1507 *
1508 * Currently the thread_callout function (zalloc_async) will loop through all zones
1509 * looking for any zone with async_pending set and do the work for it.
1510 *
1511 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
1512 * then zalloc_noblock to an empty zone may succeed.
1513 */
1514 void zalloc_async(
1515 thread_call_param_t p0,
1516 thread_call_param_t p1);
1517
1518 static thread_call_data_t call_async_alloc;
1519
1520 /*
1521 * Align elements that use the zone page list to 32 byte boundaries.
1522 */
1523 #define ZONE_ELEMENT_ALIGNMENT 32
1524
1525 #define zone_wakeup(zone) thread_wakeup((event_t)(zone))
1526 #define zone_sleep(zone) \
1527 (void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN_ALWAYS, (event_t)(zone), THREAD_UNINT);
1528
1529
1530 #define lock_zone_init(zone) \
1531 MACRO_BEGIN \
1532 lck_attr_setdefault(&(zone)->lock_attr); \
1533 lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext, \
1534 &zone_locks_grp, &(zone)->lock_attr); \
1535 MACRO_END
1536
1537 #define lock_try_zone(zone) lck_mtx_try_lock_spin(&zone->lock)
1538
1539 /*
1540 * Exclude more than one concurrent garbage collection
1541 */
1542 decl_lck_mtx_data(, zone_gc_lock);
1543
1544 lck_attr_t zone_gc_lck_attr;
1545 lck_grp_t zone_gc_lck_grp;
1546 lck_grp_attr_t zone_gc_lck_grp_attr;
1547 lck_mtx_ext_t zone_gc_lck_ext;
1548
1549 boolean_t zone_gc_allowed = TRUE;
1550 boolean_t panic_include_zprint = FALSE;
1551
1552 mach_memory_info_t *panic_kext_memory_info = NULL;
1553 vm_size_t panic_kext_memory_size = 0;
1554
1555 #define ZALLOC_DEBUG_ZONEGC 0x00000001
1556 #define ZALLOC_DEBUG_ZCRAM 0x00000002
1557
1558 #if DEBUG || DEVELOPMENT
1559 static uint32_t zalloc_debug = 0;
1560 #endif
1561
1562 /*
1563 * Zone leak debugging code
1564 *
1565 * When enabled, this code keeps a log to track allocations to a particular zone that have not
1566 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
1567 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
1568 * off by default.
1569 *
1570 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
1571 * is the name of the zone you wish to log.
1572 *
1573 * This code only tracks one zone, so you need to identify which one is leaking first.
1574 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
1575 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
1576 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
1577 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
1578 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
1579 * See the help in the kgmacros for usage info.
1580 *
1581 *
1582 * Zone corruption logging
1583 *
1584 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
1585 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
1586 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
1587 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
1588 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
1589 * corrupted to examine its history. This should lead to the source of the corruption.
1590 */
1591
1592 static boolean_t log_records_init = FALSE;
1593 static int log_records; /* size of the log, expressed in number of records */
1594
1595 #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
1596
1597 static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
1598 static int num_zones_logged = 0;
1599
1600 static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging, if any */
1601
1602 /* Log allocations and frees to help debug a zone element corruption */
1603 boolean_t corruption_debug_flag = DEBUG; /* enabled by "-zc" boot-arg */
1604 /* Make pointer-scanning leak detection possible for all zones */
1605
1606 #if DEBUG || DEVELOPMENT
1607 boolean_t leak_scan_debug_flag = FALSE; /* enabled by "-zl" boot-arg */
1608 #endif /* DEBUG || DEVELOPMENT */
1609
1610
1611 /*
1612 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
1613 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
1614 * is the number of stacks suspected of leaking, we don't need many records.
1615 */
1616
1617 #if defined(__LP64__)
1618 #define ZRECORDS_MAX 2560 /* Max records allowed in the log */
1619 #else
1620 #define ZRECORDS_MAX 1536 /* Max records allowed in the log */
1621 #endif
1622 #define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */
1623
1624 /*
1625 * Each record in the log contains a pointer to the zone element it refers to,
1626 * and a small array to hold the pc's from the stack trace. A
1627 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
1628 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
1629 * If the log fills, old records are replaced as if it were a circular buffer.
1630 */
1631
1632
1633 /*
1634 * Decide if we want to log this zone by doing a string compare between a zone name and the name
1635 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
1636 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
1637 * match a space in the zone name.
1638 */
1639
1640 int
1641 track_this_zone(const char *zonename, const char *logname)
1642 {
1643 unsigned int len;
1644 const char *zc = zonename;
1645 const char *lc = logname;
1646
1647 /*
1648 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1649 */
1650
1651 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1652 /*
1653 * If the current characters don't match, check for a space
1654 * in the zone name and a corresponding period in the log name.
1655 * If that's not there, then the strings don't match.
1656 */
1657
1658 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
1659 break;
1660 }
1661
1662 /*
1663 * The strings are equal so far. If we're at the end, then it's a match.
1664 */
1665
1666 if (*zc == '\0') {
1667 return TRUE;
1668 }
1669 }
1670
1671 return FALSE;
1672 }
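/*
 * For example, since boot-args cannot contain spaces, "zlog=vm.objects"
 * selects the zone named "vm objects":
 *
 *	track_this_zone("vm objects", "vm.objects")	returns TRUE
 *	track_this_zone("vm objects", "vm_objects")	returns FALSE
 */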
1673
1674
1675 /*
1676 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
1677 * the buffer for the records has been allocated.
1678 */
1679
1680 #define DO_LOGGING(z) (z->zone_logging == TRUE && z->zlog_btlog)
1681
1682 extern boolean_t kmem_alloc_ready;
1683
1684 #if CONFIG_ZLEAKS
1685 #pragma mark -
1686 #pragma mark Zone Leak Detection
1687
1688 /*
1689 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
1690 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
1691 * backtrace. On every free, we examine the table and, if the allocation was being tracked,
1692 * stop tracking it.
1693 *
1694 * We track the allocations in the zallocations hash table, which stores the address that was returned from
1695 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
1696 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
1697 * backtraces - we don't store them more than once.
1698 *
1699 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
1700 * a large amount of virtual space.
1701 */
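/*
 * Illustrative sketch of the sampling step: with the default zleak_sample_factor
 * of 1000, roughly one allocation in a thousand per zone has its backtrace
 * recorded, along the lines of:
 *
 *   if (zone->zleak_on &&
 *       sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
 *           // capture a backtrace and hand it to zleak_log()
 *   }
 *
 * The real check lives in zalloc_internal() further down in this file.
 */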
1702 #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
1703 #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
1704 #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
1705 #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
1706 uint32_t zleak_state = 0; /* State of collection, as above */
1707
1708 boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
1709 vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
1710 vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
1711 unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
1712
1713 /*
1714 * Counters for allocation statistics.
1715 */
1716
1717 /* Times two active records want to occupy the same spot */
1718 unsigned int z_alloc_collisions = 0;
1719 unsigned int z_trace_collisions = 0;
1720
1721 /* Times a new record lands on a spot previously occupied by a freed allocation */
1722 unsigned int z_alloc_overwrites = 0;
1723 unsigned int z_trace_overwrites = 0;
1724
1725 /* Times a new alloc or trace is put into the hash table */
1726 unsigned int z_alloc_recorded = 0;
1727 unsigned int z_trace_recorded = 0;
1728
1729 /* Times zleak_log returned false due to not being able to acquire the lock */
1730 unsigned int z_total_conflicts = 0;
1731
1732
1733 #pragma mark struct zallocation
1734 /*
1735 * Structure for keeping track of an allocation
1736 * An allocation bucket is in use if its element is not NULL
1737 */
1738 struct zallocation {
1739 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
1740 vm_size_t za_size; /* how much memory did this allocation take up? */
1741 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
1742 /* TODO: #if this out */
1743 uint32_t za_hit_count; /* for determining effectiveness of hash function */
1744 };
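/*
 * For reference, a sketch of the companion trace bucket, inferred from the
 * fields used below (the actual definition lives in the zalloc header, not here):
 *
 *   struct ztrace {
 *       vm_size_t  zt_size;         // total bytes of live allocations pointing at this trace
 *       uint32_t   zt_depth;        // number of valid frames in zt_stack
 *       uintptr_t  zt_stack[MAX_ZTRACE_DEPTH];  // captured return addresses
 *       uint32_t   zt_collisions;   // times a different trace hashed to this occupied bucket
 *       uint32_t   zt_hit_count;    // for judging hash distribution, like za_hit_count
 *   };
 */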
1745
1746 /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
1747 uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
1748 uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
1749
1750 vm_size_t zleak_max_zonemap_size;
1751
1752 /* Hashmaps of allocations and their corresponding traces */
1753 static struct zallocation* zallocations;
1754 static struct ztrace* ztraces;
1755
1756 /* not static so that panic can see this, see kern/debug.c */
1757 struct ztrace* top_ztrace;
1758
1759 /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
1760 static lck_spin_t zleak_lock;
1761 static lck_attr_t zleak_lock_attr;
1762 static lck_grp_t zleak_lock_grp;
1763 static lck_grp_attr_t zleak_lock_grp_attr;
1764
1765 /*
1766 * Initializes the zone leak monitor. Called from zone_init()
1767 */
1768 static void
1769 zleak_init(vm_size_t max_zonemap_size)
1770 {
1771 char scratch_buf[16];
1772 boolean_t zleak_enable_flag = FALSE;
1773
1774 zleak_max_zonemap_size = max_zonemap_size;
1775 zleak_global_tracking_threshold = max_zonemap_size / 2;
1776 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
1777
1778 #if CONFIG_EMBEDDED
1779 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
1780 zleak_enable_flag = TRUE;
1781 printf("zone leak detection enabled\n");
1782 } else {
1783 zleak_enable_flag = FALSE;
1784 printf("zone leak detection disabled\n");
1785 }
1786 #else /* CONFIG_EMBEDDED */
1787 /* -zleakoff (flag to disable zone leak monitor) */
1788 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
1789 zleak_enable_flag = FALSE;
1790 printf("zone leak detection disabled\n");
1791 } else {
1792 zleak_enable_flag = TRUE;
1793 printf("zone leak detection enabled\n");
1794 }
1795 #endif /* CONFIG_EMBEDDED */
1796
1797 /* zfactor=XXXX (override how often to sample the zone allocator) */
1798 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
1799 printf("Zone leak factor override: %u\n", zleak_sample_factor);
1800 }
1801
1802 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
1803 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
1804 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
1805 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1806 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
1807 printf("Override isn't a power of two, bad things might happen!\n");
1808 }
1809 }
1810
1811 /* zleak-traces=XXXX (override number of buckets in ztraces) */
1812 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
1813 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
1814 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1815 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
1816 printf("Override isn't a power of two, bad things might happen!\n");
1817 }
1818 }
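/*
 * Worked example of the power-of-two check used above (illustrative only):
 *   0x01000 & (0x01000 - 1) == 0x01000 & 0x00FFF == 0       -> power of two, OK
 *   0x01800 & (0x01800 - 1) == 0x01800 & 0x017FF == 0x1000  -> not a power of two
 */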
1819
1820 /* allocate the zleak_lock */
1821 lck_grp_attr_setdefault(&zleak_lock_grp_attr);
1822 lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr);
1823 lck_attr_setdefault(&zleak_lock_attr);
1824 lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr);
1825
1826 if (zleak_enable_flag) {
1827 zleak_state = ZLEAK_STATE_ENABLED;
1828 }
1829 }
1830
1831 #if CONFIG_ZLEAKS
1832
1833 /*
1834 * Support for kern.zleak.active sysctl - a simplified
1835 * version of the zleak_state variable.
1836 */
1837 int
1838 get_zleak_state(void)
1839 {
1840 if (zleak_state & ZLEAK_STATE_FAILED) {
1841 return -1;
1842 }
1843 if (zleak_state & ZLEAK_STATE_ACTIVE) {
1844 return 1;
1845 }
1846 return 0;
1847 }
1848
1849 #endif
1850
1851
1852 kern_return_t
1853 zleak_activate(void)
1854 {
1855 kern_return_t retval;
1856 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
1857 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
1858 void *allocations_ptr = NULL;
1859 void *traces_ptr = NULL;
1860
1861 /* Only one thread attempts to activate at a time */
1862 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1863 return KERN_SUCCESS;
1864 }
1865
1866 /* Indicate that we're doing the setup */
1867 lck_spin_lock(&zleak_lock);
1868 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1869 lck_spin_unlock(&zleak_lock);
1870 return KERN_SUCCESS;
1871 }
1872
1873 zleak_state |= ZLEAK_STATE_ACTIVATING;
1874 lck_spin_unlock(&zleak_lock);
1875
1876 /* Allocate and zero tables */
1877 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
1878 if (retval != KERN_SUCCESS) {
1879 goto fail;
1880 }
1881
1882 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
1883 if (retval != KERN_SUCCESS) {
1884 goto fail;
1885 }
1886
1887 bzero(allocations_ptr, z_alloc_size);
1888 bzero(traces_ptr, z_trace_size);
1889
1890 /* Everything's set. Install tables, mark active. */
1891 zallocations = allocations_ptr;
1892 ztraces = traces_ptr;
1893
1894 /*
1895 * Initialize the top_ztrace to the first entry in ztraces,
1896 * so we don't have to check for null in zleak_log
1897 */
1898 top_ztrace = &ztraces[0];
1899
1900 /*
1901 * Note that we do need a barrier between installing
1902 * the tables and setting the active flag, because the zfree()
1903 * path accesses the table without a lock if we're active.
1904 */
1905 lck_spin_lock(&zleak_lock);
1906 zleak_state |= ZLEAK_STATE_ACTIVE;
1907 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1908 lck_spin_unlock(&zleak_lock);
1909
1910 return KERN_SUCCESS;
1911
1912 fail:
1913 /*
1914 * If we fail to allocate memory, don't further tax
1915 * the system by trying again.
1916 */
1917 lck_spin_lock(&zleak_lock);
1918 zleak_state |= ZLEAK_STATE_FAILED;
1919 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1920 lck_spin_unlock(&zleak_lock);
1921
1922 if (allocations_ptr != NULL) {
1923 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
1924 }
1925
1926 if (traces_ptr != NULL) {
1927 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
1928 }
1929
1930 return retval;
1931 }
1932
1933 /*
1934 * TODO: What about allocations that never get deallocated,
1935 * especially ones with unique backtraces? Should we wait to record
1936 * until after boot has completed?
1937 * (How many persistent zallocs are there?)
1938 */
1939
1940 /*
1941 * This function records the allocation in the allocations table,
1942 * and stores the associated backtrace in the traces table
1943 * (or just increments the trace's refcount if that trace is already recorded).
1944 * If the allocation slot is already in use, the old allocation is replaced with the new one, and
1945 * the old allocation's trace has its refcount decremented.
1946 * If the trace slot is occupied by a different backtrace, the function bails without recording.
1947 * The refcount is weighted: it is incremented by the number of bytes the allocation consumes.
1948 * The return value tells the caller whether to try again next time; FALSE means the lock was busy.
1949 */
1950 static boolean_t
1951 zleak_log(uintptr_t* bt,
1952 uintptr_t addr,
1953 uint32_t depth,
1954 vm_size_t allocation_size)
1955 {
1956 /* Quit if there's someone else modifying the hash tables */
1957 if (!lck_spin_try_lock(&zleak_lock)) {
1958 z_total_conflicts++;
1959 return FALSE;
1960 }
1961
1962 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1963
1964 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
1965 struct ztrace* trace = &ztraces[trace_index];
1966
1967 allocation->za_hit_count++;
1968 trace->zt_hit_count++;
1969
1970 /*
1971 * If the allocation bucket we want to be in is occupied, and if the occupier
1972 * has the same trace as us, just bail.
1973 */
1974 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
1975 z_alloc_collisions++;
1976
1977 lck_spin_unlock(&zleak_lock);
1978 return TRUE;
1979 }
1980
1981 /* STEP 1: Store the backtrace in the traces array. */
1982 /* A size of zero indicates that the trace bucket is free. */
1983
1984 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
1985 /*
1986 * Different unique trace with same hash!
1987 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
1988 * and get out of the way for later chances
1989 */
1990 trace->zt_collisions++;
1991 z_trace_collisions++;
1992
1993 lck_spin_unlock(&zleak_lock);
1994 return TRUE;
1995 } else if (trace->zt_size > 0) {
1996 /* Same trace, already added, so increment refcount */
1997 trace->zt_size += allocation_size;
1998 } else {
1999 /* Found an unused trace bucket, record the trace here! */
2000 if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
2001 z_trace_overwrites++;
2002 }
2003
2004 z_trace_recorded++;
2005 trace->zt_size = allocation_size;
2006 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
2007
2008 trace->zt_depth = depth;
2009 trace->zt_collisions = 0;
2010 }
2011
2012 /* STEP 2: Store the allocation record in the allocations array. */
2013
2014 if (allocation->za_element != (uintptr_t) 0) {
2015 /*
2016 * Straight up replace any allocation record that was there. We don't want to do the work
2017 * to preserve the allocation entries that were there, because we only record a subset of the
2018 * allocations anyway.
2019 */
2020
2021 z_alloc_collisions++;
2022
2023 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
2024 /* Knock off old allocation's size, not the new allocation */
2025 associated_trace->zt_size -= allocation->za_size;
2026 } else if (allocation->za_trace_index != 0) {
2027 /* Slot previously used but not currently in use */
2028 z_alloc_overwrites++;
2029 }
2030
2031 allocation->za_element = addr;
2032 allocation->za_trace_index = trace_index;
2033 allocation->za_size = allocation_size;
2034
2035 z_alloc_recorded++;
2036
2037 if (top_ztrace->zt_size < trace->zt_size) {
2038 top_ztrace = trace;
2039 }
2040
2041 lck_spin_unlock(&zleak_lock);
2042 return TRUE;
2043 }
2044
2045 /*
2046 * Free the allocation record and release the stacktrace.
2047 * This should be as fast as possible because it will be called for every free.
2048 */
2049 static void
2050 zleak_free(uintptr_t addr,
2051 vm_size_t allocation_size)
2052 {
2053 if (addr == (uintptr_t) 0) {
2054 return;
2055 }
2056
2057 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2058
2059 /* Double-checked locking: check to find out if we're interested, lock, check to make
2060 * sure it hasn't changed, then modify it, and release the lock.
2061 */
2062
2063 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2064 /* if the allocation was the one, grab the lock, check again, then delete it */
2065 lck_spin_lock(&zleak_lock);
2066
2067 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2068 struct ztrace *trace;
2069
2070 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
2071 if (allocation->za_size != allocation_size) {
2072 panic("Freeing as size %lu memory that was allocated with size %lu\n",
2073 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
2074 }
2075
2076 trace = &ztraces[allocation->za_trace_index];
2077
2078 /* size of 0 indicates trace bucket is unused */
2079 if (trace->zt_size > 0) {
2080 trace->zt_size -= allocation_size;
2081 }
2082
2083 /* A NULL element means the allocation bucket is unused */
2084 allocation->za_element = 0;
2085 }
2086 lck_spin_unlock(&zleak_lock);
2087 }
2088 }
2089
2090 #endif /* CONFIG_ZLEAKS */
2091
2092 /* These functions live outside of CONFIG_ZLEAKS because they are also used in
2093 * mbuf.c for mbuf leak detection. This is why they lack the z_ prefix.
2094 */
2095
2096 /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
2097 uintptr_t
2098 hash_mix(uintptr_t x)
2099 {
2100 #ifndef __LP64__
2101 x += ~(x << 15);
2102 x ^= (x >> 10);
2103 x += (x << 3);
2104 x ^= (x >> 6);
2105 x += ~(x << 11);
2106 x ^= (x >> 16);
2107 #else
2108 x += ~(x << 32);
2109 x ^= (x >> 22);
2110 x += ~(x << 13);
2111 x ^= (x >> 8);
2112 x += (x << 3);
2113 x ^= (x >> 15);
2114 x += ~(x << 27);
2115 x ^= (x >> 31);
2116 #endif
2117 return x;
2118 }
2119
2120 uint32_t
2121 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2122 {
2123 uintptr_t hash = 0;
2124 uintptr_t mask = max_size - 1;
2125
2126 while (depth) {
2127 hash += bt[--depth];
2128 }
2129
2130 hash = hash_mix(hash) & mask;
2131
2132 assert(hash < max_size);
2133
2134 return (uint32_t) hash;
2135 }
2136
2137 /*
2138 * TODO: Determine how well distributed this is.
2139 * max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0x0FFFF, which makes a clean bitmask.
2140 */
2141 uint32_t
2142 hashaddr(uintptr_t pt, uint32_t max_size)
2143 {
2144 uintptr_t hash = 0;
2145 uintptr_t mask = max_size - 1;
2146
2147 hash = hash_mix(pt) & mask;
2148
2149 assert(hash < max_size);
2150
2151 return (uint32_t) hash;
2152 }
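/*
 * Worked example (illustrative value): with max_size == 0x10000 buckets,
 * mask == 0xFFFF, so hashaddr() reduces the mixed pointer to its low 16 bits:
 *
 *   hash_mix(ptr) == 0xFFFFFF8012345678 (made-up value)  ->  bucket 0x5678
 */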
2153
2154 /* End of all leak-detection code */
2155 #pragma mark -
2156
2157 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
2158 #define ZONE_ALLOC_FRAG_PERCENT(alloc_size, ele_size) (((alloc_size % ele_size) * 100) / alloc_size)
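/*
 * Worked example (illustrative, assuming 4 KB pages): for a 1536-byte element,
 *   ZONE_ALLOC_FRAG_PERCENT(4096, 1536)  == (1024 * 100) / 4096  == 25
 *   ZONE_ALLOC_FRAG_PERCENT(8192, 1536)  == ( 512 * 100) / 8192  ==  6
 *   ZONE_ALLOC_FRAG_PERCENT(12288, 1536) == (   0 * 100) / 12288 ==  0
 * so the allocation-size loop in zinit() below settles on a 12 KB chunk.
 */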
2159
2160 /* Used to manage copying in of new zone names */
2161 static vm_offset_t zone_names_start;
2162 static vm_offset_t zone_names_next;
2163
2164 static vm_size_t
2165 compute_element_size(vm_size_t requested_size)
2166 {
2167 vm_size_t element_size = requested_size;
2168
2169 /* Zone elements must fit both a next pointer and a backup pointer */
2170 vm_size_t minimum_element_size = sizeof(vm_offset_t) * 2;
2171 if (element_size < minimum_element_size) {
2172 element_size = minimum_element_size;
2173 }
2174
2175 /*
2176 * Round element size to a multiple of sizeof(pointer)
2177 * This also enforces that allocations will be aligned on pointer boundaries
2178 */
2179 element_size = ((element_size - 1) + sizeof(vm_offset_t)) -
2180 ((element_size - 1) % sizeof(vm_offset_t));
2181
2182 return element_size;
2183 }
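/*
 * Worked examples (illustrative, assuming LP64 where sizeof(vm_offset_t) == 8):
 *   compute_element_size(10) -> 16   (bumped up to hold the two freelist pointers)
 *   compute_element_size(20) -> 24   (rounded up to a multiple of 8)
 *   compute_element_size(64) -> 64   (already pointer-aligned)
 */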
2184
2185 #if KASAN_ZALLOC
2186
2187 /*
2188 * Called from zinit().
2189 *
2190 * Fixes up the zone's element size to incorporate the redzones.
2191 */
2192 static void
2193 kasan_update_element_size_for_redzone(
2194 zone_t zone, /* the zone that needs to be updated */
2195 vm_size_t *size, /* requested zone element size */
2196 vm_size_t *max, /* maximum memory to use */
2197 const char *name) /* zone name */
2198 {
2199 /* Expand the zone allocation size to include the redzones. For page-multiple
2200 * zones add a full guard page because they likely require alignment. kalloc
2201 * and fakestack handle their own KASan state, so ignore those zones. */
2202 /* XXX: remove this when zinit_with_options() is a thing */
2203 const char *kalloc_name = "kalloc.";
2204 const char *fakestack_name = "fakestack.";
2205 if (strncmp(name, kalloc_name, strlen(kalloc_name)) == 0) {
2206 zone->kasan_redzone = 0;
2207 } else if (strncmp(name, fakestack_name, strlen(fakestack_name)) == 0) {
2208 zone->kasan_redzone = 0;
2209 } else {
2210 if ((*size % PAGE_SIZE) != 0) {
2211 zone->kasan_redzone = KASAN_GUARD_SIZE;
2212 } else {
2213 zone->kasan_redzone = PAGE_SIZE;
2214 }
2215 *max = (*max / *size) * (*size + zone->kasan_redzone * 2);
2216 *size += zone->kasan_redzone * 2;
2217 }
2218 }
2219
2220 /*
2221 * Called from zalloc_internal() to fix up the address of the newly
2222 * allocated element.
2223 *
2224 * Returns the element address skipping over the redzone on the left.
2225 */
2226 static vm_offset_t
2227 kasan_fixup_allocated_element_address(
2228 zone_t zone, /* the zone the element belongs to */
2229 vm_offset_t addr) /* address of the element, including the redzone */
2230 {
2231 /* Fixup the return address to skip the redzone */
2232 if (zone->kasan_redzone) {
2233 addr = kasan_alloc(addr, zone->elem_size,
2234 zone->elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
2235 }
2236 return addr;
2237 }
2238
2239 /*
2240 * Called from zfree() to add the element being freed to the KASan quarantine.
2241 *
2242 * Returns true if the newly-freed element made it into the quarantine without
2243 * displacing another, false otherwise. In the latter case, addrp points to the
2244 * address of the displaced element, which will be freed by the zone.
2245 */
2246 static bool
2247 kasan_quarantine_freed_element(
2248 zone_t *zonep, /* the zone the element is being freed to */
2249 void **addrp) /* address of the element being freed */
2250 {
2251 zone_t zone = *zonep;
2252 void *addr = *addrp;
2253
2254 /*
2255 * Resize back to the real allocation size and hand off to the KASan
2256 * quarantine. `addr` may then point to a different allocation, if the
2257 * current element replaced another in the quarantine. The zone then
2258 * takes ownership of the swapped out free element.
2259 */
2260 vm_size_t usersz = zone->elem_size - 2 * zone->kasan_redzone;
2261 vm_size_t sz = usersz;
2262
2263 if (addr && zone->kasan_redzone) {
2264 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
2265 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
2266 assert(sz == zone->elem_size);
2267 }
2268 if (addr && zone->kasan_quarantine) {
2269 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
2270 if (!addr) {
2271 return TRUE;
2272 }
2273 }
2274 *addrp = addr;
2275 return FALSE;
2276 }
2277
2278 #endif /* KASAN_ZALLOC */
2279
2280 /*
2281 * zinit initializes a new zone. The zone data structures themselves
2282 * are stored in a zone, which is initially a static structure that
2283 * is initialized by zone_init.
2284 */
2285
2286 zone_t
2287 zinit(
2288 vm_size_t size, /* the size of an element */
2289 vm_size_t max, /* maximum memory to use */
2290 vm_size_t alloc, /* allocation size */
2291 const char *name) /* a name for the zone */
2292 {
2293 zone_t z;
2294
2295 size = compute_element_size(size);
2296
2297 simple_lock(&all_zones_lock, &zone_locks_grp);
2298
2299 assert(num_zones < MAX_ZONES);
2300 assert(num_zones_in_use <= num_zones);
2301
2302 /* If possible, find a previously zdestroy'ed zone in the zone_array that we can reuse instead of initializing a new zone. */
2303 for (int index = bitmap_first(zone_empty_bitmap, MAX_ZONES);
2304 index >= 0 && index < (int)num_zones;
2305 index = bitmap_next(zone_empty_bitmap, index)) {
2306 z = &(zone_array[index]);
2307
2308 /*
2309 * If the zone name and the element size are the same, we can just reuse the old zone struct.
2310 * Otherwise hand out a new zone from the zone_array.
2311 */
2312 if (!strcmp(z->zone_name, name)) {
2313 vm_size_t old_size = z->elem_size;
2314 #if KASAN_ZALLOC
2315 old_size -= z->kasan_redzone * 2;
2316 #endif
2317 if (old_size == size) {
2318 /* Clear the empty bit for this zone, increment num_zones_in_use, and mark the zone as valid again. */
2319 bitmap_clear(zone_empty_bitmap, index);
2320 num_zones_in_use++;
2321 z->zone_valid = TRUE;
2322 z->zone_destruction = FALSE;
2323
2324 /* All other state is already set up since the zone was previously in use. Return early. */
2325 simple_unlock(&all_zones_lock);
2326 return z;
2327 }
2328 }
2329 }
2330
2331 /* If we're here, it means we didn't find a zone above that we could simply reuse. Set up a new zone. */
2332
2333 /* Clear the empty bit for the new zone */
2334 bitmap_clear(zone_empty_bitmap, num_zones);
2335
2336 z = &(zone_array[num_zones]);
2337 z->index = num_zones;
2338
2339 num_zones++;
2340 num_zones_in_use++;
2341
2342 /*
2343 * Initialize the zone lock here before dropping the all_zones_lock. Otherwise we could race with
2344 * zalloc_async() and try to grab the zone lock before it has been initialized, causing a panic.
2345 */
2346 lock_zone_init(z);
2347
2348 simple_unlock(&all_zones_lock);
2349
2350 #if KASAN_ZALLOC
2351 kasan_update_element_size_for_redzone(z, &size, &max, name);
2352 #endif
2353
2354 max = round_page(max);
2355
2356 vm_size_t best_alloc = PAGE_SIZE;
2357
2358 if ((size % PAGE_SIZE) == 0) {
2359 /* zero fragmentation by definition */
2360 best_alloc = size;
2361 } else {
2362 vm_size_t alloc_size;
2363 for (alloc_size = (2 * PAGE_SIZE); alloc_size <= ZONE_MAX_ALLOC_SIZE; alloc_size += PAGE_SIZE) {
2364 if (ZONE_ALLOC_FRAG_PERCENT(alloc_size, size) < ZONE_ALLOC_FRAG_PERCENT(best_alloc, size)) {
2365 best_alloc = alloc_size;
2366 }
2367 }
2368 }
2369
2370 alloc = best_alloc;
2371 if (max && (max < alloc)) {
2372 max = alloc;
2373 }
2374
2375 z->free_elements = NULL;
2376 queue_init(&z->pages.any_free_foreign);
2377 queue_init(&z->pages.all_free);
2378 queue_init(&z->pages.intermediate);
2379 queue_init(&z->pages.all_used);
2380 z->cur_size = 0;
2381 z->page_count = 0;
2382 z->max_size = max;
2383 z->elem_size = size;
2384 z->alloc_size = alloc;
2385 z->count = 0;
2386 z->countfree = 0;
2387 z->count_all_free_pages = 0;
2388 z->sum_count = 0LL;
2389 z->doing_alloc_without_vm_priv = FALSE;
2390 z->doing_alloc_with_vm_priv = FALSE;
2391 z->exhaustible = FALSE;
2392 z->collectable = TRUE;
2393 z->allows_foreign = FALSE;
2394 z->expandable = TRUE;
2395 z->waiting = FALSE;
2396 z->async_pending = FALSE;
2397 z->caller_acct = TRUE;
2398 z->noencrypt = FALSE;
2399 z->no_callout = FALSE;
2400 z->async_prio_refill = FALSE;
2401 z->gzalloc_exempt = FALSE;
2402 z->alignment_required = FALSE;
2403 z->zone_replenishing = FALSE;
2404 z->prio_refill_watermark = 0;
2405 z->zone_replenish_thread = NULL;
2406 z->zp_count = 0;
2407 z->kasan_quarantine = TRUE;
2408 z->zone_valid = TRUE;
2409 z->zone_destruction = FALSE;
2410 z->cpu_cache_enabled = FALSE;
2411 z->clear_memory = FALSE;
2412
2413 #if CONFIG_ZLEAKS
2414 z->zleak_capture = 0;
2415 z->zleak_on = FALSE;
2416 #endif /* CONFIG_ZLEAKS */
2417
2418 /*
2419 * If the VM is ready to handle kmem_alloc requests, copy the zone name passed in.
2420 *
2421 * Else simply maintain a pointer to the name string. The only zones we'll actually have
2422 * to do this for would be the VM-related zones that are created very early on before any
2423 * kexts can be loaded (or unloaded). So we should be fine with just a pointer in this case.
2424 */
2425 if (kmem_alloc_ready) {
2426 size_t len = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
2427
2428 if (zone_names_start == 0 || ((zone_names_next - zone_names_start) + len) > PAGE_SIZE) {
2429 printf("zalloc: allocating memory for zone names buffer\n");
2430 kern_return_t retval = kmem_alloc_kobject(kernel_map, &zone_names_start,
2431 PAGE_SIZE, VM_KERN_MEMORY_OSFMK);
2432 if (retval != KERN_SUCCESS) {
2433 panic("zalloc: zone_names memory allocation failed");
2434 }
2435 bzero((char *)zone_names_start, PAGE_SIZE);
2436 zone_names_next = zone_names_start;
2437 }
2438
2439 strlcpy((char *)zone_names_next, name, len);
2440 z->zone_name = (char *)zone_names_next;
2441 zone_names_next += len;
2442 } else {
2443 z->zone_name = name;
2444 }
2445
2446 /*
2447 * Check for and set up zone leak detection if requested via boot-args. We recognize two
2448 * boot-args:
2449 *
2450 * zlog=<zone_to_log>
2451 * zrecs=<num_records_in_log>
2452 *
2453 * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
2454 * control the size of the log. If zrecs is not specified, a default value is used.
2455 */
2456
2457 if (num_zones_logged < max_num_zones_to_log) {
2458 int i = 1; /* zlog0 isn't allowed. */
2459 boolean_t zone_logging_enabled = FALSE;
2460 char zlog_name[MAX_ZONE_NAME] = ""; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2461
2462 while (i <= max_num_zones_to_log) {
2463 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2464
2465 if (PE_parse_boot_argn(zlog_name, zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2466 if (track_this_zone(z->zone_name, zone_name_to_log)) {
2467 if (z->zone_valid) {
2468 z->zone_logging = TRUE;
2469 zone_logging_enabled = TRUE;
2470 num_zones_logged++;
2471 break;
2472 }
2473 }
2474 }
2475 i++;
2476 }
2477
2478 if (zone_logging_enabled == FALSE) {
2479 /*
2480 * Backwards compatibility with the old boot-arg used to specify single-zone logging, i.e. zlog.
2481 * Needs to happen after the newer zlogn checks because the prefix will match all the zlogn
2482 * boot-args.
2483 */
2484 if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2485 if (track_this_zone(z->zone_name, zone_name_to_log)) {
2486 if (z->zone_valid) {
2487 z->zone_logging = TRUE;
2488 zone_logging_enabled = TRUE;
2489 num_zones_logged++;
2490 }
2491 }
2492 }
2493 }
2494
2495 if (log_records_init == FALSE && zone_logging_enabled == TRUE) {
2496 if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {
2497 /*
2498 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2499 * This prevents accidentally hogging too much kernel memory and making the system
2500 * unusable.
2501 */
2502
2503 log_records = MIN(ZRECORDS_MAX, log_records);
2504 log_records_init = TRUE;
2505 } else {
2506 log_records = ZRECORDS_DEFAULT;
2507 log_records_init = TRUE;
2508 }
2509 }
2510
2511 /*
2512 * If we want to log a zone, see if we need to allocate buffer space for the log. Some vm related zones are
2513 * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case. kmem_alloc_ready is set to
2514 * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work. If we want to log one
2515 * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
2516 * later on some other zone. So note we may be allocating a buffer to log a zone other than the one being initialized
2517 * right now.
2518 */
2519 if (kmem_alloc_ready) {
2520 zone_t curr_zone = NULL;
2521 unsigned int max_zones = 0, zone_idx = 0;
2522
2523 simple_lock(&all_zones_lock, &zone_locks_grp);
2524 max_zones = num_zones;
2525 simple_unlock(&all_zones_lock);
2526
2527 for (zone_idx = 0; zone_idx < max_zones; zone_idx++) {
2528 curr_zone = &(zone_array[zone_idx]);
2529
2530 if (!curr_zone->zone_valid) {
2531 continue;
2532 }
2533
2534 /*
2535 * We work with the zone unlocked here because we could end up needing the zone lock to
2536 * enable logging for this zone e.g. need a VM object to allocate memory to enable logging for the
2537 * VM objects zone.
2538 *
2539 * We don't expect these zones to be needed at this early a time in boot and so take this chance.
2540 */
2541 if (curr_zone->zone_logging && curr_zone->zlog_btlog == NULL) {
2542 curr_zone->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
2543
2544 if (curr_zone->zlog_btlog) {
2545 printf("zone: logging started for zone %s\n", curr_zone->zone_name);
2546 } else {
2547 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2548 curr_zone->zone_logging = FALSE;
2549 }
2550 }
2551 }
2552 }
2553 }
2554
2555 #if CONFIG_GZALLOC
2556 gzalloc_zone_init(z);
2557 #endif
2558
2559 #if CONFIG_ZCACHE
2560 /* Check if boot-arg specified it should have a cache */
2561 if (cache_all_zones || track_this_zone(name, cache_zone_name)) {
2562 zone_change(z, Z_CACHING_ENABLED, TRUE);
2563 }
2564 #endif
2565
2566 return z;
2567 }
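/*
 * Minimal usage sketch (hypothetical subsystem, type, and sizes; not part of the
 * original source): a caller sets up a zone once, then obtains elements with
 * zalloc() and releases them with zfree().
 */
#if 0   /* illustration only */
static zone_t widget_zone;                               /* hypothetical */

void
widget_zone_setup(void)
{
	widget_zone = zinit(sizeof(struct widget),       /* element size */
	    8192 * sizeof(struct widget),                /* max memory the zone may use */
	    0,                                           /* requested alloc size; zinit picks its own */
	    "widgets");                                  /* zone name */
}
#endif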
2568 unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count;
2569
2570 static void zone_replenish_thread(zone_t);
2571
2572 /* High priority VM privileged thread used to asynchronously refill a designated
2573 * zone, such as the reserved VM map entry zone.
2574 */
2575 __dead2
2576 static void
2577 zone_replenish_thread(zone_t z)
2578 {
2579 vm_size_t free_size;
2580 current_thread()->options |= TH_OPT_VMPRIV;
2581
2582 for (;;) {
2583 lock_zone(z);
2584 assert(z->zone_valid);
2585 z->zone_replenishing = TRUE;
2586 assert(z->prio_refill_watermark != 0);
2587 while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) {
2588 assert(z->doing_alloc_without_vm_priv == FALSE);
2589 assert(z->doing_alloc_with_vm_priv == FALSE);
2590 assert(z->async_prio_refill == TRUE);
2591
2592 unlock_zone(z);
2593 int zflags = KMA_KOBJECT | KMA_NOPAGEWAIT;
2594 vm_offset_t space, alloc_size;
2595 kern_return_t kr;
2596
2597 if (vm_pool_low()) {
2598 alloc_size = round_page(z->elem_size);
2599 } else {
2600 alloc_size = z->alloc_size;
2601 }
2602
2603 if (z->noencrypt) {
2604 zflags |= KMA_NOENCRYPT;
2605 }
2606
2607 if (z->clear_memory) {
2608 zflags |= KMA_ZERO;
2609 }
2610
2611 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
2612 if (is_zone_map_nearing_exhaustion()) {
2613 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2614 }
2615
2616 kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2617
2618 if (kr == KERN_SUCCESS) {
2619 zcram(z, space, alloc_size);
2620 } else if (kr == KERN_RESOURCE_SHORTAGE) {
2621 VM_PAGE_WAIT();
2622 } else if (kr == KERN_NO_SPACE) {
2623 kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2624 if (kr == KERN_SUCCESS) {
2625 zcram(z, space, alloc_size);
2626 } else {
2627 assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
2628 thread_block(THREAD_CONTINUE_NULL);
2629 }
2630 }
2631
2632 lock_zone(z);
2633 assert(z->zone_valid);
2634 zone_replenish_loops++;
2635 }
2636
2637 z->zone_replenishing = FALSE;
2638 /* Signal any potential throttled consumers, terminating
2639 * their timer-bounded waits.
2640 */
2641 thread_wakeup(z);
2642
2643 assert_wait(&z->zone_replenish_thread, THREAD_UNINT);
2644 unlock_zone(z);
2645 thread_block(THREAD_CONTINUE_NULL);
2646 zone_replenish_wakeups++;
2647 }
2648 }
2649
2650 void
2651 zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark)
2652 {
2653 z->prio_refill_watermark = low_water_mark;
2654
2655 z->async_prio_refill = TRUE;
2656 OSMemoryBarrier();
2657 kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread);
2658
2659 if (tres != KERN_SUCCESS) {
2660 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
2661 }
2662
2663 thread_deallocate(z->zone_replenish_thread);
2664 }
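/*
 * Usage sketch (the watermark value is hypothetical): this is meant for zones
 * whose elements are themselves needed to satisfy allocations, such as the
 * reserved VM map entry zone mentioned above:
 *
 *   zone_prio_refill_configure(reserved_zone, 6000);
 *
 * After this call, zalloc_internal() below wakes zone_replenish_thread()
 * whenever the zone's free space drops below 6000 elements' worth of memory.
 */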
2665
2666 void
2667 zdestroy(zone_t z)
2668 {
2669 unsigned int zindex;
2670
2671 assert(z != NULL);
2672
2673 lock_zone(z);
2674 assert(z->zone_valid);
2675
2676 /* Assert that the zone does not have any allocations in flight */
2677 assert(z->doing_alloc_without_vm_priv == FALSE);
2678 assert(z->doing_alloc_with_vm_priv == FALSE);
2679 assert(z->async_pending == FALSE);
2680 assert(z->waiting == FALSE);
2681 assert(z->async_prio_refill == FALSE);
2682
2683 #if !KASAN_ZALLOC
2684 /*
2685 * Unset the valid bit. We'll hit an assert failure on further operations on this zone, until zinit() is called again.
2686 * Leave the zone valid for KASan as we will see zfree's on quarantined free elements even after the zone is destroyed.
2687 */
2688 z->zone_valid = FALSE;
2689 #endif
2690 z->zone_destruction = TRUE;
2691 unlock_zone(z);
2692
2693 #if CONFIG_ZCACHE
2694 /* Zone caching is not supported for zones that can be destroyed; panic if it is enabled. */
2695 if (zone_caching_enabled(z)) {
2696 panic("zdestroy: Zone caching enabled for zone %s", z->zone_name);
2697 }
2698 #endif /* CONFIG_ZCACHE */
2699
2700 /* Dump all the free elements */
2701 drop_free_elements(z);
2702
2703 #if CONFIG_GZALLOC
2704 /* If the zone is gzalloc managed dump all the elements in the free cache */
2705 gzalloc_empty_free_cache(z);
2706 #endif
2707
2708 lock_zone(z);
2709
2710 #if !KASAN_ZALLOC
2711 /* Assert that all counts are zero */
2712 assert(z->count == 0);
2713 assert(z->countfree == 0);
2714 assert(z->cur_size == 0);
2715 assert(z->page_count == 0);
2716 assert(z->count_all_free_pages == 0);
2717
2718 /* Assert that all queues except the foreign queue are empty. The zone allocator doesn't know how to free up foreign memory. */
2719 assert(queue_empty(&z->pages.all_used));
2720 assert(queue_empty(&z->pages.intermediate));
2721 assert(queue_empty(&z->pages.all_free));
2722 #endif
2723
2724 zindex = z->index;
2725
2726 unlock_zone(z);
2727
2728 simple_lock(&all_zones_lock, &zone_locks_grp);
2729
2730 assert(!bitmap_test(zone_empty_bitmap, zindex));
2731 /* Mark the zone as empty in the bitmap */
2732 bitmap_set(zone_empty_bitmap, zindex);
2733 num_zones_in_use--;
2734 assert(num_zones_in_use > 0);
2735
2736 simple_unlock(&all_zones_lock);
2737 }
2738
2739 /* Initialize the metadata for an allocation chunk */
2740 static inline void
2741 zcram_metadata_init(vm_offset_t newmem, vm_size_t size, struct zone_page_metadata *chunk_metadata)
2742 {
2743 struct zone_page_metadata *page_metadata;
2744
2745 /* The first page is the real metadata for this allocation chunk. We mark the others as fake metadata */
2746 size -= PAGE_SIZE;
2747 newmem += PAGE_SIZE;
2748
2749 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2750 page_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
2751 assert(page_metadata != chunk_metadata);
2752 PAGE_METADATA_SET_ZINDEX(page_metadata, MULTIPAGE_METADATA_MAGIC);
2753 page_metadata_set_realmeta(page_metadata, chunk_metadata);
2754 page_metadata->free_count = 0;
2755 }
2756 return;
2757 }
2758
2759
2760 static void
2761 random_free_to_zone(
2762 zone_t zone,
2763 vm_offset_t newmem,
2764 vm_offset_t first_element_offset,
2765 int element_count,
2766 unsigned int *entropy_buffer)
2767 {
2768 vm_offset_t last_element_offset;
2769 vm_offset_t element_addr;
2770 vm_size_t elem_size;
2771 int index;
2772
2773 assert(element_count && element_count <= ZONE_CHUNK_MAXELEMENTS);
2774 elem_size = zone->elem_size;
2775 last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size);
2776 for (index = 0; index < element_count; index++) {
2777 assert(first_element_offset <= last_element_offset);
2778 if (
2779 #if DEBUG || DEVELOPMENT
2780 leak_scan_debug_flag || __improbable(zone->tags) ||
2781 #endif /* DEBUG || DEVELOPMENT */
2782 random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
2783 element_addr = newmem + first_element_offset;
2784 first_element_offset += elem_size;
2785 } else {
2786 element_addr = newmem + last_element_offset;
2787 last_element_offset -= elem_size;
2788 }
2789 if (element_addr != (vm_offset_t)zone) {
2790 zone->count++; /* compensate for free_to_zone */
2791 free_to_zone(zone, element_addr, FALSE);
2792 }
2793 zone->cur_size += elem_size;
2794 }
2795 }
2796
2797 /*
2798 * Cram the given memory into the specified zone. Update the zone page count accordingly.
2799 */
2800 void
2801 zcram(
2802 zone_t zone,
2803 vm_offset_t newmem,
2804 vm_size_t size)
2805 {
2806 vm_size_t elem_size;
2807 boolean_t from_zm = FALSE;
2808 int element_count;
2809 unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
2810
2811 /* Basic sanity checks */
2812 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
2813 assert(!zone->collectable || zone->allows_foreign
2814 || (from_zone_map(newmem, size)));
2815
2816 elem_size = zone->elem_size;
2817
2818 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START, zone->index, size);
2819
2820 if (from_zone_map(newmem, size)) {
2821 from_zm = TRUE;
2822 }
2823
2824 if (!from_zm) {
2825 /* We cannot support elements larger than page size for foreign memory because we
2826 * put metadata on the page itself for each page of foreign memory. We need to do
2827 * this in order to be able to reach the metadata when any element is freed
2828 */
2829 assert((zone->allows_foreign == TRUE) && (zone->elem_size <= (PAGE_SIZE - sizeof(struct zone_page_metadata))));
2830 }
2831
2832 #if DEBUG || DEVELOPMENT
2833 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
2834 kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name,
2835 (unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size);
2836 }
2837 #endif /* DEBUG || DEVELOPMENT */
2838
2839 ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE));
2840
2841 /*
2842 * Initialize the metadata for all pages. We don't need the zone lock
2843 * here because we are not manipulating any zone related state yet.
2844 */
2845
2846 struct zone_page_metadata *chunk_metadata;
2847 size_t zone_page_metadata_size = sizeof(struct zone_page_metadata);
2848
2849 assert((newmem & PAGE_MASK) == 0);
2850 assert((size & PAGE_MASK) == 0);
2851
2852 chunk_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
2853 chunk_metadata->pages.next = NULL;
2854 chunk_metadata->pages.prev = NULL;
2855 page_metadata_set_freelist(chunk_metadata, 0);
2856 PAGE_METADATA_SET_ZINDEX(chunk_metadata, zone->index);
2857 chunk_metadata->free_count = 0;
2858 assert((size / PAGE_SIZE) <= ZONE_CHUNK_MAXPAGES);
2859 chunk_metadata->page_count = (unsigned)(size / PAGE_SIZE);
2860
2861 zcram_metadata_init(newmem, size, chunk_metadata);
2862
2863 #if VM_MAX_TAG_ZONES
2864 if (__improbable(zone->tags)) {
2865 assert(from_zm);
2866 ztMemoryAdd(zone, newmem, size);
2867 }
2868 #endif /* VM_MAX_TAG_ZONES */
2869
2870 lock_zone(zone);
2871 assert(zone->zone_valid);
2872 enqueue_tail(&zone->pages.all_used, &(chunk_metadata->pages));
2873
2874 if (!from_zm) {
2875 /* For foreign memory, each page carries its own metadata at the start of the page,
2876 * so lay out the elements page by page, with each page's elements starting just past
2877 * the (suitably aligned) per-page metadata.
2878 */
2879
2880 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2881 vm_offset_t first_element_offset = 0;
2882 if (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT == 0) {
2883 first_element_offset = zone_page_metadata_size;
2884 } else {
2885 first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT));
2886 }
2887 element_count = (unsigned int)((PAGE_SIZE - first_element_offset) / elem_size);
2888 random_free_to_zone(zone, newmem, first_element_offset, element_count, entropy_buffer);
2889 }
2890 } else {
2891 element_count = (unsigned int)(size / elem_size);
2892 random_free_to_zone(zone, newmem, 0, element_count, entropy_buffer);
2893 }
2894 unlock_zone(zone);
2895
2896 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zone->index);
2897 }
2898
2899 /*
2900 * Fill a zone with enough memory to contain at least nelem elements.
2901 * Return the number of elements actually put into the zone, which may
2902 * be more than the caller asked for since the memory allocation is
2903 * rounded up to the next zone allocation size.
2904 */
2905 int
2906 zfill(
2907 zone_t zone,
2908 int nelem)
2909 {
2910 kern_return_t kr;
2911 vm_offset_t memory;
2912
2913 vm_size_t alloc_size = zone->alloc_size;
2914 vm_size_t elem_per_alloc = alloc_size / zone->elem_size;
2915 vm_size_t nalloc = (nelem + elem_per_alloc - 1) / elem_per_alloc;
2916 int zflags = KMA_KOBJECT;
2917
2918 if (zone->clear_memory) {
2919 zflags |= KMA_ZERO;
2920 }
2921
2922 /* Don't mix-and-match zfill with foreign memory */
2923 assert(!zone->allows_foreign);
2924
2925 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
2926 if (is_zone_map_nearing_exhaustion()) {
2927 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2928 }
2929
2930 kr = kernel_memory_allocate(zone_map, &memory, nalloc * alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2931 if (kr != KERN_SUCCESS) {
2932 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
2933 __func__, (unsigned long)(nalloc * alloc_size));
2934 return 0;
2935 }
2936
2937 for (vm_size_t i = 0; i < nalloc; i++) {
2938 zcram(zone, memory + i * alloc_size, alloc_size);
2939 }
2940
2941 return (int)(nalloc * elem_per_alloc);
2942 }
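/*
 * Worked example (illustrative sizes): for a zone with elem_size == 256 and
 * alloc_size == 4096, elem_per_alloc == 16. Asking for nelem == 50 gives
 * nalloc == (50 + 15) / 16 == 4 allocation chunks, so zfill() crams 4 * 4096
 * bytes and returns 64 -- more elements than requested, as documented above.
 */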
2943
2944 /*
2945 * Initialize the "zone of zones" which uses fixed memory allocated
2946 * earlier in memory initialization. zone_bootstrap is called
2947 * before zone_init.
2948 */
2949 void
2950 zone_bootstrap(void)
2951 {
2952 char temp_buf[16];
2953
2954 #if DEBUG || DEVELOPMENT
2955 if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug))) {
2956 zalloc_debug = 0;
2957 }
2958 #endif /* DEBUG || DEVELOPMENT */
2959
2960 /* Set up zone element poisoning */
2961 zp_init();
2962
2963 random_bool_init(&zone_bool_gen);
2964
2965 /* should zlog log to debug zone corruption instead of leaks? */
2966 if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
2967 corruption_debug_flag = TRUE;
2968 }
2969
2970 #if DEBUG || DEVELOPMENT
2971 /* should perform zone element size checking in copyin/copyout? */
2972 if (PE_parse_boot_argn("-no-copyio-zalloc-check", temp_buf, sizeof(temp_buf))) {
2973 copyio_zalloc_check = FALSE;
2974 }
2975 #if VM_MAX_TAG_ZONES
2976 /* enable tags for zones that ask for it */
2977 if (PE_parse_boot_argn("-zt", temp_buf, sizeof(temp_buf))) {
2978 zone_tagging_on = TRUE;
2979 }
2980 #endif /* VM_MAX_TAG_ZONES */
2981 /* disable element location randomization in a page */
2982 if (PE_parse_boot_argn("-zl", temp_buf, sizeof(temp_buf))) {
2983 leak_scan_debug_flag = TRUE;
2984 }
2985 #endif
2986
2987 simple_lock_init(&all_zones_lock, 0);
2988
2989 num_zones_in_use = 0;
2990 num_zones = 0;
2991 /* Mark all zones as empty */
2992 bitmap_full(zone_empty_bitmap, BITMAP_LEN(MAX_ZONES));
2993 zone_names_next = zone_names_start = 0;
2994
2995 #if DEBUG || DEVELOPMENT
2996 simple_lock_init(&zone_test_lock, 0);
2997 #endif /* DEBUG || DEVELOPMENT */
2998
2999 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
3000
3001 /* initializing global lock group for zones */
3002 lck_grp_attr_setdefault(&zone_locks_grp_attr);
3003 lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr);
3004
3005 lck_attr_setdefault(&zone_metadata_lock_attr);
3006 lck_mtx_init_ext(&zone_metadata_region_lck, &zone_metadata_region_lck_ext, &zone_locks_grp, &zone_metadata_lock_attr);
3007
3008 #if CONFIG_ZCACHE
3009 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
3010 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
3011 printf("zcache: caching enabled for zone %s\n", cache_zone_name);
3012 }
3013
3014 /* -zcache_all: enable per-cpu zone caching for all zones, overrides 'zcc_enable_for_zone_name'. */
3015 if (PE_parse_boot_argn("-zcache_all", temp_buf, sizeof(temp_buf))) {
3016 cache_all_zones = TRUE;
3017 printf("zcache: caching enabled for all zones\n");
3018 }
3019 #endif /* CONFIG_ZCACHE */
3020 }
3021
3022 /*
3023 * We're being very conservative here and picking a value of 95%. We might need to lower this if
3024 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
3025 */
3026 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
3027
3028 /*
3029 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
3030 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
3031 */
3032 unsigned int zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
3033
3034 /*
3035 * Returns pid of the task with the largest number of VM map entries.
3036 */
3037 extern pid_t find_largest_process_vm_map_entries(void);
3038
3039 /*
3040 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
3041 * For any other pid we try to kill that process synchronously.
3042 */
3043 boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
3044
3045 void
3046 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
3047 {
3048 *current_size = zone_map->size;
3049 *capacity = vm_map_max(zone_map) - vm_map_min(zone_map);
3050 }
3051
3052 void
3053 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
3054 {
3055 zone_t largest_zone = zone_find_largest();
3056 strlcpy(zone_name, largest_zone->zone_name, zone_name_len);
3057 *zone_size = largest_zone->cur_size;
3058 }
3059
3060 boolean_t
3061 is_zone_map_nearing_exhaustion(void)
3062 {
3063 uint64_t size = zone_map->size;
3064 uint64_t capacity = vm_map_max(zone_map) - vm_map_min(zone_map);
3065 if (size > ((capacity * zone_map_jetsam_limit) / 100)) {
3066 return TRUE;
3067 }
3068 return FALSE;
3069 }
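/*
 * Worked example (illustrative capacity): with a 1 GB zone map and the default
 * zone_map_jetsam_limit of 95, the threshold is (1073741824 * 95) / 100 ==
 * 1020054732 bytes, so is_zone_map_nearing_exhaustion() starts returning TRUE
 * once less than roughly 51 MB of the zone map remains unused.
 */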
3070
3071 extern zone_t vm_map_entry_zone;
3072 extern zone_t vm_object_zone;
3073
3074 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
3075
3076 /*
3077 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
3078 * to walk through the jetsam priority bands and kill processes.
3079 */
3080 static void
3081 kill_process_in_largest_zone(void)
3082 {
3083 pid_t pid = -1;
3084 zone_t largest_zone = zone_find_largest();
3085
3086 printf("zone_map_exhaustion: Zone map size %lld, capacity %lld [jetsam limit %d%%]\n", (uint64_t)zone_map->size,
3087 (uint64_t)(vm_map_max(zone_map) - vm_map_min(zone_map)), zone_map_jetsam_limit);
3088 printf("zone_map_exhaustion: Largest zone %s, size %lu\n", largest_zone->zone_name, (uintptr_t)largest_zone->cur_size);
3089
3090 /*
3091 * We want to make sure we don't call this function from userspace. Otherwise we could end up trying to synchronously kill the process
3092 * whose context we're in, causing the system to hang.
3093 */
3094 assert(current_task() == kernel_task);
3095
3096 /*
3097 * If vm_object_zone is the largest, check to see if the number of elements in vm_map_entry_zone is comparable. If so, consider
3098 * vm_map_entry_zone as the largest. This lets us target a specific process to jetsam to quickly recover from the zone map bloat.
3099 */
3100 if (largest_zone == vm_object_zone) {
3101 unsigned int vm_object_zone_count = vm_object_zone->count;
3102 unsigned int vm_map_entry_zone_count = vm_map_entry_zone->count;
3103 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3104 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
3105 largest_zone = vm_map_entry_zone;
3106 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n", (uintptr_t)largest_zone->cur_size);
3107 }
3108 }
3109
3110 /* TODO: Extend this to check for the largest process in other zones as well. */
3111 if (largest_zone == vm_map_entry_zone) {
3112 pid = find_largest_process_vm_map_entries();
3113 } else {
3114 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s]. Waking up memorystatus thread.\n", largest_zone->zone_name);
3115 }
3116 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
3117 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
3118 }
3119 }
3120
3121 /* Global initialization of Zone Allocator.
3122 * Runs after zone_bootstrap.
3123 */
3124 void
3125 zone_init(
3126 vm_size_t max_zonemap_size)
3127 {
3128 kern_return_t retval;
3129 vm_offset_t zone_min;
3130 vm_offset_t zone_max;
3131 vm_offset_t zone_metadata_space;
3132 unsigned int zone_pages;
3133 vm_map_kernel_flags_t vmk_flags;
3134
3135 #if VM_MAX_TAG_ZONES
3136 if (zone_tagging_on) {
3137 ztInit(max_zonemap_size, &zone_locks_grp);
3138 }
3139 #endif
3140
3141 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
3142 vmk_flags.vmkf_permanent = TRUE;
3143 retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size,
3144 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_ZONE,
3145 &zone_map);
3146
3147 if (retval != KERN_SUCCESS) {
3148 panic("zone_init: kmem_suballoc failed");
3149 }
3150 zone_max = zone_min + round_page(max_zonemap_size);
3151
3152 #if CONFIG_GZALLOC
3153 gzalloc_init(max_zonemap_size);
3154 #endif
3155
3156 /*
3157 * Set up garbage collection information:
3158 */
3159 zone_map_min_address = zone_min;
3160 zone_map_max_address = zone_max;
3161
3162 zone_pages = (unsigned int)atop_kernel(zone_max - zone_min);
3163 zone_metadata_space = round_page(zone_pages * sizeof(struct zone_page_metadata));
3164 retval = kernel_memory_allocate(zone_map, &zone_metadata_region_min, zone_metadata_space,
3165 0, KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_OSFMK);
3166 if (retval != KERN_SUCCESS) {
3167 panic("zone_init: zone_metadata_region initialization failed!");
3168 }
3169 zone_metadata_region_max = zone_metadata_region_min + zone_metadata_space;
3170
3171 #if defined(__LP64__)
3172 /*
3173 * ensure that any vm_page_t that gets created from
3174 * the vm_page zone can be packed properly (see vm_page.h
3175 * for the packing requirements).
3176 */
3177 if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_metadata_region_max))) != (vm_page_t)zone_metadata_region_max) {
3178 panic("VM_PAGE_PACK_PTR failed on zone_metadata_region_max - %p", (void *)zone_metadata_region_max);
3179 }
3180
3181 if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address))) != (vm_page_t)zone_map_max_address) {
3182 panic("VM_PAGE_PACK_PTR failed on zone_map_max_address - %p", (void *)zone_map_max_address);
3183 }
3184 #endif
3185
3186 lck_grp_attr_setdefault(&zone_gc_lck_grp_attr);
3187 lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr);
3188 lck_attr_setdefault(&zone_gc_lck_attr);
3189 lck_mtx_init_ext(&zone_gc_lock, &zone_gc_lck_ext, &zone_gc_lck_grp, &zone_gc_lck_attr);
3190
3191 #if CONFIG_ZLEAKS
3192 /*
3193 * Initialize the zone leak monitor
3194 */
3195 zleak_init(max_zonemap_size);
3196 #endif /* CONFIG_ZLEAKS */
3197
3198 #if VM_MAX_TAG_ZONES
3199 if (zone_tagging_on) {
3200 vm_allocation_zones_init();
3201 }
3202 #endif
3203
3204 int jetsam_limit_temp = 0;
3205 if (PE_parse_boot_argn("zone_map_jetsam_limit", &jetsam_limit_temp, sizeof(jetsam_limit_temp)) &&
3206 jetsam_limit_temp > 0 && jetsam_limit_temp <= 100) {
3207 zone_map_jetsam_limit = jetsam_limit_temp;
3208 }
3209 }
3210
3211 #pragma mark -
3212 #pragma mark zalloc_canblock
3213
3214 extern boolean_t early_boot_complete;
3215
3216 void
3217 zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr)
3218 {
3219 vm_offset_t inner_size = zone->elem_size;
3220 if (__improbable(check_poison && addr)) {
3221 vm_offset_t *element_cursor = ((vm_offset_t *) addr) + 1;
3222 vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *) addr);
3223
3224 for (; element_cursor < backup; element_cursor++) {
3225 if (__improbable(*element_cursor != ZP_POISON)) {
3226 zone_element_was_modified_panic(zone,
3227 addr,
3228 *element_cursor,
3229 ZP_POISON,
3230 ((vm_offset_t)element_cursor) - addr);
3231 }
3232 }
3233 }
3234
3235 if (addr) {
3236 /*
3237 * Clear out the old next pointer and backup to avoid leaking the cookie
3238 * and so that only values on the freelist have a valid cookie
3239 */
3240
3241 vm_offset_t *primary = (vm_offset_t *) addr;
3242 vm_offset_t *backup = get_backup_ptr(inner_size, primary);
3243
3244 *primary = ZP_POISON;
3245 *backup = ZP_POISON;
3246 }
3247 }
3248
3249 /*
3250 * When deleting page mappings from the kernel map, it might be necessary to split
3251 * apart an existing vm_map_entry. That means that a "free" operation will need to
3252 * *allocate* new vm_map_entry structures before it can free a page.
3253 *
3254 * This reserve here is the number of elements which are held back from everyone except
3255 * the zone_gc thread. This is done so the zone_gc thread should never have to wait for
3256 * the zone replenish thread for vm_map_entry structs. If it did, it could wind up
3257 * in a deadlock.
3258 */
3259 #define VM_MAP_ENTRY_RESERVE_CNT 8
3260
3261 /*
3262 * zalloc returns an element from the specified zone.
3263 */
3264 static void *
3265 zalloc_internal(
3266 zone_t zone,
3267 boolean_t canblock,
3268 boolean_t nopagewait,
3269 vm_size_t
3270 #if !VM_MAX_TAG_ZONES
3271 __unused
3272 #endif
3273 reqsize,
3274 vm_tag_t tag)
3275 {
3276 vm_offset_t addr = 0;
3277 kern_return_t retval;
3278 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
3279 unsigned int numsaved = 0;
3280 thread_t thr = current_thread();
3281 boolean_t check_poison = FALSE;
3282 boolean_t set_doing_alloc_with_vm_priv = FALSE;
3283
3284 #if CONFIG_ZLEAKS
3285 uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */
3286 #endif /* CONFIG_ZLEAKS */
3287
3288 #if KASAN
3289 /*
3290 * KASan uses zalloc() for fakestack, which can be called anywhere. However,
3291 * we make sure these calls can never block.
3292 */
3293 boolean_t irq_safe = FALSE;
3294 const char *fakestack_name = "fakestack.";
3295 if (strncmp(zone->zone_name, fakestack_name, strlen(fakestack_name)) == 0) {
3296 irq_safe = TRUE;
3297 }
3298 #elif MACH_ASSERT
3299 /* In every other case, zalloc() from interrupt context is unsafe. */
3300 const boolean_t irq_safe = FALSE;
3301 #endif
3302
3303 assert(zone != ZONE_NULL);
3304 assert(irq_safe || ml_get_interrupts_enabled() || ml_is_quiescing() || debug_mode_active() || !early_boot_complete);
3305
3306 #if CONFIG_GZALLOC
3307 addr = gzalloc_alloc(zone, canblock);
3308 #endif
3309 /*
3310 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
3311 */
3312 if (__improbable(DO_LOGGING(zone))) {
3313 numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH);
3314 }
3315
3316 #if CONFIG_ZLEAKS
3317 /*
3318 * Zone leak detection: capture a backtrace every zleak_sample_factor
3319 * allocations in this zone.
3320 */
3321 if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
3322 /* Avoid backtracing twice if zone logging is on */
3323 if (numsaved == 0) {
3324 zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
3325 } else {
3326 zleak_tracedepth = numsaved;
3327 }
3328 }
3329 #endif /* CONFIG_ZLEAKS */
3330
3331 #if VM_MAX_TAG_ZONES
3332 if (__improbable(zone->tags)) {
3333 vm_tag_will_update_zone(tag, zone->tag_zone_index);
3334 }
3335 #endif /* VM_MAX_TAG_ZONES */
3336
3337 #if CONFIG_ZCACHE
3338 if (__probable(addr == 0)) {
3339 if (zone_caching_enabled(zone)) {
3340 addr = zcache_alloc_from_cpu_cache(zone);
3341 if (addr) {
3342 #if KASAN_ZALLOC
3343 addr = kasan_fixup_allocated_element_address(zone, addr);
3344 #endif
3345 if (__improbable(DO_LOGGING(zone) && addr)) {
3346 btlog_add_entry(zone->zlog_btlog, (void *)addr,
3347 ZOP_ALLOC, (void **)zbt, numsaved);
3348 }
3349 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
3350 return (void *)addr;
3351 }
3352 }
3353 }
3354 #endif /* CONFIG_ZCACHE */
3355
3356 lock_zone(zone);
3357 assert(zone->zone_valid);
3358
3359 /*
3360 * Check if we need another thread to replenish the zone.
3361 * This is used for elements, like vm_map_entry, which are
3362 * needed themselves to implement zalloc().
3363 */
3364 if (zone->async_prio_refill && zone->zone_replenish_thread) {
3365 vm_size_t curr_free;
3366 vm_size_t refill_level;
3367 const vm_size_t reserved_min = VM_MAP_ENTRY_RESERVE_CNT * zone->elem_size;
3368
3369 for (;;) {
3370 curr_free = (zone->cur_size - (zone->count * zone->elem_size));
3371 refill_level = zone->prio_refill_watermark * zone->elem_size;
3372
3373 /*
3374 * Nothing to do if there are plenty of elements.
3375 */
3376 if (curr_free > refill_level) {
3377 break;
3378 }
3379
3380 /*
3381 * Wake up the replenish thread.
3382 */
3383 zone_replenish_wakeups_initiated++;
3384 thread_wakeup(&zone->zone_replenish_thread);
3385
3386 /*
3387 * If we:
3388 * - still have headroom, more than half the refill amount, or
3389 * - this is a VMPRIV thread and we're still above reserved, or
3390 * - this is the zone garbage collection thread which may use the reserve
3391 * then we don't have to wait for the replenish thread.
3392 *
3393 * The reserve for the garbage collection thread is to avoid a deadlock
3394 * on the zone_map_lock between the replenish thread and GC thread.
3395 */
3396 if (curr_free > refill_level / 2 ||
3397 ((thr->options & TH_OPT_VMPRIV) && curr_free > reserved_min) ||
3398 (thr->options & TH_OPT_ZONE_GC)) {
3399 break;
3400 }
3401 zone_replenish_throttle_count++;
3402 unlock_zone(zone);
3403 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
3404 thread_block(THREAD_CONTINUE_NULL);
3405 lock_zone(zone);
3406
3407 assert(zone->zone_valid);
3408 }
3409 }
3410
3411 if (__probable(addr == 0)) {
3412 addr = try_alloc_from_zone(zone, tag, &check_poison);
3413 }
3414
3415 /* If we're here because of zone_gc(), we didn't wait for zone_replenish_thread to finish.
3416 * So we need to ensure that we did successfully grab an element. And we only need to assert
3417 * this for zones that have a replenish thread configured (in this case, the Reserved VM map
3418 * entries zone). The value of reserved_min in the previous bit of code should have given us
3419 * headroom even though the GC thread didn't wait.
3420 */
3421 if ((thr->options & TH_OPT_ZONE_GC) && zone->async_prio_refill) {
3422 assert(addr != 0);
3423 }
3424
3425 while ((addr == 0) && canblock) {
3426 /*
3427 * zone is empty, try to expand it
3428 *
3429 * Note that we now allow up to 2 threads (1 vm_privileged and 1 non-vm_privileged)
3430 * to expand the zone concurrently... this is necessary to keep
3431 * vm_privileged threads running critical code needed to continue compressing/swapping
3432 * pages (i.e. making new free pages) from stalling behind non-vm_privileged threads
3433 * waiting to acquire free pages when the vm_page_free_count is below the
3434 * vm_page_free_reserved limit.
3435 */
3436 if ((zone->doing_alloc_without_vm_priv || zone->doing_alloc_with_vm_priv) &&
3437 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->doing_alloc_with_vm_priv)) {
3438 /*
3439 * This is a non-vm_privileged thread and a non-vm_privileged or
3440 * a vm_privileged thread is already expanding the zone...
3441 * OR
3442 * this is a vm_privileged thread and a vm_privileged thread is
3443 * already expanding the zone...
3444 *
3445 * In either case wait for a thread to finish, then try again.
3446 */
3447 zone->waiting = TRUE;
3448 zone_sleep(zone);
3449 } else {
3450 vm_offset_t space;
3451 vm_size_t alloc_size;
3452 int retry = 0;
3453
3454 if ((zone->cur_size + zone->elem_size) >
3455 zone->max_size) {
3456 if (zone->exhaustible) {
3457 break;
3458 }
3459 if (zone->expandable) {
3460 /*
3461 * We're willing to overflow certain
3462 * zones, but not without complaining.
3463 *
3464 * This is best used in conjunction
3465 * with the collectable flag. What we
3466 * want is an assurance we can get the
3467 * memory back, assuming there's no
3468 * leak.
3469 */
3470 zone->max_size += (zone->max_size >> 1);
3471 } else {
3472 unlock_zone(zone);
3473
3474 panic_include_zprint = TRUE;
3475 #if CONFIG_ZLEAKS
3476 if (zleak_state & ZLEAK_STATE_ACTIVE) {
3477 panic_include_ztrace = TRUE;
3478 }
3479 #endif /* CONFIG_ZLEAKS */
3480 panic("zalloc: zone \"%s\" empty.", zone->zone_name);
3481 }
3482 }
3483 /*
3484 * It is possible that a BG thread is refilling/expanding the zone
3485 * and gets pre-empted during that operation. That blocks all other
3486 * threads from making progress, leading to a watchdog timeout. To
3487 * avoid that, boost the thread priority using the rwlock boost.
3488 */
3489 set_thread_rwlock_boost();
3490
3491 if ((thr->options & TH_OPT_VMPRIV)) {
3492 zone->doing_alloc_with_vm_priv = TRUE;
3493 set_doing_alloc_with_vm_priv = TRUE;
3494 } else {
3495 zone->doing_alloc_without_vm_priv = TRUE;
3496 }
3497 unlock_zone(zone);
3498
3499 for (;;) {
3500 int zflags = KMA_KOBJECT | KMA_NOPAGEWAIT;
3501
3502 if (vm_pool_low() || retry >= 1) {
3503 alloc_size =
3504 round_page(zone->elem_size);
3505 } else {
3506 alloc_size = zone->alloc_size;
3507 }
3508
3509 if (zone->noencrypt) {
3510 zflags |= KMA_NOENCRYPT;
3511 }
3512
3513 if (zone->clear_memory) {
3514 zflags |= KMA_ZERO;
3515 }
3516
3517 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
3518 if (is_zone_map_nearing_exhaustion()) {
3519 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3520 }
3521
3522 retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
3523 if (retval == KERN_SUCCESS) {
3524 #if CONFIG_ZLEAKS
3525 if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) {
3526 if (zone_map->size >= zleak_global_tracking_threshold) {
3527 kern_return_t kr;
3528
3529 kr = zleak_activate();
3530 if (kr != KERN_SUCCESS) {
3531 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3532 }
3533 }
3534 }
3535
3536 if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) {
3537 if (zone->cur_size > zleak_per_zone_tracking_threshold) {
3538 zone->zleak_on = TRUE;
3539 }
3540 }
3541 #endif /* CONFIG_ZLEAKS */
3542 zcram(zone, space, alloc_size);
3543
3544 break;
3545 } else if (retval != KERN_RESOURCE_SHORTAGE) {
3546 retry++;
3547
3548 if (retry == 3) {
3549 panic_include_zprint = TRUE;
3550 #if CONFIG_ZLEAKS
3551 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3552 panic_include_ztrace = TRUE;
3553 }
3554 #endif /* CONFIG_ZLEAKS */
3555 if (retval == KERN_NO_SPACE) {
3556 zone_t zone_largest = zone_find_largest();
3557 panic("zalloc: zone map exhausted while allocating from zone %s, likely due to memory leak in zone %s (%lu total bytes, %d elements allocated)",
3558 zone->zone_name, zone_largest->zone_name,
3559 (unsigned long)zone_largest->cur_size, zone_largest->count);
3560 }
3561 panic("zalloc: \"%s\" (%d elements) retry fail %d", zone->zone_name, zone->count, retval);
3562 }
3563 } else {
3564 break;
3565 }
3566 }
3567 lock_zone(zone);
3568 assert(zone->zone_valid);
3569
3570 if (set_doing_alloc_with_vm_priv == TRUE) {
3571 zone->doing_alloc_with_vm_priv = FALSE;
3572 } else {
3573 zone->doing_alloc_without_vm_priv = FALSE;
3574 }
3575
3576 if (zone->waiting) {
3577 zone->waiting = FALSE;
3578 zone_wakeup(zone);
3579 }
3580 clear_thread_rwlock_boost();
3581
3582 addr = try_alloc_from_zone(zone, tag, &check_poison);
3583 if (addr == 0 &&
3584 retval == KERN_RESOURCE_SHORTAGE) {
3585 if (nopagewait == TRUE) {
3586 break; /* out of the main while loop */
3587 }
3588 unlock_zone(zone);
3589
3590 VM_PAGE_WAIT();
3591 lock_zone(zone);
3592 assert(zone->zone_valid);
3593 }
3594 }
3595 if (addr == 0) {
3596 addr = try_alloc_from_zone(zone, tag, &check_poison);
3597 }
3598 }
3599
3600 #if CONFIG_ZLEAKS
3601 /* Zone leak detection:
3602 * If we're sampling this allocation, add it to the zleaks hash table.
3603 */
3604 if (addr && zleak_tracedepth > 0) {
3605 /* Sampling can fail if another sample is happening at the same time in a different zone. */
3606 if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
3607 /* If it failed, roll back the counter so we sample the next allocation instead. */
3608 zone->zleak_capture = zleak_sample_factor;
3609 }
3610 }
3611 #endif /* CONFIG_ZLEAKS */
3612
3613
3614 if ((addr == 0) && (!canblock || nopagewait) && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
3615 zone->async_pending = TRUE;
3616 unlock_zone(zone);
3617 thread_call_enter(&call_async_alloc);
3618 lock_zone(zone);
3619 assert(zone->zone_valid);
3620 addr = try_alloc_from_zone(zone, tag, &check_poison);
3621 }
3622
3623 #if VM_MAX_TAG_ZONES
3624 if (__improbable(zone->tags) && addr) {
3625 if (reqsize) {
3626 reqsize = zone->elem_size - reqsize;
3627 }
3628 vm_tag_update_zone_size(tag, zone->tag_zone_index, zone->elem_size, reqsize);
3629 }
3630 #endif /* VM_MAX_TAG_ZONES */
3631
3632 unlock_zone(zone);
3633
3634 if (__improbable(DO_LOGGING(zone) && addr)) {
3635 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved);
3636 }
3637
3638 zalloc_poison_element(check_poison, zone, addr);
3639
3640 if (addr) {
3641 #if DEBUG || DEVELOPMENT
3642 if (__improbable(leak_scan_debug_flag && !(zone->elem_size & (sizeof(uintptr_t) - 1)))) {
3643 unsigned int count, idx;
3644 /* Fill element, from tail, with backtrace in reverse order */
3645 if (numsaved == 0) {
3646 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
3647 }
3648 count = (unsigned int)(zone->elem_size / sizeof(uintptr_t));
3649 if (count >= numsaved) {
3650 count = numsaved - 1;
3651 }
3652 for (idx = 0; idx < count; idx++) {
3653 ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
3654 }
3655 }
3656 #endif /* DEBUG || DEVELOPMENT */
3657 }
3658
3659 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);
3660
3661
3662 #if KASAN_ZALLOC
3663 addr = kasan_fixup_allocated_element_address(zone, addr);
3664 #endif
3665
3666 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
3667
3668 return (void *)addr;
3669 }
3670
3671 void *
3672 zalloc(zone_t zone)
3673 {
3674 return zalloc_internal(zone, TRUE, FALSE, 0, VM_KERN_MEMORY_NONE);
3675 }
3676
3677 void *
3678 zalloc_noblock(zone_t zone)
3679 {
3680 return zalloc_internal(zone, FALSE, FALSE, 0, VM_KERN_MEMORY_NONE);
3681 }
3682
3683 void *
3684 zalloc_nopagewait(zone_t zone)
3685 {
3686 return zalloc_internal(zone, TRUE, TRUE, 0, VM_KERN_MEMORY_NONE);
3687 }
3688
3689 void *
3690 zalloc_canblock_tag(zone_t zone, boolean_t canblock, vm_size_t reqsize, vm_tag_t tag)
3691 {
3692 return zalloc_internal(zone, canblock, FALSE, reqsize, tag);
3693 }
3694
3695 void *
3696 zalloc_canblock(zone_t zone, boolean_t canblock)
3697 {
3698 return zalloc_internal(zone, canblock, FALSE, 0, VM_KERN_MEMORY_NONE);
3699 }
3700
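/*
 * Illustrative usage sketch for the wrappers above (hypothetical caller, not
 * part of this file; "my_widget" and "my.widget" are made-up names). It shows
 * a subsystem creating its own zone and choosing between the blocking and
 * non-blocking entry points.
 *
 *	struct my_widget {
 *		uint64_t w_id;
 *		void    *w_data;
 *	};
 *
 *	static zone_t my_widget_zone;
 *
 *	static void
 *	my_widget_init(void)
 *	{
 *		my_widget_zone = zinit(sizeof(struct my_widget),
 *		    1024 * sizeof(struct my_widget),	// max size in bytes
 *		    16 * sizeof(struct my_widget),	// allocation chunk in bytes
 *		    "my.widget");
 *	}
 *
 *	static struct my_widget *
 *	my_widget_create(boolean_t can_sleep)
 *	{
 *		// zalloc() may block to expand the zone; zalloc_noblock() returns
 *		// NULL instead and is appropriate in contexts that cannot sleep.
 *		return can_sleep ? zalloc(my_widget_zone)
 *		    : zalloc_noblock(my_widget_zone);
 *	}
 *
 *	static void
 *	my_widget_destroy(struct my_widget *w)
 *	{
 *		zfree(my_widget_zone, w);
 *	}
 */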
3701 void *
3702 zalloc_attempt(zone_t zone)
3703 {
3704 boolean_t check_poison = FALSE;
3705 vm_offset_t addr = try_alloc_from_zone(zone, VM_KERN_MEMORY_NONE, &check_poison);
3706 zalloc_poison_element(check_poison, zone, addr);
3707 return (void *)addr;
3708 }
3709
3710 void
3711 zfree_direct(zone_t zone, vm_offset_t elem)
3712 {
3713 boolean_t poison = zfree_poison_element(zone, elem);
3714 free_to_zone(zone, elem, poison);
3715 }
3716
3717
3718 void
3719 zalloc_async(
3720 __unused thread_call_param_t p0,
3721 __unused thread_call_param_t p1)
3722 {
3723 zone_t current_z = NULL;
3724 unsigned int max_zones, i;
3725 void *elt = NULL;
3726 boolean_t pending = FALSE;
3727
3728 simple_lock(&all_zones_lock, &zone_locks_grp);
3729 max_zones = num_zones;
3730 simple_unlock(&all_zones_lock);
3731 for (i = 0; i < max_zones; i++) {
3732 current_z = &(zone_array[i]);
3733
3734 if (current_z->no_callout == TRUE) {
3735 /* async_pending will never be set */
3736 continue;
3737 }
3738
3739 lock_zone(current_z);
3740 if (current_z->zone_valid && current_z->async_pending == TRUE) {
3741 current_z->async_pending = FALSE;
3742 pending = TRUE;
3743 }
3744 unlock_zone(current_z);
3745
3746 if (pending == TRUE) {
3747 elt = zalloc_canblock_tag(current_z, TRUE, 0, VM_KERN_MEMORY_OSFMK);
3748 zfree(current_z, elt);
3749 pending = FALSE;
3750 }
3751 }
3752 }
3753
3754 /*
3755 * zget returns an element from the specified zone
3756 * and immediately returns NULL, without blocking, if none is available.
3757 */
3758 void *
3759 zget(
3760 zone_t zone)
3761 {
3762 return zalloc_internal(zone, FALSE, TRUE, 0, VM_KERN_MEMORY_NONE);
3763 }
3764
3765 /* Keep this FALSE by default. Large memory machines run orders of magnitude
3766 * slower in debug mode when true. Use the debugger to enable if needed. */
3767 /* static */ boolean_t zone_check = FALSE;
3768
3769 static void
3770 zone_check_freelist(zone_t zone, vm_offset_t elem)
3771 {
3772 struct zone_free_element *this;
3773 struct zone_page_metadata *thispage;
3774
3775 if (zone->allows_foreign) {
3776 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
3777 !queue_end(&zone->pages.any_free_foreign, &(thispage->pages));
3778 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3779 for (this = page_metadata_get_freelist(thispage);
3780 this != NULL;
3781 this = this->next) {
3782 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3783 panic("zone_check_freelist");
3784 }
3785 }
3786 }
3787 }
3788 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
3789 !queue_end(&zone->pages.all_free, &(thispage->pages));
3790 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3791 for (this = page_metadata_get_freelist(thispage);
3792 this != NULL;
3793 this = this->next) {
3794 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3795 panic("zone_check_freelist");
3796 }
3797 }
3798 }
3799 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
3800 !queue_end(&zone->pages.intermediate, &(thispage->pages));
3801 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3802 for (this = page_metadata_get_freelist(thispage);
3803 this != NULL;
3804 this = this->next) {
3805 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3806 panic("zone_check_freelist");
3807 }
3808 }
3809 }
3810 }
3811
3812 boolean_t
3813 zfree_poison_element(zone_t zone, vm_offset_t elem)
3814 {
3815 boolean_t poison = FALSE;
3816 if (zp_factor != 0 || zp_tiny_zone_limit != 0) {
3817 /*
3818 * Poison the memory before it ends up on the freelist to catch
3819 * use-after-free and use of uninitialized memory
3820 *
3821 * Always poison tiny zones' elements (limit is 0 if -no-zp is set)
3822 * Also poison larger elements periodically
3823 */
3824
3825 vm_offset_t inner_size = zone->elem_size;
3826
3827 uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);
3828
3829 if (inner_size <= zp_tiny_zone_limit) {
3830 poison = TRUE;
3831 } else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE) {
3832 poison = TRUE;
3833 }
3834
3835 if (__improbable(poison)) {
3836 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
3837 /* Poison everything but primary and backup */
3838 vm_offset_t *element_cursor = ((vm_offset_t *) elem) + 1;
3839 vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *)elem);
3840
3841 for (; element_cursor < backup; element_cursor++) {
3842 *element_cursor = ZP_POISON;
3843 }
3844 }
3845 }
3846 return poison;
3847 }
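/*
 * Worked example of the sampling policy above (illustrative numbers only; the
 * real zp_factor, zp_scale and zp_tiny_zone_limit are chosen at boot and may
 * be overridden by boot-args): with zp_factor = 16 and zp_scale = 4, a
 * 256-byte element gives sample_factor = 16 + (256 >> 4) = 32, so roughly one
 * in every 32 frees of that zone gets poisoned, while an element at or below
 * zp_tiny_zone_limit is poisoned on every free.
 */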
3848 void
3849 (zfree)(
3850 zone_t zone,
3851 void *addr)
3852 {
3853 vm_offset_t elem = (vm_offset_t) addr;
3854 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */
3855 unsigned int numsaved = 0;
3856 boolean_t gzfreed = FALSE;
3857 boolean_t poison = FALSE;
3858 #if VM_MAX_TAG_ZONES
3859 vm_tag_t tag;
3860 #endif /* VM_MAX_TAG_ZONES */
3861
3862 assert(zone != ZONE_NULL);
3863 DTRACE_VM2(zfree, zone_t, zone, void*, addr);
3864 #if KASAN_ZALLOC
3865 if (kasan_quarantine_freed_element(&zone, &addr)) {
3866 return;
3867 }
3868 elem = (vm_offset_t)addr;
3869 #endif
3870
3871 /*
3872 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
3873 */
3874
3875 if (__improbable(DO_LOGGING(zone) && corruption_debug_flag)) {
3876 numsaved = OSBacktrace((void *)zbt, MAX_ZTRACE_DEPTH);
3877 }
3878
3879 #if MACH_ASSERT
3880 /* Basic sanity checks */
3881 if (zone == ZONE_NULL || elem == (vm_offset_t)0) {
3882 panic("zfree: NULL");
3883 }
3884 #endif
3885
3886 #if CONFIG_GZALLOC
3887 gzfreed = gzalloc_free(zone, addr);
3888 #endif
3889
3890 if (!gzfreed) {
3891 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
3892 if (zone != PAGE_METADATA_GET_ZONE(page_meta)) {
3893 panic("Element %p from zone %s caught being freed to wrong zone %s\n", addr, PAGE_METADATA_GET_ZONE(page_meta)->zone_name, zone->zone_name);
3894 }
3895 }
3896
3897 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr);
3898
3899 if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign &&
3900 !from_zone_map(elem, zone->elem_size))) {
3901 panic("zfree: non-allocated memory in collectable zone!");
3902 }
3903
3904 if (!gzfreed) {
3905 poison = zfree_poison_element(zone, elem);
3906 }
3907
3908 /*
3909 * See if we're doing logging on this zone. There are two styles of logging used depending on
3910 * whether we're trying to catch a leak or corruption. See comments above in zalloc for details.
3911 */
3912
3913 if (__improbable(DO_LOGGING(zone))) {
3914 if (corruption_debug_flag) {
3915 /*
3916 * We're logging to catch a corruption. Add a record of this zfree operation
3917 * to the log.
3918 */
3919 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved);
3920 } else {
3921 /*
3922 * We're logging to catch a leak. Remove any record we might have for this
3923 * element since it's being freed. Note that we may not find it if the buffer
3924 * overflowed and that's OK. Since the log is of a limited size, old records
3925 * get overwritten if there are more zallocs than zfrees.
3926 */
3927 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
3928 }
3929 }
3930
3931 #if CONFIG_ZCACHE
3932 if (zone_caching_enabled(zone)) {
3933 int __assert_only ret = zcache_free_to_cpu_cache(zone, addr);
3934 assert(ret != FALSE);
3935 return;
3936 }
3937 #endif /* CONFIG_ZCACHE */
3938
3939 lock_zone(zone);
3940 assert(zone->zone_valid);
3941
3942 if (zone_check) {
3943 zone_check_freelist(zone, elem);
3944 }
3945
3946 if (__probable(!gzfreed)) {
3947 #if VM_MAX_TAG_ZONES
3948 if (__improbable(zone->tags)) {
3949 tag = (ZTAG(zone, elem)[0] >> 1);
3950 // set the tag with b0 clear so the block remains in use
3951 ZTAG(zone, elem)[0] = 0xFFFE;
3952 }
3953 #endif /* VM_MAX_TAG_ZONES */
3954 free_to_zone(zone, elem, poison);
3955 }
3956
3957 if (__improbable(zone->count < 0)) {
3958 panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone",
3959 zone->zone_name, addr);
3960 }
3961
3962 #if CONFIG_ZLEAKS
3963 /*
3964 * Zone leak detection: un-track the allocation
3965 */
3966 if (zone->zleak_on) {
3967 zleak_free(elem, zone->elem_size);
3968 }
3969 #endif /* CONFIG_ZLEAKS */
3970
3971 #if VM_MAX_TAG_ZONES
3972 if (__improbable(zone->tags) && __probable(!gzfreed)) {
3973 vm_tag_update_zone_size(tag, zone->tag_zone_index, -((int64_t)zone->elem_size), 0);
3974 }
3975 #endif /* VM_MAX_TAG_ZONES */
3976
3977 unlock_zone(zone);
3978 }
3979
3980 /* Change a zone's flags.
3981 * This routine must be called immediately after zinit.
3982 */
3983 void
3984 zone_change(
3985 zone_t zone,
3986 unsigned int item,
3987 boolean_t value)
3988 {
3989 assert( zone != ZONE_NULL );
3990 assert( value == TRUE || value == FALSE );
3991
3992 switch (item) {
3993 case Z_NOENCRYPT:
3994 zone->noencrypt = value;
3995 break;
3996 case Z_EXHAUST:
3997 zone->exhaustible = value;
3998 break;
3999 case Z_COLLECT:
4000 zone->collectable = value;
4001 break;
4002 case Z_EXPAND:
4003 zone->expandable = value;
4004 break;
4005 case Z_FOREIGN:
4006 zone->allows_foreign = value;
4007 break;
4008 case Z_CALLERACCT:
4009 zone->caller_acct = value;
4010 break;
4011 case Z_NOCALLOUT:
4012 zone->no_callout = value;
4013 break;
4014 case Z_TAGS_ENABLED:
4015 #if VM_MAX_TAG_ZONES
4016 {
4017 static int tag_zone_index;
4018 zone->tags = TRUE;
4019 zone->tags_inline = (((page_size + zone->elem_size - 1) / zone->elem_size) <= (sizeof(uint32_t) / sizeof(uint16_t)));
4020 zone->tag_zone_index = OSAddAtomic(1, &tag_zone_index);
4021 }
4022 #endif /* VM_MAX_TAG_ZONES */
4023 break;
4024 case Z_GZALLOC_EXEMPT:
4025 zone->gzalloc_exempt = value;
4026 #if CONFIG_GZALLOC
4027 gzalloc_reconfigure(zone);
4028 #endif
4029 break;
4030 case Z_ALIGNMENT_REQUIRED:
4031 zone->alignment_required = value;
4032 #if KASAN_ZALLOC
4033 if (zone->kasan_redzone == KASAN_GUARD_SIZE) {
4034 /* Don't disturb alignment with the redzone for zones with
4035 * specific alignment requirements. */
4036 zone->elem_size -= zone->kasan_redzone * 2;
4037 zone->kasan_redzone = 0;
4038 }
4039 #endif
4040 #if CONFIG_GZALLOC
4041 gzalloc_reconfigure(zone);
4042 #endif
4043 break;
4044 case Z_KASAN_QUARANTINE:
4045 zone->kasan_quarantine = value;
4046 break;
4047 case Z_CACHING_ENABLED:
4048 #if CONFIG_ZCACHE
4049 if (value == TRUE) {
4050 #if CONFIG_GZALLOC
4051 /*
4052 * Per cpu zone caching should be
4053 * disabled if gzalloc is enabled.
4054 */
4055 if (gzalloc_enabled()) {
4056 break;
4057 }
4058 #endif
4059 if (zcache_ready()) {
4060 zcache_init(zone);
4061 } else {
4062 zone->cpu_cache_enable_when_ready = TRUE;
4063 }
4064 }
4065 #endif
4066 break;
4067 case Z_CLEARMEMORY:
4068 zone->clear_memory = value;
4069 break;
4070 default:
4071 panic("Zone_change: Wrong Item Type!");
4072 /* break; */
4073 }
4074 }
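/*
 * Illustrative sketch (hypothetical zone, not part of this file): typical
 * zone_change() configuration immediately after zinit(), per the comment
 * above. "example.foo" and struct foo are made-up names.
 *
 *	zone_t z = zinit(sizeof(struct foo),
 *	    4096 * sizeof(struct foo), PAGE_SIZE, "example.foo");
 *	zone_change(z, Z_COLLECT, TRUE);	// let zone_gc() reclaim free pages
 *	zone_change(z, Z_EXPAND, TRUE);		// allow growth beyond max_size
 *	zone_change(z, Z_NOENCRYPT, TRUE);	// pages need not be encrypted
 *	zone_change(z, Z_CALLERACCT, FALSE);	// account to the kernel, not the caller
 */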
4075
4076 /*
4077 * Return the expected number of free elements in the zone.
4078 * This calculation will be incorrect if items are zfree'd that
4079 * were never zalloc'd/zget'd. The correct way to stuff memory
4080 * into a zone is by zcram.
4081 */
4082
4083 integer_t
4084 zone_free_count(zone_t zone)
4085 {
4086 integer_t free_count;
4087
4088 lock_zone(zone);
4089 free_count = zone->countfree;
4090 unlock_zone(zone);
4091
4092 assert(free_count >= 0);
4093
4094 return free_count;
4095 }
4096
4097 /*
4098 * Drops (i.e. frees) the elements in the all free pages queue of a zone.
4099 * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
4100 */
4101 void
4102 drop_free_elements(zone_t z)
4103 {
4104 vm_size_t elt_size;
4105 unsigned int total_freed_pages = 0;
4106 struct zone_page_metadata *page_meta;
4107 vm_address_t free_page_address;
4108 vm_size_t size_to_free;
4109
4110 lock_zone(z);
4111
4112 elt_size = z->elem_size;
4113
4114 while (!queue_empty(&z->pages.all_free)) {
4115 page_meta = (struct zone_page_metadata *)queue_first(&z->pages.all_free);
4116 assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */
4117 /*
4118 * Don't drain zones with async refill to below the refill threshold,
4119 * as they need some reserve to function properly.
4120 */
4121 if (!z->zone_destruction &&
4122 z->async_prio_refill && z->zone_replenish_thread &&
4123 (vm_size_t)(page_meta->free_count - z->countfree) < z->prio_refill_watermark) {
4124 break;
4125 }
4126
4127 (void)dequeue_head(&z->pages.all_free);
4128
4129 assert(z->countfree >= page_meta->free_count);
4130 z->countfree -= page_meta->free_count;
4131
4132 assert(z->count_all_free_pages >= page_meta->page_count);
4133 z->count_all_free_pages -= page_meta->page_count;
4134
4135 assert(z->cur_size >= page_meta->free_count * elt_size);
4136 z->cur_size -= page_meta->free_count * elt_size;
4137
4138 ZONE_PAGE_COUNT_DECR(z, page_meta->page_count);
4139 unlock_zone(z);
4140
4141 /* Free the pages for metadata and account for them */
4142 free_page_address = get_zone_page(page_meta);
4143 total_freed_pages += page_meta->page_count;
4144 size_to_free = page_meta->page_count * PAGE_SIZE;
4145 #if KASAN_ZALLOC
4146 kasan_poison_range(free_page_address, size_to_free, ASAN_VALID);
4147 #endif
4148 #if VM_MAX_TAG_ZONES
4149 if (z->tags) {
4150 ztMemoryRemove(z, free_page_address, size_to_free);
4151 }
4152 #endif /* VM_MAX_TAG_ZONES */
4153 kmem_free(zone_map, free_page_address, size_to_free);
4154 if (current_thread()->options & TH_OPT_ZONE_GC) {
4155 thread_yield_to_preemption();
4156 }
4157 lock_zone(z);
4158 }
4159 if (z->zone_destruction) {
4160 assert(queue_empty(&z->pages.all_free));
4161 assert(z->count_all_free_pages == 0);
4162 }
4163 unlock_zone(z);
4164
4165
4166 #if DEBUG || DEVELOPMENT
4167 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
4168 kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name,
4169 (unsigned long)((total_freed_pages * PAGE_SIZE) / elt_size), total_freed_pages);
4170 }
4171 #endif /* DEBUG || DEVELOPMENT */
4172 }
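/*
 * Worked example of the accounting above (illustrative numbers only): for a
 * zone with elem_size = 128 bytes and 4 KB pages, dropping an all-free
 * metadata run of 4 pages (128 elements) subtracts 128 from countfree,
 * 16 KB from cur_size, 4 from count_all_free_pages, and hands the 4 pages
 * back to the zone_map via kmem_free().
 */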
4173
4174 /* Zone garbage collection
4175 *
4176 * zone_gc will walk through all the free elements in all the
4177 * zones that are marked collectable looking for reclaimable
4178 * pages. zone_gc is called by consider_zone_gc when the system
4179 * begins to run out of memory.
4180 *
4181 * We should ensure that zone_gc never blocks.
4182 */
4183 void
4184 zone_gc(boolean_t consider_jetsams)
4185 {
4186 unsigned int max_zones;
4187 zone_t z;
4188 unsigned int i;
4189
4190 if (consider_jetsams) {
4191 kill_process_in_largest_zone();
4192 /*
4193 * If we do end up jetsamming something, we need to do a zone_gc so that
4194 * we can reclaim free zone elements and update the zone map size.
4195 * Fall through.
4196 */
4197 }
4198
4199 lck_mtx_lock(&zone_gc_lock);
4200
4201 current_thread()->options |= TH_OPT_ZONE_GC;
4202
4203 simple_lock(&all_zones_lock, &zone_locks_grp);
4204 max_zones = num_zones;
4205 simple_unlock(&all_zones_lock);
4206
4207 #if DEBUG || DEVELOPMENT
4208 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
4209 kprintf("zone_gc() starting...\n");
4210 }
4211 #endif /* DEBUG || DEVELOPMENT */
4212
4213 for (i = 0; i < max_zones; i++) {
4214 z = &(zone_array[i]);
4215 assert(z != ZONE_NULL);
4216
4217 if (!z->collectable) {
4218 continue;
4219 }
4220 #if CONFIG_ZCACHE
4221 if (zone_caching_enabled(z)) {
4222 zcache_drain_depot(z);
4223 }
4224 #endif /* CONFIG_ZCACHE */
4225 if (queue_empty(&z->pages.all_free)) {
4226 continue;
4227 }
4228
4229 drop_free_elements(z);
4230 }
4231
4232 current_thread()->options &= ~TH_OPT_ZONE_GC;
4233
4234 lck_mtx_unlock(&zone_gc_lock);
4235 }
4236
4237 extern vm_offset_t kmapoff_kaddr;
4238 extern unsigned int kmapoff_pgcnt;
4239
4240 /*
4241 * consider_zone_gc:
4242 *
4243 * Called by the pageout daemon when the system needs more free pages.
4244 */
4245
4246 void
4247 consider_zone_gc(boolean_t consider_jetsams)
4248 {
4249 if (kmapoff_kaddr != 0) {
4250 /*
4251 * One-time reclaim of kernel_map resources we allocated in
4252 * early boot.
4253 */
4254 (void) vm_deallocate(kernel_map,
4255 kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64);
4256 kmapoff_kaddr = 0;
4257 }
4258
4259 if (zone_gc_allowed) {
4260 zone_gc(consider_jetsams);
4261 }
4262 }
4263
4264 /*
4265 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
4266 * requesting zone information.
4267 * Frees unused pages towards the end of the region, and zeroes out unused
4268 * space on the last page.
4269 */
4270 vm_map_copy_t
4271 create_vm_map_copy(
4272 vm_offset_t start_addr,
4273 vm_size_t total_size,
4274 vm_size_t used_size)
4275 {
4276 kern_return_t kr;
4277 vm_offset_t end_addr;
4278 vm_size_t free_size;
4279 vm_map_copy_t copy;
4280
4281 if (used_size != total_size) {
4282 end_addr = start_addr + used_size;
4283 free_size = total_size - (round_page(end_addr) - start_addr);
4284
4285 if (free_size >= PAGE_SIZE) {
4286 kmem_free(ipc_kernel_map,
4287 round_page(end_addr), free_size);
4288 }
4289 bzero((char *) end_addr, round_page(end_addr) - end_addr);
4290 }
4291
4292 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
4293 (vm_map_size_t)used_size, TRUE, &copy);
4294 assert(kr == KERN_SUCCESS);
4295
4296 return copy;
4297 }
4298
4299 boolean_t
4300 get_zone_info(
4301 zone_t z,
4302 mach_zone_name_t *zn,
4303 mach_zone_info_t *zi)
4304 {
4305 struct zone zcopy;
4306
4307 assert(z != ZONE_NULL);
4308 lock_zone(z);
4309 if (!z->zone_valid) {
4310 unlock_zone(z);
4311 return FALSE;
4312 }
4313 zcopy = *z;
4314 unlock_zone(z);
4315
4316 if (zn != NULL) {
4317 /* assuming here the name data is static */
4318 (void) __nosan_strlcpy(zn->mzn_name, zcopy.zone_name,
4319 strlen(zcopy.zone_name) + 1);
4320 }
4321
4322 if (zi != NULL) {
4323 zi->mzi_count = (uint64_t)zcopy.count;
4324 zi->mzi_cur_size = ptoa_64(zcopy.page_count);
4325 zi->mzi_max_size = (uint64_t)zcopy.max_size;
4326 zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
4327 zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
4328 zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
4329 zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
4330 zi->mzi_collectable = 0;
4331 if (zcopy.collectable) {
4332 SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable, ((uint64_t)zcopy.count_all_free_pages * PAGE_SIZE));
4333 SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
4334 }
4335 }
4336
4337 return TRUE;
4338 }
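/*
 * Note for consumers (illustrative): mzi_collectable packs both the byte count
 * and the collectable flag; it is meant to be decoded with the companion
 * GET_MZI_COLLECTABLE_* macros rather than read directly, e.g.:
 *
 *	uint64_t reclaimable = GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
 */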
4339
4340 kern_return_t
4341 task_zone_info(
4342 __unused task_t task,
4343 __unused mach_zone_name_array_t *namesp,
4344 __unused mach_msg_type_number_t *namesCntp,
4345 __unused task_zone_info_array_t *infop,
4346 __unused mach_msg_type_number_t *infoCntp)
4347 {
4348 return KERN_FAILURE;
4349 }
4350
4351 kern_return_t
4352 mach_zone_info(
4353 host_priv_t host,
4354 mach_zone_name_array_t *namesp,
4355 mach_msg_type_number_t *namesCntp,
4356 mach_zone_info_array_t *infop,
4357 mach_msg_type_number_t *infoCntp)
4358 {
4359 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
4360 }
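/*
 * Illustrative user-space sketch (assumes the mach_zone_info() MIG stub from
 * <mach/mach.h>; on kernels built with CONFIG_DEBUGGER_FOR_ZONE_INFO the call
 * is restricted, so treat this as best-effort):
 *
 *	#include <mach/mach.h>
 *	#include <stdio.h>
 *
 *	static void
 *	dump_zone_sizes(void)
 *	{
 *		mach_zone_name_t *names = NULL;
 *		mach_zone_info_t *info = NULL;
 *		mach_msg_type_number_t name_cnt = 0, info_cnt = 0;
 *
 *		if (mach_zone_info(mach_host_self(), &names, &name_cnt,
 *		    &info, &info_cnt) != KERN_SUCCESS) {
 *			return;
 *		}
 *		for (mach_msg_type_number_t i = 0; i < info_cnt; i++) {
 *			printf("%s: cur_size=%llu elem_size=%llu\n",
 *			    names[i].mzn_name, info[i].mzi_cur_size,
 *			    info[i].mzi_elem_size);
 *		}
 *		// The arrays arrive as out-of-line vm_map_copy memory and are
 *		// owned by the caller once received.
 *		vm_deallocate(mach_task_self(), (vm_address_t)names,
 *		    name_cnt * sizeof(*names));
 *		vm_deallocate(mach_task_self(), (vm_address_t)info,
 *		    info_cnt * sizeof(*info));
 *	}
 */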
4361
4362
4363 kern_return_t
4364 mach_memory_info(
4365 host_priv_t host,
4366 mach_zone_name_array_t *namesp,
4367 mach_msg_type_number_t *namesCntp,
4368 mach_zone_info_array_t *infop,
4369 mach_msg_type_number_t *infoCntp,
4370 mach_memory_info_array_t *memoryInfop,
4371 mach_msg_type_number_t *memoryInfoCntp)
4372 {
4373 mach_zone_name_t *names;
4374 vm_offset_t names_addr;
4375 vm_size_t names_size;
4376
4377 mach_zone_info_t *info;
4378 vm_offset_t info_addr;
4379 vm_size_t info_size;
4380
4381 mach_memory_info_t *memory_info;
4382 vm_offset_t memory_info_addr;
4383 vm_size_t memory_info_size;
4384 vm_size_t memory_info_vmsize;
4385 unsigned int num_info;
4386
4387 unsigned int max_zones, used_zones, i;
4388 mach_zone_name_t *zn;
4389 mach_zone_info_t *zi;
4390 kern_return_t kr;
4391
4392 uint64_t zones_collectable_bytes = 0;
4393
4394 if (host == HOST_NULL) {
4395 return KERN_INVALID_HOST;
4396 }
4397 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4398 if (!PE_i_can_has_debugger(NULL)) {
4399 return KERN_INVALID_HOST;
4400 }
4401 #endif
4402
4403 /*
4404 * We assume that zones aren't freed once allocated.
4405 * We won't pick up any zones that are allocated later.
4406 */
4407
4408 simple_lock(&all_zones_lock, &zone_locks_grp);
4409 max_zones = (unsigned int)(num_zones);
4410 simple_unlock(&all_zones_lock);
4411
4412 names_size = round_page(max_zones * sizeof *names);
4413 kr = kmem_alloc_pageable(ipc_kernel_map,
4414 &names_addr, names_size, VM_KERN_MEMORY_IPC);
4415 if (kr != KERN_SUCCESS) {
4416 return kr;
4417 }
4418 names = (mach_zone_name_t *) names_addr;
4419
4420 info_size = round_page(max_zones * sizeof *info);
4421 kr = kmem_alloc_pageable(ipc_kernel_map,
4422 &info_addr, info_size, VM_KERN_MEMORY_IPC);
4423 if (kr != KERN_SUCCESS) {
4424 kmem_free(ipc_kernel_map,
4425 names_addr, names_size);
4426 return kr;
4427 }
4428 info = (mach_zone_info_t *) info_addr;
4429
4430 zn = &names[0];
4431 zi = &info[0];
4432
4433 used_zones = max_zones;
4434 for (i = 0; i < max_zones; i++) {
4435 if (!get_zone_info(&(zone_array[i]), zn, zi)) {
4436 used_zones--;
4437 continue;
4438 }
4439 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
4440 zn++;
4441 zi++;
4442 }
4443
4444 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
4445 *namesCntp = used_zones;
4446
4447 *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
4448 *infoCntp = used_zones;
4449
4450 num_info = 0;
4451 memory_info_addr = 0;
4452
4453 if (memoryInfop && memoryInfoCntp) {
4454 vm_map_copy_t copy;
4455 num_info = vm_page_diagnose_estimate();
4456 memory_info_size = num_info * sizeof(*memory_info);
4457 memory_info_vmsize = round_page(memory_info_size);
4458 kr = kmem_alloc_pageable(ipc_kernel_map,
4459 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
4460 if (kr != KERN_SUCCESS) {
4461 return kr;
4462 }
4463
4464 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
4465 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
4466 assert(kr == KERN_SUCCESS);
4467
4468 memory_info = (mach_memory_info_t *) memory_info_addr;
4469 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
4470
4471 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
4472 assert(kr == KERN_SUCCESS);
4473
4474 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
4475 (vm_map_size_t)memory_info_size, TRUE, &copy);
4476 assert(kr == KERN_SUCCESS);
4477
4478 *memoryInfop = (mach_memory_info_t *) copy;
4479 *memoryInfoCntp = num_info;
4480 }
4481
4482 return KERN_SUCCESS;
4483 }
4484
4485 kern_return_t
4486 mach_zone_info_for_zone(
4487 host_priv_t host,
4488 mach_zone_name_t name,
4489 mach_zone_info_t *infop)
4490 {
4491 unsigned int max_zones, i;
4492 zone_t zone_ptr;
4493
4494 if (host == HOST_NULL) {
4495 return KERN_INVALID_HOST;
4496 }
4497 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4498 if (!PE_i_can_has_debugger(NULL)) {
4499 return KERN_INVALID_HOST;
4500 }
4501 #endif
4502
4503 if (infop == NULL) {
4504 return KERN_INVALID_ARGUMENT;
4505 }
4506
4507 simple_lock(&all_zones_lock, &zone_locks_grp);
4508 max_zones = (unsigned int)(num_zones);
4509 simple_unlock(&all_zones_lock);
4510
4511 zone_ptr = ZONE_NULL;
4512 for (i = 0; i < max_zones; i++) {
4513 zone_t z = &(zone_array[i]);
4514 assert(z != ZONE_NULL);
4515
4516 /* Find the requested zone by name */
4517 if (track_this_zone(z->zone_name, name.mzn_name)) {
4518 zone_ptr = z;
4519 break;
4520 }
4521 }
4522
4523 /* No zones found with the requested zone name */
4524 if (zone_ptr == ZONE_NULL) {
4525 return KERN_INVALID_ARGUMENT;
4526 }
4527
4528 if (get_zone_info(zone_ptr, NULL, infop)) {
4529 return KERN_SUCCESS;
4530 }
4531 return KERN_FAILURE;
4532 }
4533
4534 kern_return_t
4535 mach_zone_info_for_largest_zone(
4536 host_priv_t host,
4537 mach_zone_name_t *namep,
4538 mach_zone_info_t *infop)
4539 {
4540 if (host == HOST_NULL) {
4541 return KERN_INVALID_HOST;
4542 }
4543 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4544 if (!PE_i_can_has_debugger(NULL)) {
4545 return KERN_INVALID_HOST;
4546 }
4547 #endif
4548
4549 if (namep == NULL || infop == NULL) {
4550 return KERN_INVALID_ARGUMENT;
4551 }
4552
4553 if (get_zone_info(zone_find_largest(), namep, infop)) {
4554 return KERN_SUCCESS;
4555 }
4556 return KERN_FAILURE;
4557 }
4558
4559 uint64_t
4560 get_zones_collectable_bytes(void)
4561 {
4562 unsigned int i, max_zones;
4563 uint64_t zones_collectable_bytes = 0;
4564 mach_zone_info_t zi;
4565
4566 simple_lock(&all_zones_lock, &zone_locks_grp);
4567 max_zones = (unsigned int)(num_zones);
4568 simple_unlock(&all_zones_lock);
4569
4570 for (i = 0; i < max_zones; i++) {
4571 if (get_zone_info(&(zone_array[i]), NULL, &zi)) {
4572 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
4573 }
4574 }
4575
4576 return zones_collectable_bytes;
4577 }
4578
4579 kern_return_t
4580 mach_zone_get_zlog_zones(
4581 host_priv_t host,
4582 mach_zone_name_array_t *namesp,
4583 mach_msg_type_number_t *namesCntp)
4584 {
4585 #if DEBUG || DEVELOPMENT
4586 unsigned int max_zones, logged_zones, i;
4587 kern_return_t kr;
4588 zone_t zone_ptr;
4589 mach_zone_name_t *names;
4590 vm_offset_t names_addr;
4591 vm_size_t names_size;
4592
4593 if (host == HOST_NULL) {
4594 return KERN_INVALID_HOST;
4595 }
4596
4597 if (namesp == NULL || namesCntp == NULL) {
4598 return KERN_INVALID_ARGUMENT;
4599 }
4600
4601 simple_lock(&all_zones_lock, &zone_locks_grp);
4602 max_zones = (unsigned int)(num_zones);
4603 simple_unlock(&all_zones_lock);
4604
4605 names_size = round_page(max_zones * sizeof *names);
4606 kr = kmem_alloc_pageable(ipc_kernel_map,
4607 &names_addr, names_size, VM_KERN_MEMORY_IPC);
4608 if (kr != KERN_SUCCESS) {
4609 return kr;
4610 }
4611 names = (mach_zone_name_t *) names_addr;
4612
4613 zone_ptr = ZONE_NULL;
4614 logged_zones = 0;
4615 for (i = 0; i < max_zones; i++) {
4616 zone_t z = &(zone_array[i]);
4617 assert(z != ZONE_NULL);
4618
4619 /* Copy out the zone name if zone logging is enabled */
4620 if (z->zlog_btlog) {
4621 get_zone_info(z, &names[logged_zones], NULL);
4622 logged_zones++;
4623 }
4624 }
4625
4626 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
4627 *namesCntp = logged_zones;
4628
4629 return KERN_SUCCESS;
4630
4631 #else /* DEBUG || DEVELOPMENT */
4632 #pragma unused(host, namesp, namesCntp)
4633 return KERN_FAILURE;
4634 #endif /* DEBUG || DEVELOPMENT */
4635 }
4636
4637 kern_return_t
4638 mach_zone_get_btlog_records(
4639 host_priv_t host,
4640 mach_zone_name_t name,
4641 zone_btrecord_array_t *recsp,
4642 mach_msg_type_number_t *recsCntp)
4643 {
4644 #if DEBUG || DEVELOPMENT
4645 unsigned int max_zones, i, numrecs = 0;
4646 zone_btrecord_t *recs;
4647 kern_return_t kr;
4648 zone_t zone_ptr;
4649 vm_offset_t recs_addr;
4650 vm_size_t recs_size;
4651
4652 if (host == HOST_NULL) {
4653 return KERN_INVALID_HOST;
4654 }
4655
4656 if (recsp == NULL || recsCntp == NULL) {
4657 return KERN_INVALID_ARGUMENT;
4658 }
4659
4660 simple_lock(&all_zones_lock, &zone_locks_grp);
4661 max_zones = (unsigned int)(num_zones);
4662 simple_unlock(&all_zones_lock);
4663
4664 zone_ptr = ZONE_NULL;
4665 for (i = 0; i < max_zones; i++) {
4666 zone_t z = &(zone_array[i]);
4667 assert(z != ZONE_NULL);
4668
4669 /* Find the requested zone by name */
4670 if (track_this_zone(z->zone_name, name.mzn_name)) {
4671 zone_ptr = z;
4672 break;
4673 }
4674 }
4675
4676 /* No zones found with the requested zone name */
4677 if (zone_ptr == ZONE_NULL) {
4678 return KERN_INVALID_ARGUMENT;
4679 }
4680
4681 /* Logging not turned on for the requested zone */
4682 if (!DO_LOGGING(zone_ptr)) {
4683 return KERN_FAILURE;
4684 }
4685
4686 /* Allocate memory for btlog records */
4687 numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
4688 recs_size = round_page(numrecs * sizeof *recs);
4689
4690 kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
4691 if (kr != KERN_SUCCESS) {
4692 return kr;
4693 }
4694
4695 /*
4696 * We will call get_btlog_records() below which populates this region while holding a spinlock
4697 * (the btlog lock). So these pages need to be wired.
4698 */
4699 kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
4700 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
4701 assert(kr == KERN_SUCCESS);
4702
4703 recs = (zone_btrecord_t *)recs_addr;
4704 get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
4705
4706 kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
4707 assert(kr == KERN_SUCCESS);
4708
4709 *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
4710 *recsCntp = numrecs;
4711
4712 return KERN_SUCCESS;
4713
4714 #else /* DEBUG || DEVELOPMENT */
4715 #pragma unused(host, name, recsp, recsCntp)
4716 return KERN_FAILURE;
4717 #endif /* DEBUG || DEVELOPMENT */
4718 }
4719
4720
4721 #if DEBUG || DEVELOPMENT
4722
4723 kern_return_t
4724 mach_memory_info_check(void)
4725 {
4726 mach_memory_info_t * memory_info;
4727 mach_memory_info_t * info;
4728 zone_t zone;
4729 unsigned int idx, num_info, max_zones;
4730 vm_offset_t memory_info_addr;
4731 kern_return_t kr;
4732 size_t memory_info_size, memory_info_vmsize;
4733 uint64_t top_wired, zonestotal, total;
4734
4735 num_info = vm_page_diagnose_estimate();
4736 memory_info_size = num_info * sizeof(*memory_info);
4737 memory_info_vmsize = round_page(memory_info_size);
4738 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
4739 assert(kr == KERN_SUCCESS);
4740
4741 memory_info = (mach_memory_info_t *) memory_info_addr;
4742 vm_page_diagnose(memory_info, num_info, 0);
4743
4744 simple_lock(&all_zones_lock, &zone_locks_grp);
4745 max_zones = num_zones;
4746 simple_unlock(&all_zones_lock);
4747
4748 top_wired = total = zonestotal = 0;
4749 for (idx = 0; idx < max_zones; idx++) {
4750 zone = &(zone_array[idx]);
4751 assert(zone != ZONE_NULL);
4752 lock_zone(zone);
4753 zonestotal += ptoa_64(zone->page_count);
4754 unlock_zone(zone);
4755 }
4756 for (idx = 0; idx < num_info; idx++) {
4757 info = &memory_info[idx];
4758 if (!info->size) {
4759 continue;
4760 }
4761 if (VM_KERN_COUNT_WIRED == info->site) {
4762 top_wired = info->size;
4763 }
4764 if (VM_KERN_SITE_HIDE & info->flags) {
4765 continue;
4766 }
4767 if (!(VM_KERN_SITE_WIRED & info->flags)) {
4768 continue;
4769 }
4770 total += info->size;
4771 }
4772 total += zonestotal;
4773
4774 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n", total, top_wired, zonestotal, top_wired - total);
4775
4776 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
4777
4778 return kr;
4779 }
4780
4781 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
4782
4783 #endif /* DEBUG || DEVELOPMENT */
4784
4785 kern_return_t
4786 mach_zone_force_gc(
4787 host_t host)
4788 {
4789 if (host == HOST_NULL) {
4790 return KERN_INVALID_HOST;
4791 }
4792
4793 #if DEBUG || DEVELOPMENT
4794 /* Callout to buffer cache GC to drop elements in the apfs zones */
4795 if (consider_buffer_cache_collect != NULL) {
4796 (void)(*consider_buffer_cache_collect)(0);
4797 }
4798 consider_zone_gc(FALSE);
4799 #endif /* DEBUG || DEVELOPMENT */
4800 return KERN_SUCCESS;
4801 }
4802
4803 extern unsigned int stack_total;
4804 extern unsigned long long stack_allocs;
4805
4806 zone_t
4807 zone_find_largest(void)
4808 {
4809 unsigned int i;
4810 unsigned int max_zones;
4811 zone_t the_zone;
4812 zone_t zone_largest;
4813
4814 simple_lock(&all_zones_lock, &zone_locks_grp);
4815 max_zones = num_zones;
4816 simple_unlock(&all_zones_lock);
4817
4818 zone_largest = &(zone_array[0]);
4819 for (i = 0; i < max_zones; i++) {
4820 the_zone = &(zone_array[i]);
4821 if (the_zone->cur_size > zone_largest->cur_size) {
4822 zone_largest = the_zone;
4823 }
4824 }
4825 return zone_largest;
4826 }
4827
4828 #if ZONE_DEBUG
4829
4830 /* should we care about locks here ? */
4831
4832 #define zone_in_use(z) ( z->count || z->free_elements \
4833 || !queue_empty(&z->pages.all_free) \
4834 || !queue_empty(&z->pages.intermediate) \
4835 || (z->allows_foreign && !queue_empty(&z->pages.any_free_foreign)))
4836
4837
4838 #endif /* ZONE_DEBUG */
4839
4840
4841 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
4842
4843 #if DEBUG || DEVELOPMENT
4844
4845 static uintptr_t *
4846 zone_copy_all_allocations_inqueue(zone_t z, queue_head_t * queue, uintptr_t * elems)
4847 {
4848 struct zone_page_metadata *page_meta;
4849 vm_offset_t free, elements;
4850 vm_offset_t idx, numElements, freeCount, bytesAvail, metaSize;
4851
4852 queue_iterate(queue, page_meta, struct zone_page_metadata *, pages)
4853 {
4854 elements = get_zone_page(page_meta);
4855 bytesAvail = ptoa(page_meta->page_count);
4856 freeCount = 0;
4857 if (z->allows_foreign && !from_zone_map(elements, z->elem_size)) {
4858 metaSize = (sizeof(struct zone_page_metadata) + ZONE_ELEMENT_ALIGNMENT - 1) & ~(ZONE_ELEMENT_ALIGNMENT - 1);
4859 bytesAvail -= metaSize;
4860 elements += metaSize;
4861 }
4862 numElements = bytesAvail / z->elem_size;
4863 // construct array of all possible elements
4864 for (idx = 0; idx < numElements; idx++) {
4865 elems[idx] = INSTANCE_PUT(elements + idx * z->elem_size);
4866 }
4867 // remove from the array all free elements
4868 free = (vm_offset_t)page_metadata_get_freelist(page_meta);
4869 while (free) {
4870 // find idx of free element
4871 for (idx = 0; (idx < numElements) && (elems[idx] != INSTANCE_PUT(free)); idx++) {
4872 }
4873 assert(idx < numElements);
4874 // remove it
4875 bcopy(&elems[idx + 1], &elems[idx], (numElements - (idx + 1)) * sizeof(elems[0]));
4876 numElements--;
4877 freeCount++;
4878 // next free element
4879 vm_offset_t *primary = (vm_offset_t *) free;
4880 free = *primary ^ zp_nopoison_cookie;
4881 }
4882 elems += numElements;
4883 }
4884
4885 return elems;
4886 }
4887
4888 kern_return_t
4889 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
4890 {
4891 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4892 zone_t zone;
4893 uintptr_t * array;
4894 uintptr_t * next;
4895 uintptr_t element, bt;
4896 uint32_t idx, count, found;
4897 uint32_t btidx, btcount, nobtcount, btfound;
4898 uint32_t elemSize;
4899 uint64_t maxElems;
4900 unsigned int max_zones;
4901 kern_return_t kr;
4902
4903 simple_lock(&all_zones_lock, &zone_locks_grp);
4904 max_zones = num_zones;
4905 simple_unlock(&all_zones_lock);
4906
4907 for (idx = 0; idx < max_zones; idx++) {
4908 if (!strncmp(zoneName, zone_array[idx].zone_name, nameLen)) {
4909 break;
4910 }
4911 }
4912 if (idx >= max_zones) {
4913 return KERN_INVALID_NAME;
4914 }
4915 zone = &zone_array[idx];
4916
4917 elemSize = (uint32_t) zone->elem_size;
4918 maxElems = ptoa(zone->page_count) / elemSize;
4919
4920 if ((zone->alloc_size % elemSize)
4921 && !leak_scan_debug_flag) {
4922 return KERN_INVALID_CAPABILITY;
4923 }
4924
4925 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
4926 maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG);
4927 if (KERN_SUCCESS != kr) {
4928 return kr;
4929 }
4930
4931 lock_zone(zone);
4932
4933 next = array;
4934 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.any_free_foreign, next);
4935 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.intermediate, next);
4936 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.all_used, next);
4937 count = (uint32_t)(next - array);
4938
4939 unlock_zone(zone);
4940
4941 zone_leaks_scan(array, count, (uint32_t)zone->elem_size, &found);
4942 assert(found <= count);
4943
4944 for (idx = 0; idx < count; idx++) {
4945 element = array[idx];
4946 if (kInstanceFlagReferenced & element) {
4947 continue;
4948 }
4949 element = INSTANCE_PUT(element) & ~kInstanceFlags;
4950 }
4951
4952 if (zone->zlog_btlog && !corruption_debug_flag) {
4953 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
4954 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
4955 }
4956
4957 for (nobtcount = idx = 0; idx < count; idx++) {
4958 element = array[idx];
4959 if (!element) {
4960 continue;
4961 }
4962 if (kInstanceFlagReferenced & element) {
4963 continue;
4964 }
4965 element = INSTANCE_PUT(element) & ~kInstanceFlags;
4966
4967 // see if we can find any backtrace left in the element
4968 btcount = (typeof(btcount))(zone->elem_size / sizeof(uintptr_t));
4969 if (btcount >= MAX_ZTRACE_DEPTH) {
4970 btcount = MAX_ZTRACE_DEPTH - 1;
4971 }
4972 for (btfound = btidx = 0; btidx < btcount; btidx++) {
4973 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
4974 if (!VM_KERNEL_IS_SLID(bt)) {
4975 break;
4976 }
4977 zbt[btfound++] = bt;
4978 }
4979 if (btfound) {
4980 (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
4981 } else {
4982 nobtcount++;
4983 }
4984 }
4985 if (nobtcount) {
4986 // fake backtrace when we found nothing
4987 zbt[0] = (uintptr_t) &zalloc;
4988 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
4989 }
4990
4991 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
4992
4993 return KERN_SUCCESS;
4994 }
4995
4996 boolean_t
4997 kdp_is_in_zone(void *addr, const char *zone_name)
4998 {
4999 zone_t z;
5000 return zone_element_size(addr, &z) && !strcmp(z->zone_name, zone_name);
5001 }
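/*
 * Illustrative use from debugger/KDP context (the zone name below is just a
 * placeholder; pass whatever zone the pointer is expected to belong to):
 *
 *	if (kdp_is_in_zone(obj, "example.zone")) {
 *		// obj is an element of the "example.zone" zone
 *	}
 */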
5002
5003 boolean_t
5004 run_zone_test(void)
5005 {
5006 unsigned int i = 0, max_iter = 5;
5007 void * test_ptr;
5008 zone_t test_zone;
5009
5010 simple_lock(&zone_test_lock, &zone_locks_grp);
5011 if (!zone_test_running) {
5012 zone_test_running = TRUE;
5013 } else {
5014 simple_unlock(&zone_test_lock);
5015 printf("run_zone_test: Test already running.\n");
5016 return FALSE;
5017 }
5018 simple_unlock(&zone_test_lock);
5019
5020 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
5021
5022 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
5023 do {
5024 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
5025 if (test_zone == NULL) {
5026 printf("run_zone_test: zinit() failed\n");
5027 return FALSE;
5028 }
5029
5030 #if KASAN_ZALLOC
5031 if (test_zone_ptr == NULL && zone_free_count(test_zone) != 0) {
5032 #else
5033 if (zone_free_count(test_zone) != 0) {
5034 #endif
5035 printf("run_zone_test: free count is not zero\n");
5036 return FALSE;
5037 }
5038
5039 if (test_zone_ptr == NULL) {
5040 /* Stash the zone pointer returned on the first zinit */
5041 printf("run_zone_test: zone created for the first time\n");
5042 test_zone_ptr = test_zone;
5043 } else if (test_zone != test_zone_ptr) {
5044 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
5045 return FALSE;
5046 }
5047
5048 test_ptr = zalloc(test_zone);
5049 if (test_ptr == NULL) {
5050 printf("run_zone_test: zalloc() failed\n");
5051 return FALSE;
5052 }
5053 zfree(test_zone, test_ptr);
5054
5055 zdestroy(test_zone);
5056 i++;
5057
5058 printf("run_zone_test: Iteration %d successful\n", i);
5059 } while (i < max_iter);
5060
5061 printf("run_zone_test: Test passed\n");
5062
5063 simple_lock(&zone_test_lock, &zone_locks_grp);
5064 zone_test_running = FALSE;
5065 simple_unlock(&zone_test_lock);
5066
5067 return TRUE;
5068 }
5069
5070 #endif /* DEBUG || DEVELOPMENT */