1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65 #include <zone_debug.h>
66
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach_debug/zone_info.h>
74 #include <mach/vm_map.h>
75
76 #include <kern/bits.h>
77 #include <kern/kern_types.h>
78 #include <kern/assert.h>
79 #include <kern/backtrace.h>
80 #include <kern/host.h>
81 #include <kern/macro_help.h>
82 #include <kern/sched.h>
83 #include <kern/locks.h>
84 #include <kern/sched_prim.h>
85 #include <kern/misc_protos.h>
86 #include <kern/thread_call.h>
87 #include <kern/zalloc.h>
88 #include <kern/kalloc.h>
89
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_kern.h>
93 #include <vm/vm_page.h>
94
95 #include <pexpert/pexpert.h>
96
97 #include <machine/machparam.h>
98 #include <machine/machine_routines.h> /* ml_cpu_get_info */
99
100 #include <libkern/OSDebug.h>
101 #include <libkern/OSAtomic.h>
102 #include <sys/kdebug.h>
103
104 #include <san/kasan.h>
105
106 /*
107 * ZONE_ALIAS_ADDR (deprecated)
108 */
109
110 #define from_zone_map(addr, size) \
111 ((vm_offset_t)(addr) >= zone_map_min_address && \
112 ((vm_offset_t)(addr) + size - 1) < zone_map_max_address )
113
114 /*
115 * Zone Corruption Debugging
116 *
117 * We use three techniques to detect modification of a zone element
118 * after it's been freed.
119 *
120 * (1) Check the freelist next pointer for sanity.
121 * (2) Store a backup of the next pointer at the end of the element,
122 * and compare it to the primary next pointer when the element is allocated
123 * to detect corruption of the freelist due to use-after-free bugs.
124 * The backup pointer is also XORed with a per-boot random cookie.
125 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
126 * and check for that value when the element is being reused to make sure
127 * no part of the element has been modified while it was on the freelist.
128 * This will also help catch read-after-frees, as code will now dereference
129 * 0xdeadbeef instead of a valid but freed pointer.
130 *
131 * (1) and (2) occur for every allocation and free to a zone.
132 * This is done to make it slightly more difficult for an attacker to
133 * manipulate the freelist to behave in a specific way.
134 *
135 * Poisoning (3) occurs periodically for every N frees (counted per-zone)
136 * and on every free for zones smaller than a cacheline. If -zp
137 * is passed as a boot arg, poisoning occurs for every free.
138 *
139 * Performance slowdown is inversely proportional to the frequency of poisoning,
140 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
141 * and higher. You can expect to find a 100% reproducible bug in an average of
142 * N tries, with a standard deviation of about N, but you will want to set
143 * "-zp" to always poison every free if you are attempting to reproduce
144 * a known bug.
145 *
146 * For a more heavyweight, but finer-grained method of detecting misuse
147 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
148 *
149 * Zone Corruption Logging
150 *
151 * You can also track where corruptions come from by using the boot-arguments
152 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
153 * in this document for more implementation and usage information.
154 *
155 * Zone Leak Detection
156 *
157 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
158 * found later in this file via the showtopztrace and showz* macros in kgmacros,
159 * or use zlog without the -zc argument.
160 *
161 */
162
163 /* Returns TRUE if we rolled over the counter at factor */
164 static inline boolean_t
165 sample_counter(volatile uint32_t * count_p, uint32_t factor)
166 {
167 uint32_t old_count, new_count;
168 boolean_t rolled_over;
169
170 do {
171 new_count = old_count = *count_p;
172
173 if (++new_count >= factor) {
174 rolled_over = TRUE;
175 new_count = 0;
176 } else {
177 rolled_over = FALSE;
178 }
179
180 } while (!OSCompareAndSwap(old_count, new_count, count_p));
181
182 return rolled_over;
183 }
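/*
 * Usage sketch (illustrative only, not part of the allocator): sample_counter()
 * is a lock-free "act once every N calls" primitive. A caller keeps a counter
 * and acts only when it rolls over. The counter and factor below are
 * hypothetical stand-ins, not kernel globals.
 */
static inline boolean_t __unused
example_should_sample(void)
{
	static volatile uint32_t example_count;		/* hypothetical per-site counter */
	const uint32_t example_factor = 16;		/* act on every 16th call */

	return sample_counter(&example_count, example_factor);
}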
184
185 #if defined(__LP64__)
186 #define ZP_POISON 0xdeadbeefdeadbeef
187 #else
188 #define ZP_POISON 0xdeadbeef
189 #endif
190
191 #define ZP_DEFAULT_SAMPLING_FACTOR 16
192 #define ZP_DEFAULT_SCALE_FACTOR 4
193
194 /*
195 * A zp_factor of 0 indicates zone poisoning is disabled,
196 * however, we still poison zones smaller than zp_tiny_zone_limit (a cacheline).
197 * Passing the -no-zp boot-arg disables even this behavior.
198 * In all cases, we record and check the integrity of a backup pointer.
199 */
200
201 /* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */
202 uint32_t zp_factor = 0;
203
204 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
205 uint32_t zp_scale = 0;
206
207 /* set in zp_init, zero indicates -no-zp boot-arg */
208 vm_size_t zp_tiny_zone_limit = 0;
209
210 /* initialized to a per-boot random value in zp_init */
211 uintptr_t zp_poisoned_cookie = 0;
212 uintptr_t zp_nopoison_cookie = 0;
213
214 #if VM_MAX_TAG_ZONES
215 boolean_t zone_tagging_on;
216 #endif /* VM_MAX_TAG_ZONES */
217
218 /*
219 * initialize zone poisoning
220 * called from zone_bootstrap before any allocations are made from zalloc
221 */
222 static inline void
223 zp_init(void)
224 {
225 char temp_buf[16];
226
227 /*
228 * Initialize backup pointer random cookie for poisoned elements
229 * Try not to call early_random() back to back; it may return
230 * the same value if mach_absolute_time doesn't have sufficient time
231 * to tick over between calls. <rdar://problem/11597395>
232 * (This is only a problem on embedded devices)
233 */
234 zp_poisoned_cookie = (uintptr_t) early_random();
235
236 /*
237 * Always poison zones smaller than a cacheline,
238 * because it's pretty close to free
239 */
240 ml_cpu_info_t cpu_info;
241 ml_cpu_get_info(&cpu_info);
242 zp_tiny_zone_limit = (vm_size_t) cpu_info.cache_line_size;
243
244 zp_factor = ZP_DEFAULT_SAMPLING_FACTOR;
245 zp_scale = ZP_DEFAULT_SCALE_FACTOR;
246
247 //TODO: Bigger permutation?
248 /*
249 * Permute the default factor +/- 1 to make it less predictable
250 * This adds or subtracts ~4 poisoned objects per 1000 frees.
251 */
252 if (zp_factor != 0) {
253 uint32_t rand_bits = early_random() & 0x3;
254
255 if (rand_bits == 0x1)
256 zp_factor += 1;
257 else if (rand_bits == 0x2)
258 zp_factor -= 1;
259 /* if 0x0 or 0x3, leave it alone */
260 }
261
262 /* -zp: enable poisoning for every alloc and free */
263 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
264 zp_factor = 1;
265 }
266
267 /* -no-zp: disable poisoning completely even for tiny zones */
268 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
269 zp_factor = 0;
270 zp_tiny_zone_limit = 0;
271 printf("Zone poisoning disabled\n");
272 }
273
274 /* zp-factor=XXXX: override how often to poison freed zone elements */
275 if (PE_parse_boot_argn("zp-factor", &zp_factor, sizeof(zp_factor))) {
276 printf("Zone poisoning factor override: %u\n", zp_factor);
277 }
278
279 /* zp-scale=XXXX: override how much zone size scales zp-factor by */
280 if (PE_parse_boot_argn("zp-scale", &zp_scale, sizeof(zp_scale))) {
281 printf("Zone poisoning scale factor override: %u\n", zp_scale);
282 }
283
284 /* Initialize backup pointer random cookie for unpoisoned elements */
285 zp_nopoison_cookie = (uintptr_t) early_random();
286
287 #if MACH_ASSERT
288 if (zp_poisoned_cookie == zp_nopoison_cookie)
289 panic("early_random() is broken: %p and %p are not random\n",
290 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
291 #endif
292
293 /*
294 * Use the last bit in the backup pointer to hint poisoning state
295 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
296 * the low bits are zero.
297 */
298 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
299 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
300
301 #if defined(__LP64__)
302 /*
303 * Make backup pointers more obvious in GDB for 64 bit
304 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
305 * (0xFACADE = 0xFFFFFF ^ 0x053521)
306 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
307 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
308 * by the sanity check, so it's OK for that part of the cookie to be predictable.
309 *
310 * TODO: Use #defines, xors, and shifts
311 */
312
313 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
314 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
315
316 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
317 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
318 #endif
319 }
320
321 /*
322 * These macros are used to keep track of the number
323 * of pages being used by the zone currently. The
324 * z->page_count is not protected by the zone lock.
325 */
326 #define ZONE_PAGE_COUNT_INCR(z, count) \
327 { \
328 OSAddAtomic64(count, &(z->page_count)); \
329 }
330
331 #define ZONE_PAGE_COUNT_DECR(z, count) \
332 { \
333 OSAddAtomic64(-count, &(z->page_count)); \
334 }
335
336 vm_map_t zone_map = VM_MAP_NULL;
337
338 /* for is_sane_zone_element and garbage collection */
339
340 vm_offset_t zone_map_min_address = 0; /* initialized in zone_init */
341 vm_offset_t zone_map_max_address = 0;
342
343 /* Globals for random boolean generator for elements in free list */
344 #define MAX_ENTROPY_PER_ZCRAM 4
345 #define RANDOM_BOOL_GEN_SEED_COUNT 4
346 static unsigned int bool_gen_seed[RANDOM_BOOL_GEN_SEED_COUNT];
347 static unsigned int bool_gen_global = 0;
348 decl_simple_lock_data(, bool_gen_lock)
349
350 /* VM region for all metadata structures */
351 vm_offset_t zone_metadata_region_min = 0;
352 vm_offset_t zone_metadata_region_max = 0;
353 decl_lck_mtx_data(static ,zone_metadata_region_lck)
354 lck_attr_t zone_metadata_lock_attr;
355 lck_mtx_ext_t zone_metadata_region_lck_ext;
356
357 /* Helpful for walking through a zone's free element list. */
358 struct zone_free_element {
359 struct zone_free_element *next;
360 /* ... */
361 /* void *backup_ptr; */
362 };
363
364 /*
365 * Protects zone_array, num_zones, num_zones_in_use, and zone_empty_bitmap
366 */
367 decl_simple_lock_data(, all_zones_lock)
368 unsigned int num_zones_in_use;
369 unsigned int num_zones;
370
371 #define MAX_ZONES 288
372 struct zone zone_array[MAX_ZONES];
373
374 /* Used to keep track of empty slots in the zone_array */
375 bitmap_t zone_empty_bitmap[BITMAP_LEN(MAX_ZONES)];
376
377 #if DEBUG || DEVELOPMENT
378 /*
379 * Used for sysctl kern.run_zone_test which is not thread-safe. Ensure only one thread goes through at a time.
380 * Otherwise we can end up with multiple test zones (if a second zinit() comes through before zdestroy()), which could lead us to
381 * run out of zones.
382 */
383 decl_simple_lock_data(, zone_test_lock)
384 static boolean_t zone_test_running = FALSE;
385 static zone_t test_zone_ptr = NULL;
386 #endif /* DEBUG || DEVELOPMENT */
387
388 #define PAGE_METADATA_GET_ZINDEX(page_meta) \
389 (page_meta->zindex)
390
391 #define PAGE_METADATA_GET_ZONE(page_meta) \
392 (&(zone_array[page_meta->zindex]))
393
394 #define PAGE_METADATA_SET_ZINDEX(page_meta, index) \
395 page_meta->zindex = (index);
396
397 struct zone_page_metadata {
398 queue_chain_t pages; /* linkage pointer for metadata lists */
399
400 /* Union for maintaining start of element free list and real metadata (for multipage allocations) */
401 union {
402 /*
403 * The start of the freelist can be maintained as a 32-bit offset instead of a pointer because
404 * the free elements would be at most ZONE_MAX_ALLOC_SIZE bytes away from the metadata. Offset
405 * from start of the allocation chunk to free element list head.
406 */
407 uint32_t freelist_offset;
408 /*
409 * This field is used to look up the real metadata for multipage allocations, where we mark the
410 * metadata for all pages except the first as "fake" metadata using MULTIPAGE_METADATA_MAGIC.
411 * Offset from this fake metadata back to the real metadata of the allocation chunk (stored as a positive distance and subtracted).
412 */
413 uint32_t real_metadata_offset;
414 };
415
416 /*
417 * For the first page in the allocation chunk, this represents the total number of free elements in
418 * the chunk.
419 */
420 uint16_t free_count;
421 unsigned zindex : ZINDEX_BITS; /* Zone index within the zone_array */
422 unsigned page_count : PAGECOUNT_BITS; /* Count of pages within the allocation chunk */
423 };
424
425 /* Macro to get page index (within zone_map) of page containing element */
426 #define PAGE_INDEX_FOR_ELEMENT(element) \
427 (((vm_offset_t)trunc_page(element) - zone_map_min_address) / PAGE_SIZE)
428
429 /* Macro to get metadata structure given a page index in zone_map */
430 #define PAGE_METADATA_FOR_PAGE_INDEX(index) \
431 (zone_metadata_region_min + ((index) * sizeof(struct zone_page_metadata)))
432
433 /* Macro to get index (within zone_map) for given metadata */
434 #define PAGE_INDEX_FOR_METADATA(page_meta) \
435 (((vm_offset_t)page_meta - zone_metadata_region_min) / sizeof(struct zone_page_metadata))
436
437 /* Macro to get page for given page index in zone_map */
438 #define PAGE_FOR_PAGE_INDEX(index) \
439 (zone_map_min_address + (PAGE_SIZE * (index)))
440
441 /* Macro to get the actual metadata for a given address */
442 #define PAGE_METADATA_FOR_ELEMENT(element) \
443 (struct zone_page_metadata *)(PAGE_METADATA_FOR_PAGE_INDEX(PAGE_INDEX_FOR_ELEMENT(element)))
444
445 /* Magic value to indicate empty element free list */
446 #define PAGE_METADATA_EMPTY_FREELIST ((uint32_t)(~0))
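/*
 * Worked example (illustrative, not used by the allocator): the macros above
 * invert one another. For an element inside the zone_map, walking
 * element -> page index -> metadata -> page index -> page lands back on the
 * page the element lives on. 'element' is a hypothetical zone_map address.
 */
static inline boolean_t __unused
example_page_index_roundtrip(vm_offset_t element)
{
	struct zone_page_metadata *meta =
		PAGE_METADATA_FOR_ELEMENT(element);			/* element -> metadata */
	vm_offset_t page =
		PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(meta));	/* metadata -> page */

	return (page == (vm_offset_t)trunc_page(element));		/* same page we started from */
}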
447
448 boolean_t is_zone_map_nearing_exhaustion(void);
449 extern void vm_pageout_garbage_collect(int collect);
450
451 static inline void *
452 page_metadata_get_freelist(struct zone_page_metadata *page_meta)
453 {
454 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
455 if (page_meta->freelist_offset == PAGE_METADATA_EMPTY_FREELIST)
456 return NULL;
457 else {
458 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata)))
459 return (void *)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)) + page_meta->freelist_offset);
460 else
461 return (void *)((vm_offset_t)page_meta + page_meta->freelist_offset);
462 }
463 }
464
465 static inline void
466 page_metadata_set_freelist(struct zone_page_metadata *page_meta, void *addr)
467 {
468 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
469 if (addr == NULL)
470 page_meta->freelist_offset = PAGE_METADATA_EMPTY_FREELIST;
471 else {
472 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata)))
473 page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
474 else
475 page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - (vm_offset_t)page_meta);
476 }
477 }
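/*
 * Illustrative check (not used by the allocator): the accessors above encode
 * an element address as a 32-bit offset and decode it back unchanged.
 * 'page_meta' and 'addr' are hypothetical; 'addr' must lie within the
 * allocation chunk described by 'page_meta' (or be NULL).
 */
static inline boolean_t __unused
example_freelist_offset_roundtrip(struct zone_page_metadata *page_meta, void *addr)
{
	page_metadata_set_freelist(page_meta, addr);
	return (page_metadata_get_freelist(page_meta) == addr);
}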
478
479 static inline struct zone_page_metadata *
480 page_metadata_get_realmeta(struct zone_page_metadata *page_meta)
481 {
482 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
483 return (struct zone_page_metadata *)((vm_offset_t)page_meta - page_meta->real_metadata_offset);
484 }
485
486 static inline void
487 page_metadata_set_realmeta(struct zone_page_metadata *page_meta, struct zone_page_metadata *real_meta)
488 {
489 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
490 assert(PAGE_METADATA_GET_ZINDEX(real_meta) != MULTIPAGE_METADATA_MAGIC);
491 assert((vm_offset_t)page_meta > (vm_offset_t)real_meta);
492 vm_offset_t offset = (vm_offset_t)page_meta - (vm_offset_t)real_meta;
493 assert(offset <= UINT32_MAX);
494 page_meta->real_metadata_offset = (uint32_t)offset;
495 }
496
497 /* The backup pointer is stored in the last pointer-sized location in an element. */
498 static inline vm_offset_t *
499 get_backup_ptr(vm_size_t elem_size,
500 vm_offset_t *element)
501 {
502 return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
503 }
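/*
 * Illustrative sketch (not called by the allocator): how the primary and
 * backup next-pointers described under "Zone Corruption Debugging" relate.
 * Both words hold the old freelist head XORed with a per-boot cookie; on
 * allocation they must decode to the same value, which is what
 * try_alloc_from_zone() verifies below. 'elem_size', 'element' and
 * 'poisoned' are hypothetical inputs.
 */
static inline boolean_t __unused
example_backup_ptr_matches(vm_size_t elem_size, vm_offset_t *element, boolean_t poisoned)
{
	vm_offset_t next    = *element ^ zp_nopoison_cookie;	/* decode the primary pointer */
	vm_offset_t *backup = get_backup_ptr(elem_size, element);
	uintptr_t   cookie  = poisoned ? zp_poisoned_cookie : zp_nopoison_cookie;

	return (next == (*backup ^ cookie));			/* both must agree */
}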
504
505 /*
506 * Routine to populate a page backing metadata in the zone_metadata_region.
507 * Must be called without the zone lock held as it might potentially block.
508 */
509 static inline void
510 zone_populate_metadata_page(struct zone_page_metadata *page_meta)
511 {
512 vm_offset_t page_metadata_begin = trunc_page(page_meta);
513 vm_offset_t page_metadata_end = trunc_page((vm_offset_t)page_meta + sizeof(struct zone_page_metadata));
514
515 for(;page_metadata_begin <= page_metadata_end; page_metadata_begin += PAGE_SIZE) {
516 if (pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin))
517 continue;
518 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
519 lck_mtx_lock(&zone_metadata_region_lck);
520 if (0 == pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
521 kern_return_t __unused ret = kernel_memory_populate(zone_map,
522 page_metadata_begin,
523 PAGE_SIZE,
524 KMA_KOBJECT,
525 VM_KERN_MEMORY_OSFMK);
526
527 /* should not fail with the given arguments */
528 assert(ret == KERN_SUCCESS);
529 }
530 lck_mtx_unlock(&zone_metadata_region_lck);
531 }
532 return;
533 }
534
535 static inline uint16_t
536 get_metadata_alloc_count(struct zone_page_metadata *page_meta)
537 {
538 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
539 struct zone *z = PAGE_METADATA_GET_ZONE(page_meta);
540 return ((page_meta->page_count * PAGE_SIZE) / z->elem_size);
541 }
542
543 /*
544 * Routine to lookup metadata for any given address.
545 * If init is marked as TRUE, this should be called without holding the zone lock
546 * since the initialization might block.
547 */
548 static inline struct zone_page_metadata *
549 get_zone_page_metadata(struct zone_free_element *element, boolean_t init)
550 {
551 struct zone_page_metadata *page_meta = 0;
552
553 if (from_zone_map(element, sizeof(struct zone_free_element))) {
554 page_meta = (struct zone_page_metadata *)(PAGE_METADATA_FOR_ELEMENT(element));
555 if (init)
556 zone_populate_metadata_page(page_meta);
557 } else {
558 page_meta = (struct zone_page_metadata *)(trunc_page((vm_offset_t)element));
559 }
560 if (init)
561 __nosan_bzero((char *)page_meta, sizeof(struct zone_page_metadata));
562 return ((PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC) ? page_meta : page_metadata_get_realmeta(page_meta));
563 }
564
565 /* Routine to get the page for a given metadata */
566 static inline vm_offset_t
567 get_zone_page(struct zone_page_metadata *page_meta)
568 {
569 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata)))
570 return (vm_offset_t)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
571 else
572 return (vm_offset_t)(trunc_page(page_meta));
573 }
574
575 /*
576 * ZTAGS
577 */
578
579 #if VM_MAX_TAG_ZONES
580
581 // for zones with tagging enabled:
582
583 // calculate a pointer to the tag base entry,
584 // holding either a uint32_t the first tag offset for a page in the zone map,
585 // or two uint16_t tags if the page can only hold one or two elements
586
587 #define ZTAGBASE(zone, element) \
588 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_map_min_address)])
589
590 // pointer to the tag for an element
591 #define ZTAG(zone, element) \
592 ({ \
593 vm_tag_t * result; \
594 if ((zone)->tags_inline) { \
595 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
596 if ((page_mask & element) >= (zone)->elem_size) result++; \
597 } else { \
598 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / (zone)->elem_size]; \
599 } \
600 result; \
601 })
602
603
604 static vm_offset_t zone_tagbase_min;
605 static vm_offset_t zone_tagbase_max;
606 static vm_offset_t zone_tagbase_map_size;
607 static vm_map_t zone_tagbase_map;
608
609 static vm_offset_t zone_tags_min;
610 static vm_offset_t zone_tags_max;
611 static vm_offset_t zone_tags_map_size;
612 static vm_map_t zone_tags_map;
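/*
 * Illustrative sketch (VM_MAX_TAG_ZONES builds only, not used by the
 * allocator): reading the vm_tag_t recorded for an element via ZTAG().
 * Tags are stored shifted left by one, leaving bit 0 as an allocation
 * state bit (see the "b0 clear" comment in try_alloc_from_zone()), so the
 * tag is recovered by shifting right, as zone_element_info() also does.
 * 'zone' and 'element' are hypothetical; the zone must have tagging enabled.
 */
static inline vm_tag_t __unused
example_tag_for_element(zone_t zone, vm_offset_t element)
{
	return (vm_tag_t)(ZTAG(zone, element)[0] >> 1);		/* strip the state bit */
}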
613
614 // simple heap allocator for allocating the tags for new memory
615
616 decl_lck_mtx_data(,ztLock) /* heap lock */
617 enum
618 {
619 ztFreeIndexCount = 8,
620 ztFreeIndexMax = (ztFreeIndexCount - 1),
621 ztTagsPerBlock = 4
622 };
623
624 struct ztBlock
625 {
626 #if __LITTLE_ENDIAN__
627 uint64_t free:1,
628 next:21,
629 prev:21,
630 size:21;
631 #else
632 // ztBlock needs free bit least significant
633 #error !__LITTLE_ENDIAN__
634 #endif
635 };
636 typedef struct ztBlock ztBlock;
637
638 static ztBlock * ztBlocks;
639 static uint32_t ztBlocksCount;
640 static uint32_t ztBlocksFree;
641
642 static uint32_t
643 ztLog2up(uint32_t size)
644 {
645 if (1 == size) size = 0;
646 else size = 32 - __builtin_clz(size - 1);
647 return (size);
648 }
649
650 static uint32_t
651 ztLog2down(uint32_t size)
652 {
653 size = 31 - __builtin_clz(size);
654 return (size);
655 }
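/*
 * Worked examples (illustrative): ztLog2up(1) == 0 (2^0 is the smallest
 * power of two >= 1), ztLog2up(5) == 3 (2^3 = 8 >= 5), and
 * ztLog2down(5) == 2 (2^2 = 4 <= 5).
 */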
656
657 static void
658 ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
659 {
660 vm_map_offset_t addr = (vm_map_offset_t) address;
661 vm_map_offset_t page, end;
662
663 page = trunc_page(addr);
664 end = round_page(addr + size);
665
666 for (; page < end; page += page_size)
667 {
668 if (!pmap_find_phys(kernel_pmap, page))
669 {
670 kern_return_t __unused
671 ret = kernel_memory_populate(map, page, PAGE_SIZE,
672 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
673 assert(ret == KERN_SUCCESS);
674 }
675 }
676 }
677
678 static boolean_t
679 ztPresent(const void * address, size_t size)
680 {
681 vm_map_offset_t addr = (vm_map_offset_t) address;
682 vm_map_offset_t page, end;
683 boolean_t result;
684
685 page = trunc_page(addr);
686 end = round_page(addr + size);
687 for (result = TRUE; (page < end); page += page_size)
688 {
689 result = pmap_find_phys(kernel_pmap, page);
690 if (!result) break;
691 }
692 return (result);
693 }
694
695
696 void __unused
697 ztDump(boolean_t sanity);
698 void __unused
699 ztDump(boolean_t sanity)
700 {
701 uint32_t q, cq, p;
702
703 for (q = 0; q <= ztFreeIndexMax; q++)
704 {
705 p = q;
706 do
707 {
708 if (sanity)
709 {
710 cq = ztLog2down(ztBlocks[p].size);
711 if (cq > ztFreeIndexMax) cq = ztFreeIndexMax;
712 if (!ztBlocks[p].free
713 || ((p != q) && (q != cq))
714 || (ztBlocks[ztBlocks[p].next].prev != p)
715 || (ztBlocks[ztBlocks[p].prev].next != p))
716 {
717 kprintf("zterror at %d", p);
718 ztDump(FALSE);
719 kprintf("zterror at %d", p);
720 assert(FALSE);
721 }
722 continue;
723 }
724 kprintf("zt[%03d]%c %d, %d, %d\n",
725 p, ztBlocks[p].free ? 'F' : 'A',
726 ztBlocks[p].next, ztBlocks[p].prev,
727 ztBlocks[p].size);
728 p = ztBlocks[p].next;
729 if (p == q) break;
730 }
731 while (p != q);
732 if (!sanity) printf("\n");
733 }
734 if (!sanity) printf("-----------------------\n");
735 }
736
737
738
739 #define ZTBDEQ(idx) \
740 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
741 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
742
743 static void
744 ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
745 {
746 uint32_t q, w, p, size, merge;
747
748 assert(count);
749 ztBlocksFree += count;
750
751 // merge with the following block
752 merge = (index + count);
753 if ((merge < ztBlocksCount)
754 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
755 && ztBlocks[merge].free)
756 {
757 ZTBDEQ(merge);
758 count += ztBlocks[merge].size;
759 }
760
761 // merge with the preceding block
762 merge = (index - 1);
763 if ((merge > ztFreeIndexMax)
764 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
765 && ztBlocks[merge].free)
766 {
767 size = ztBlocks[merge].size;
768 count += size;
769 index -= size;
770 ZTBDEQ(index);
771 }
772
773 q = ztLog2down(count);
774 if (q > ztFreeIndexMax) q = ztFreeIndexMax;
775 w = q;
776 // queue in order of size
777 while (TRUE)
778 {
779 p = ztBlocks[w].next;
780 if (p == q) break;
781 if (ztBlocks[p].size >= count) break;
782 w = p;
783 }
784 ztBlocks[p].prev = index;
785 ztBlocks[w].next = index;
786
787 // fault in first
788 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
789
790 // mark first & last with free flag and size
791 ztBlocks[index].free = TRUE;
792 ztBlocks[index].size = count;
793 ztBlocks[index].prev = w;
794 ztBlocks[index].next = p;
795 if (count > 1)
796 {
797 index += (count - 1);
798 // fault in last
799 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
800 ztBlocks[index].free = TRUE;
801 ztBlocks[index].size = count;
802 }
803 }
804
805 static uint32_t
806 ztAlloc(zone_t zone, uint32_t count)
807 {
808 uint32_t q, w, p, leftover;
809
810 assert(count);
811
812 q = ztLog2up(count);
813 if (q > ztFreeIndexMax) q = ztFreeIndexMax;
814 do
815 {
816 w = q;
817 while (TRUE)
818 {
819 p = ztBlocks[w].next;
820 if (p == q) break;
821 if (ztBlocks[p].size >= count)
822 {
823 // dequeue, mark both ends allocated
824 ztBlocks[w].next = ztBlocks[p].next;
825 ztBlocks[ztBlocks[p].next].prev = w;
826 ztBlocks[p].free = FALSE;
827 ztBlocksFree -= ztBlocks[p].size;
828 if (ztBlocks[p].size > 1) ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
829
830 // fault all the allocation
831 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
832 // mark last as allocated
833 if (count > 1) ztBlocks[p + count - 1].free = FALSE;
834 // free remainder
835 leftover = ztBlocks[p].size - count;
836 if (leftover) ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
837
838 return (p);
839 }
840 w = p;
841 }
842 q++;
843 }
844 while (q <= ztFreeIndexMax);
845
846 return (-1U);
847 }
848
849 static void
850 ztInit(vm_size_t max_zonemap_size, lck_grp_t * group)
851 {
852 kern_return_t ret;
853 vm_map_kernel_flags_t vmk_flags;
854 uint32_t idx;
855
856 lck_mtx_init(&ztLock, group, LCK_ATTR_NULL);
857
858 // allocate submaps VM_KERN_MEMORY_DIAG
859
860 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
861 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
862 vmk_flags.vmkf_permanent = TRUE;
863 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
864 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
865 &zone_tagbase_map);
866
867 if (ret != KERN_SUCCESS) panic("zone_init: kmem_suballoc failed");
868 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
869
870 zone_tags_map_size = 2048*1024 * sizeof(vm_tag_t);
871 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
872 vmk_flags.vmkf_permanent = TRUE;
873 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
874 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
875 &zone_tags_map);
876
877 if (ret != KERN_SUCCESS) panic("zone_init: kmem_suballoc failed");
878 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
879
880 ztBlocks = (ztBlock *) zone_tags_min;
881 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
882
883 // initialize the qheads
884 lck_mtx_lock(&ztLock);
885
886 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
887 for (idx = 0; idx < ztFreeIndexCount; idx++)
888 {
889 ztBlocks[idx].free = TRUE;
890 ztBlocks[idx].next = idx;
891 ztBlocks[idx].prev = idx;
892 ztBlocks[idx].size = 0;
893 }
894 // free remaining space
895 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
896
897 lck_mtx_unlock(&ztLock);
898 }
899
900 static void
901 ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
902 {
903 uint32_t * tagbase;
904 uint32_t count, block, blocks, idx;
905 size_t pages;
906
907 pages = atop(size);
908 tagbase = ZTAGBASE(zone, mem);
909
910 lck_mtx_lock(&ztLock);
911
912 // fault tagbase
913 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
914
915 if (!zone->tags_inline)
916 {
917 // allocate tags
918 count = (uint32_t)(size / zone->elem_size);
919 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
920 block = ztAlloc(zone, blocks);
921 if (-1U == block) ztDump(false);
922 assert(-1U != block);
923 }
924
925 lck_mtx_unlock(&ztLock);
926
927 if (!zone->tags_inline)
928 {
929 // set tag base for each page
930 block *= ztTagsPerBlock;
931 for (idx = 0; idx < pages; idx++)
932 {
933 tagbase[idx] = block + (uint32_t)((ptoa(idx) + (zone->elem_size - 1)) / zone->elem_size);
934 }
935 }
936 }
937
938 static void
939 ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
940 {
941 uint32_t * tagbase;
942 uint32_t count, block, blocks, idx;
943 size_t pages;
944
945 // set tag base for each page
946 pages = atop(size);
947 tagbase = ZTAGBASE(zone, mem);
948 block = tagbase[0];
949 for (idx = 0; idx < pages; idx++)
950 {
951 tagbase[idx] = 0xFFFFFFFF;
952 }
953
954 lck_mtx_lock(&ztLock);
955 if (!zone->tags_inline)
956 {
957 count = (uint32_t)(size / zone->elem_size);
958 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
959 assert(block != 0xFFFFFFFF);
960 block /= ztTagsPerBlock;
961 ztFree(NULL /* zone is unlocked */, block, blocks);
962 }
963
964 lck_mtx_unlock(&ztLock);
965 }
966
967 uint32_t
968 zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
969 {
970 zone_t z;
971 uint32_t idx;
972
973 simple_lock(&all_zones_lock);
974
975 for (idx = 0; idx < num_zones; idx++)
976 {
977 z = &(zone_array[idx]);
978 if (!z->tags) continue;
979 if (tag_zone_index != z->tag_zone_index) continue;
980 *elem_size = z->elem_size;
981 break;
982 }
983
984 simple_unlock(&all_zones_lock);
985
986 if (idx == num_zones) idx = -1U;
987
988 return (idx);
989 }
990
991 #endif /* VM_MAX_TAG_ZONES */
992
993 /* Routine to get the size of a zone-allocated address. If the address doesn't belong to the
994 * zone_map, returns 0.
995 */
996 vm_size_t
997 zone_element_size(void *addr, zone_t *z)
998 {
999 struct zone *src_zone;
1000 if (from_zone_map(addr, sizeof(void *))) {
1001 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
1002 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
1003 if (z) {
1004 *z = src_zone;
1005 }
1006 return (src_zone->elem_size);
1007 } else {
1008 #if CONFIG_GZALLOC
1009 vm_size_t gzsize;
1010 if (gzalloc_element_size(addr, z, &gzsize)) {
1011 return gzsize;
1012 }
1013 #endif /* CONFIG_GZALLOC */
1014
1015 return 0;
1016 }
1017 }
1018
1019 #if DEBUG || DEVELOPMENT
1020
1021 vm_size_t
1022 zone_element_info(void *addr, vm_tag_t * ptag)
1023 {
1024 vm_size_t size = 0;
1025 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1026 struct zone * src_zone;
1027
1028 if (from_zone_map(addr, sizeof(void *))) {
1029 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
1030 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
1031 #if VM_MAX_TAG_ZONES
1032 if (__improbable(src_zone->tags)) {
1033 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1034 }
1035 #endif /* VM_MAX_TAG_ZONES */
1036 size = src_zone->elem_size;
1037 } else {
1038 #if CONFIG_GZALLOC
1039 gzalloc_element_size(addr, NULL, &size);
1040 #endif /* CONFIG_GZALLOC */
1041 }
1042 *ptag = tag;
1043 return size;
1044 }
1045
1046 #endif /* DEBUG || DEVELOPMENT */
1047
1048 /*
1049 * Zone checking helper function.
1050 * A pointer that satisfies these conditions is OK to be a freelist next pointer
1051 * A pointer that doesn't satisfy these conditions indicates corruption
1052 */
1053 static inline boolean_t
1054 is_sane_zone_ptr(zone_t zone,
1055 vm_offset_t addr,
1056 size_t obj_size)
1057 {
1058 /* Must be aligned to pointer boundary */
1059 if (__improbable((addr & (sizeof(vm_offset_t) - 1)) != 0))
1060 return FALSE;
1061
1062 /* Must be a kernel address */
1063 if (__improbable(!pmap_kernel_va(addr)))
1064 return FALSE;
1065
1066 /* Must be from zone map if the zone only uses memory from the zone_map */
1067 /*
1068 * TODO: Remove the zone->collectable check when every
1069 * zone using foreign memory is properly tagged with allows_foreign
1070 */
1071 if (zone->collectable && !zone->allows_foreign) {
1072 /* check if addr is from zone map */
1073 if (addr >= zone_map_min_address &&
1074 (addr + obj_size - 1) < zone_map_max_address )
1075 return TRUE;
1076
1077 return FALSE;
1078 }
1079
1080 return TRUE;
1081 }
1082
1083 static inline boolean_t
1084 is_sane_zone_page_metadata(zone_t zone,
1085 vm_offset_t page_meta)
1086 {
1087 /* NULL page metadata structures are invalid */
1088 if (page_meta == 0)
1089 return FALSE;
1090 return is_sane_zone_ptr(zone, page_meta, sizeof(struct zone_page_metadata));
1091 }
1092
1093 static inline boolean_t
1094 is_sane_zone_element(zone_t zone,
1095 vm_offset_t addr)
1096 {
1097 /* NULL is OK because it indicates the tail of the list */
1098 if (addr == 0)
1099 return TRUE;
1100 return is_sane_zone_ptr(zone, addr, zone->elem_size);
1101 }
1102
1103 /* Someone wrote to freed memory. */
1104 static inline void /* noreturn */
1105 zone_element_was_modified_panic(zone_t zone,
1106 vm_offset_t element,
1107 vm_offset_t found,
1108 vm_offset_t expected,
1109 vm_offset_t offset)
1110 {
1111 panic("a freed zone element has been modified in zone %s: expected %p but found %p, bits changed %p, at offset %d of %d in element %p, cookies %p %p",
1112 zone->zone_name,
1113 (void *) expected,
1114 (void *) found,
1115 (void *) (expected ^ found),
1116 (uint32_t) offset,
1117 (uint32_t) zone->elem_size,
1118 (void *) element,
1119 (void *) zp_nopoison_cookie,
1120 (void *) zp_poisoned_cookie);
1121 }
1122
1123 /*
1124 * The primary and backup pointers don't match.
1125 * Determine which one was likely the corrupted pointer, find out what it
1126 * probably should have been, and panic.
1127 * I would like to mark this as noreturn, but panic() isn't marked noreturn.
1128 */
1129 static void /* noreturn */
1130 backup_ptr_mismatch_panic(zone_t zone,
1131 vm_offset_t element,
1132 vm_offset_t primary,
1133 vm_offset_t backup)
1134 {
1135 vm_offset_t likely_backup;
1136 vm_offset_t likely_primary;
1137
1138 likely_primary = primary ^ zp_nopoison_cookie;
1139 boolean_t sane_backup;
1140 boolean_t sane_primary = is_sane_zone_element(zone, likely_primary);
1141 boolean_t element_was_poisoned = (backup & 0x1) ? TRUE : FALSE;
1142
1143 #if defined(__LP64__)
1144 /* We can inspect the tag in the upper bits for additional confirmation */
1145 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000)
1146 element_was_poisoned = TRUE;
1147 else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000)
1148 element_was_poisoned = FALSE;
1149 #endif
1150
1151 if (element_was_poisoned) {
1152 likely_backup = backup ^ zp_poisoned_cookie;
1153 sane_backup = is_sane_zone_element(zone, likely_backup);
1154 } else {
1155 likely_backup = backup ^ zp_nopoison_cookie;
1156 sane_backup = is_sane_zone_element(zone, likely_backup);
1157 }
1158
1159 /* The primary is definitely the corrupted one */
1160 if (!sane_primary && sane_backup)
1161 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1162
1163 /* The backup is definitely the corrupted one */
1164 if (sane_primary && !sane_backup)
1165 zone_element_was_modified_panic(zone, element, backup,
1166 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
1167 zone->elem_size - sizeof(vm_offset_t));
1168
1169 /*
1170 * Not sure which is the corrupted one.
1171 * It's less likely that the backup pointer was overwritten with
1172 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1173 * primary pointer has been overwritten with a sane but incorrect address.
1174 */
1175 if (sane_primary && sane_backup)
1176 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1177
1178 /* Neither are sane, so just guess. */
1179 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1180 }
1181
1182 /*
1183 * Adds the element to the head of the zone's free list
1184 * Keeps a backup next-pointer at the end of the element
1185 */
1186 static inline void
1187 free_to_zone(zone_t zone,
1188 vm_offset_t element,
1189 boolean_t poison)
1190 {
1191 vm_offset_t old_head;
1192 struct zone_page_metadata *page_meta;
1193
1194 vm_offset_t *primary = (vm_offset_t *) element;
1195 vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary);
1196
1197 page_meta = get_zone_page_metadata((struct zone_free_element *)element, FALSE);
1198 assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
1199 old_head = (vm_offset_t)page_metadata_get_freelist(page_meta);
1200
1201 #if MACH_ASSERT
1202 if (__improbable(!is_sane_zone_element(zone, old_head)))
1203 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
1204 (void *) old_head, zone->zone_name);
1205 #endif
1206
1207 if (__improbable(!is_sane_zone_element(zone, element)))
1208 panic("zfree: freeing invalid pointer %p to zone %s\n",
1209 (void *) element, zone->zone_name);
1210
1211 /*
1212 * Always write a redundant next pointer
1213 * So that it is more difficult to forge, xor it with a random cookie
1214 * A poisoned element is indicated by using zp_poisoned_cookie
1215 * instead of zp_nopoison_cookie
1216 */
1217
1218 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
1219
1220 /*
1221 * Insert this element at the head of the free list. We also xor the
1222 * primary pointer with the zp_nopoison_cookie to make sure a free
1223 * element does not provide the location of the next free element directly.
1224 */
1225 *primary = old_head ^ zp_nopoison_cookie;
1226 page_metadata_set_freelist(page_meta, (struct zone_free_element *)element);
1227 page_meta->free_count++;
1228 if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
1229 if (page_meta->free_count == 1) {
1230 /* first foreign element freed on page, move from all_used */
1231 re_queue_tail(&zone->pages.any_free_foreign, &(page_meta->pages));
1232 } else {
1233 /* no other list transitions */
1234 }
1235 } else if (page_meta->free_count == get_metadata_alloc_count(page_meta)) {
1236 /* whether the page was on the intermediate or all_used queue, move it to all_free */
1237 re_queue_tail(&zone->pages.all_free, &(page_meta->pages));
1238 zone->count_all_free_pages += page_meta->page_count;
1239 } else if (page_meta->free_count == 1) {
1240 /* first free element on page, move from all_used */
1241 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
1242 }
1243 zone->count--;
1244 zone->countfree++;
1245
1246 #if KASAN_ZALLOC
1247 kasan_poison_range(element, zone->elem_size, ASAN_HEAP_FREED);
1248 #endif
1249 }
1250
1251
1252 /*
1253 * Removes an element from the zone's free list, returning 0 if the free list is empty.
1254 * Verifies that the next-pointer and backup next-pointer are intact,
1255 * and verifies that a poisoned element hasn't been modified.
1256 */
1257 static inline vm_offset_t
1258 try_alloc_from_zone(zone_t zone,
1259 vm_tag_t tag __unused,
1260 boolean_t* check_poison)
1261 {
1262 vm_offset_t element;
1263 struct zone_page_metadata *page_meta;
1264
1265 *check_poison = FALSE;
1266
1267 /* if zone is empty, bail */
1268 if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign))
1269 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
1270 else if (!queue_empty(&zone->pages.intermediate))
1271 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
1272 else if (!queue_empty(&zone->pages.all_free)) {
1273 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
1274 assert(zone->count_all_free_pages >= page_meta->page_count);
1275 zone->count_all_free_pages -= page_meta->page_count;
1276 } else {
1277 return 0;
1278 }
1279 /* Check that page_meta passes is_sane_zone_page_metadata */
1280 if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta)))
1281 panic("zalloc: invalid metadata structure %p for freelist of zone %s\n",
1282 (void *) page_meta, zone->zone_name);
1283 assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
1284 element = (vm_offset_t)page_metadata_get_freelist(page_meta);
1285
1286 if (__improbable(!is_sane_zone_ptr(zone, element, zone->elem_size)))
1287 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
1288 (void *) element, zone->zone_name);
1289
1290 vm_offset_t *primary = (vm_offset_t *) element;
1291 vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary);
1292
1293 /*
1294 * Since the primary next pointer is xor'ed with zp_nopoison_cookie
1295 * for obfuscation, retrieve the original value back
1296 */
1297 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
1298 vm_offset_t next_element_primary = *primary;
1299 vm_offset_t next_element_backup = *backup;
1300
1301 /*
1302 * backup_ptr_mismatch_panic will determine what next_element
1303 * should have been, and print it appropriately
1304 */
1305 if (__improbable(!is_sane_zone_element(zone, next_element)))
1306 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
1307
1308 /* Check the backup pointer for the regular cookie */
1309 if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) {
1310
1311 /* Check for the poisoned cookie instead */
1312 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie)))
1313 /* Neither cookie is valid, corruption has occurred */
1314 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
1315
1316 /*
1317 * Element was marked as poisoned, so check its integrity before using it.
1318 */
1319 *check_poison = TRUE;
1320 }
1321
1322 /* Make sure the page_meta is at the correct offset from the start of page */
1323 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element, FALSE)))
1324 panic("zalloc: Incorrect metadata %p found in zone %s page queue. Expected metadata: %p\n",
1325 page_meta, zone->zone_name, get_zone_page_metadata((struct zone_free_element *)element, FALSE));
1326
1327 /* Make sure next_element belongs to the same page as page_meta */
1328 if (next_element) {
1329 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element, FALSE)))
1330 panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n",
1331 (void *)next_element, (void *)element, zone->zone_name);
1332 }
1333
1334 /* Remove this element from the free list */
1335 page_metadata_set_freelist(page_meta, (struct zone_free_element *)next_element);
1336 page_meta->free_count--;
1337
1338 if (page_meta->free_count == 0) {
1339 /* move to all used */
1340 re_queue_tail(&zone->pages.all_used, &(page_meta->pages));
1341 } else {
1342 if (!zone->allows_foreign || from_zone_map(element, zone->elem_size)) {
1343 if (get_metadata_alloc_count(page_meta) == page_meta->free_count + 1) {
1344 /* remove from free, move to intermediate */
1345 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
1346 }
1347 }
1348 }
1349 zone->countfree--;
1350 zone->count++;
1351 zone->sum_count++;
1352
1353 #if VM_MAX_TAG_ZONES
1354 if (__improbable(zone->tags)) {
1355 // set the tag with b0 clear so the block remains inuse
1356 ZTAG(zone, element)[0] = (tag << 1);
1357 }
1358 #endif /* VM_MAX_TAG_ZONES */
1359
1360
1361 #if KASAN_ZALLOC
1362 kasan_poison_range(element, zone->elem_size, ASAN_VALID);
1363 #endif
1364
1365 return element;
1366 }
1367
1368 /*
1369 * End of zone poisoning
1370 */
1371
1372 /*
1373 * Zone info options
1374 */
1375 #define ZINFO_SLOTS MAX_ZONES /* for now */
1376
1377 zone_t zone_find_largest(void);
1378
1379 /*
1380 * Async allocation of zones
1381 * This mechanism allows for bootstrapping an empty zone which is setup with
1382 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
1383 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
1384 * This will prime the zone for the next use.
1385 *
1386 * Currently the thread_callout function (zalloc_async) will loop through all zones
1387 * looking for any zone with async_pending set and do the work for it.
1388 *
1389 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
1390 * then zalloc_noblock to an empty zone may succeed.
1391 */
1392 void zalloc_async(
1393 thread_call_param_t p0,
1394 thread_call_param_t p1);
1395
1396 static thread_call_data_t call_async_alloc;
1397
1398 /*
1399 * Align elements that use the zone page list to 32 byte boundaries.
1400 */
1401 #define ZONE_ELEMENT_ALIGNMENT 32
1402
1403 #define zone_wakeup(zone) thread_wakeup((event_t)(zone))
1404 #define zone_sleep(zone) \
1405 (void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN_ALWAYS, (event_t)(zone), THREAD_UNINT);
1406
1407 /*
1408 * The zone_locks_grp allows for collecting lock statistics.
1409 * All locks are associated to this group in zinit.
1410 * Look at tools/lockstat for debugging lock contention.
1411 */
1412
1413 lck_grp_t zone_locks_grp;
1414 lck_grp_attr_t zone_locks_grp_attr;
1415
1416 #define lock_zone_init(zone) \
1417 MACRO_BEGIN \
1418 lck_attr_setdefault(&(zone)->lock_attr); \
1419 lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext, \
1420 &zone_locks_grp, &(zone)->lock_attr); \
1421 MACRO_END
1422
1423 #define lock_try_zone(zone) lck_mtx_try_lock_spin(&zone->lock)
1424
1425 /*
1426 * Exclude more than one concurrent garbage collection
1427 */
1428 decl_lck_mtx_data(, zone_gc_lock)
1429
1430 lck_attr_t zone_gc_lck_attr;
1431 lck_grp_t zone_gc_lck_grp;
1432 lck_grp_attr_t zone_gc_lck_grp_attr;
1433 lck_mtx_ext_t zone_gc_lck_ext;
1434
1435 boolean_t zone_gc_allowed = TRUE;
1436 boolean_t panic_include_zprint = FALSE;
1437
1438 mach_memory_info_t *panic_kext_memory_info = NULL;
1439 vm_size_t panic_kext_memory_size = 0;
1440
1441 #define ZALLOC_DEBUG_ZONEGC 0x00000001
1442 #define ZALLOC_DEBUG_ZCRAM 0x00000002
1443 uint32_t zalloc_debug = 0;
1444
1445 /*
1446 * Zone leak debugging code
1447 *
1448 * When enabled, this code keeps a log to track allocations to a particular zone that have not
1449 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
1450 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
1451 * off by default.
1452 *
1453 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
1454 * is the name of the zone you wish to log.
1455 *
1456 * This code only tracks one zone, so you need to identify which one is leaking first.
1457 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
1458 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
1459 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
1460 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
1461 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
1462 * See the help in the kgmacros for usage info.
1463 *
1464 *
1465 * Zone corruption logging
1466 *
1467 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
1468 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
1469 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
1470 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
1471 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
1472 * corrupted to examine its history. This should lead to the source of the corruption.
1473 */
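/*
 * Example boot-arg combinations (illustrative; the zone name "kalloc.64" is
 * just a stand-in for whichever zone you are debugging):
 *
 *	zlog=kalloc.64 zrecs=4000	track outstanding allocations (leak debugging)
 *	zlog=kalloc.64 -zc		track both allocs and frees (corruption debugging)
 *
 * Remember that a '.' in the zlog value matches a space in the zone name,
 * since boot-args cannot contain spaces (see track_this_zone() below).
 */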
1474
1475 static boolean_t log_records_init = FALSE;
1476 static int log_records; /* size of the log, expressed in number of records */
1477
1478 #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
1479
1480 static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
1481 static int num_zones_logged = 0;
1482
1483 static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging, if any */
1484
1485 /* Log allocations and frees to help debug a zone element corruption */
1486 boolean_t corruption_debug_flag = FALSE; /* enabled by "-zc" boot-arg */
1487 /* Making pointer scanning leaks detection possible for all zones */
1488
1489 #if DEBUG || DEVELOPMENT
1490 boolean_t leak_scan_debug_flag = FALSE; /* enabled by "-zl" boot-arg */
1491 #endif /* DEBUG || DEVELOPMENT */
1492
1493
1494 /*
1495 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
1496 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
1497 * is the number of stacks suspected of leaking, we don't need many records.
1498 */
1499
1500 #if defined(__LP64__)
1501 #define ZRECORDS_MAX 2560 /* Max records allowed in the log */
1502 #else
1503 #define ZRECORDS_MAX 1536 /* Max records allowed in the log */
1504 #endif
1505 #define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */
1506
1507 /*
1508 * Each record in the log contains a pointer to the zone element it refers to,
1509 * and a small array to hold the pc's from the stack trace. A
1510 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
1511 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
1512 * If the log fills, old records are replaced as if it were a circular buffer.
1513 */
1514
1515
1516 /*
1517 * Opcodes for the btlog operation field:
1518 */
1519
1520 #define ZOP_ALLOC 1
1521 #define ZOP_FREE 0
1522
1523 /*
1524 * Decide if we want to log this zone by doing a string compare between a zone name and the name
1525 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
1526 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
1527 * match a space in the zone name.
1528 */
1529
1530 int
1531 track_this_zone(const char *zonename, const char *logname)
1532 {
1533 int len;
1534 const char *zc = zonename;
1535 const char *lc = logname;
1536
1537 /*
1538 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1539 */
1540
1541 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1542
1543 /*
1544 * If the current characters don't match, check for a space
1545 * in the zone name and a corresponding period in the log name.
1546 * If that's not there, then the strings don't match.
1547 */
1548
1549 if (*zc != *lc && !(*zc == ' ' && *lc == '.'))
1550 break;
1551
1552 /*
1553 * The strings are equal so far. If we're at the end, then it's a match.
1554 */
1555
1556 if (*zc == '\0')
1557 return TRUE;
1558 }
1559
1560 return FALSE;
1561 }
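/*
 * Usage sketch (illustrative): because boot-args cannot contain spaces, a
 * '.' in the zlog value stands in for a space in the zone name. The zone
 * name "vm objects" below is purely hypothetical, used only to show the match.
 */
static inline boolean_t __unused
example_zlog_name_match(void)
{
	return track_this_zone("vm objects", "vm.objects");	/* TRUE: '.' matches ' ' */
}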
1562
1563
1564 /*
1565 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
1566 * the buffer for the records has been allocated.
1567 */
1568
1569 #define DO_LOGGING(z) (z->zone_logging == TRUE && z->zlog_btlog)
1570
1571 extern boolean_t kmem_alloc_ready;
1572
1573 #if CONFIG_ZLEAKS
1574 #pragma mark -
1575 #pragma mark Zone Leak Detection
1576
1577 /*
1578 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
1579 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
1580 * backtrace. Every free, we examine the table and determine if the allocation was being tracked,
1581 * and stop tracking it if it was being tracked.
1582 *
1583 * We track the allocations in the zallocations hash table, which stores the address that was returned from
1584 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
1585 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
1586 * backtraces - we don't store them more than once.
1587 *
1588 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
1589 * a large amount of virtual space.
1590 */
1591 #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
1592 #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
1593 #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
1594 #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
1595 uint32_t zleak_state = 0; /* State of collection, as above */
1596
1597 boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
1598 vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
1599 vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
1600 unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
1601
1602 /*
1603 * Counters for allocation statistics.
1604 */
1605
1606 /* Times two active records want to occupy the same spot */
1607 unsigned int z_alloc_collisions = 0;
1608 unsigned int z_trace_collisions = 0;
1609
1610 /* Times a new record lands on a spot previously occupied by a freed allocation */
1611 unsigned int z_alloc_overwrites = 0;
1612 unsigned int z_trace_overwrites = 0;
1613
1614 /* Times a new alloc or trace is put into the hash table */
1615 unsigned int z_alloc_recorded = 0;
1616 unsigned int z_trace_recorded = 0;
1617
1618 /* Times zleak_log returned false due to not being able to acquire the lock */
1619 unsigned int z_total_conflicts = 0;
1620
1621
1622 #pragma mark struct zallocation
1623 /*
1624 * Structure for keeping track of an allocation
1625 * An allocation bucket is in use if its element is not NULL
1626 */
1627 struct zallocation {
1628 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
1629 vm_size_t za_size; /* how much memory did this allocation take up? */
1630 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
1631 /* TODO: #if this out */
1632 uint32_t za_hit_count; /* for determining effectiveness of hash function */
1633 };
1634
1635 /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
1636 uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
1637 uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
1638
1639 vm_size_t zleak_max_zonemap_size;
1640
1641 /* Hashmaps of allocations and their corresponding traces */
1642 static struct zallocation* zallocations;
1643 static struct ztrace* ztraces;
1644
1645 /* not static so that panic can see this, see kern/debug.c */
1646 struct ztrace* top_ztrace;
1647
1648 /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
1649 static lck_spin_t zleak_lock;
1650 static lck_attr_t zleak_lock_attr;
1651 static lck_grp_t zleak_lock_grp;
1652 static lck_grp_attr_t zleak_lock_grp_attr;
1653
1654 /*
1655 * Initializes the zone leak monitor. Called from zone_init()
1656 */
1657 static void
1658 zleak_init(vm_size_t max_zonemap_size)
1659 {
1660 char scratch_buf[16];
1661 boolean_t zleak_enable_flag = FALSE;
1662
1663 zleak_max_zonemap_size = max_zonemap_size;
1664 zleak_global_tracking_threshold = max_zonemap_size / 2;
1665 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
1666
1667 #if CONFIG_EMBEDDED
1668 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
1669 zleak_enable_flag = TRUE;
1670 printf("zone leak detection enabled\n");
1671 } else {
1672 zleak_enable_flag = FALSE;
1673 printf("zone leak detection disabled\n");
1674 }
1675 #else /* CONFIG_EMBEDDED */
1676 /* -zleakoff (flag to disable zone leak monitor) */
1677 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
1678 zleak_enable_flag = FALSE;
1679 printf("zone leak detection disabled\n");
1680 } else {
1681 zleak_enable_flag = TRUE;
1682 printf("zone leak detection enabled\n");
1683 }
1684 #endif /* CONFIG_EMBEDDED */
1685
1686 /* zfactor=XXXX (override how often to sample the zone allocator) */
1687 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
1688 printf("Zone leak factor override: %u\n", zleak_sample_factor);
1689 }
1690
1691 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
1692 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
1693 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
1694 /* uses the 'is power of 2' trick: ((0x01000 & 0x00FFF) == 0) */
1695 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets-1))) {
1696 printf("Override isn't a power of two, bad things might happen!\n");
1697 }
1698 }
1699
1700 /* zleak-traces=XXXX (override number of buckets in ztraces) */
1701 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
1702 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
1703 /* uses the 'is power of 2' trick: ((0x01000 & 0x00FFF) == 0); see the worked example below */
1704 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets-1))) {
1705 printf("Override isn't a power of two, bad things might happen!\n");
1706 }
1707 }
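/*
 * Worked example of the power-of-two check used by both overrides above:
 * for n = 0x1000, (n & (n - 1)) == (0x1000 & 0x0FFF) == 0, so the override is accepted quietly;
 * for n = 0x1800, (n & (n - 1)) == (0x1800 & 0x17FF) == 0x1000, so the warning above is printed.
 */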
1708
1709 /* allocate the zleak_lock */
1710 lck_grp_attr_setdefault(&zleak_lock_grp_attr);
1711 lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr);
1712 lck_attr_setdefault(&zleak_lock_attr);
1713 lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr);
1714
1715 if (zleak_enable_flag) {
1716 zleak_state = ZLEAK_STATE_ENABLED;
1717 }
1718 }
1719
1720 #if CONFIG_ZLEAKS
1721
1722 /*
1723 * Support for kern.zleak.active sysctl - a simplified
1724 * version of the zleak_state variable.
1725 */
1726 int
1727 get_zleak_state(void)
1728 {
1729 if (zleak_state & ZLEAK_STATE_FAILED)
1730 return (-1);
1731 if (zleak_state & ZLEAK_STATE_ACTIVE)
1732 return (1);
1733 return (0);
1734 }
1735
1736 #endif
1737
1738
1739 kern_return_t
1740 zleak_activate(void)
1741 {
1742 kern_return_t retval;
1743 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
1744 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
1745 void *allocations_ptr = NULL;
1746 void *traces_ptr = NULL;
1747
1748 /* Only one thread attempts to activate at a time */
1749 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1750 return KERN_SUCCESS;
1751 }
1752
1753 /* Indicate that we're doing the setup */
1754 lck_spin_lock(&zleak_lock);
1755 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1756 lck_spin_unlock(&zleak_lock);
1757 return KERN_SUCCESS;
1758 }
1759
1760 zleak_state |= ZLEAK_STATE_ACTIVATING;
1761 lck_spin_unlock(&zleak_lock);
1762
1763 /* Allocate and zero tables */
1764 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
1765 if (retval != KERN_SUCCESS) {
1766 goto fail;
1767 }
1768
1769 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
1770 if (retval != KERN_SUCCESS) {
1771 goto fail;
1772 }
1773
1774 bzero(allocations_ptr, z_alloc_size);
1775 bzero(traces_ptr, z_trace_size);
1776
1777 /* Everything's set. Install tables, mark active. */
1778 zallocations = allocations_ptr;
1779 ztraces = traces_ptr;
1780
1781 /*
1782 * Initialize the top_ztrace to the first entry in ztraces,
1783 * so we don't have to check for null in zleak_log
1784 */
1785 top_ztrace = &ztraces[0];
1786
1787 /*
1788 * Note that we do need a barrier between installing
1789 * the tables and setting the active flag, because the zfree()
1790 * path accesses the table without a lock if we're active.
1791 */
1792 lck_spin_lock(&zleak_lock);
1793 zleak_state |= ZLEAK_STATE_ACTIVE;
1794 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1795 lck_spin_unlock(&zleak_lock);
1796
1797 	return KERN_SUCCESS;
1798
1799 fail:
1800 /*
1801 * If we fail to allocate memory, don't further tax
1802 * the system by trying again.
1803 */
1804 lck_spin_lock(&zleak_lock);
1805 zleak_state |= ZLEAK_STATE_FAILED;
1806 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1807 lck_spin_unlock(&zleak_lock);
1808
1809 if (allocations_ptr != NULL) {
1810 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
1811 }
1812
1813 if (traces_ptr != NULL) {
1814 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
1815 }
1816
1817 return retval;
1818 }
1819
1820 /*
1821 * TODO: What about allocations that never get deallocated,
1822 * especially ones with unique backtraces? Should we wait to record
1823 * until after boot has completed?
1824 * (How many persistent zallocs are there?)
1825 */
1826
1827 /*
1828 * This function records the allocation in the allocations table,
1829 * and stores the associated backtrace in the traces table
1830  * (or just bumps the trace's refcount if the same trace is already recorded).
1831  * If the allocation slot is already occupied by a record with a different trace, the old
1832  * record is replaced and the old trace's refcount is decremented.
1833  * If the trace slot is occupied by a different backtrace, the function bails out.
1834  * The refcount here is zt_size, which grows by the amount of memory each recorded allocation consumes.
1835  * The return value is FALSE only when the lock could not be taken, in which case the caller
1836  * re-arms sampling for the next allocation.
1836 */
1837 static boolean_t
1838 zleak_log(uintptr_t* bt,
1839 uintptr_t addr,
1840 uint32_t depth,
1841 vm_size_t allocation_size)
1842 {
1843 /* Quit if there's someone else modifying the hash tables */
1844 if (!lck_spin_try_lock(&zleak_lock)) {
1845 z_total_conflicts++;
1846 return FALSE;
1847 }
1848
1849 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1850
1851 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
1852 struct ztrace* trace = &ztraces[trace_index];
1853
1854 allocation->za_hit_count++;
1855 trace->zt_hit_count++;
1856
1857 /*
1858 * If the allocation bucket we want to be in is occupied, and if the occupier
1859 * has the same trace as us, just bail.
1860 */
1861 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
1862 z_alloc_collisions++;
1863
1864 lck_spin_unlock(&zleak_lock);
1865 return TRUE;
1866 }
1867
1868 /* STEP 1: Store the backtrace in the traces array. */
1869 /* A size of zero indicates that the trace bucket is free. */
1870
1871 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0 ) {
1872 /*
1873 * Different unique trace with same hash!
1874 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
1875 * and get out of the way for later chances
1876 */
1877 trace->zt_collisions++;
1878 z_trace_collisions++;
1879
1880 lck_spin_unlock(&zleak_lock);
1881 return TRUE;
1882 } else if (trace->zt_size > 0) {
1883 /* Same trace, already added, so increment refcount */
1884 trace->zt_size += allocation_size;
1885 } else {
1886 /* Found an unused trace bucket, record the trace here! */
1887 if (trace->zt_depth != 0) /* if this slot was previously used but not currently in use */
1888 z_trace_overwrites++;
1889
1890 z_trace_recorded++;
1891 trace->zt_size = allocation_size;
1892 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)) );
1893
1894 trace->zt_depth = depth;
1895 trace->zt_collisions = 0;
1896 }
1897
1898 /* STEP 2: Store the allocation record in the allocations array. */
1899
1900 if (allocation->za_element != (uintptr_t) 0) {
1901 /*
1902 * Straight up replace any allocation record that was there. We don't want to do the work
1903 * to preserve the allocation entries that were there, because we only record a subset of the
1904 * allocations anyways.
1905 */
1906
1907 z_alloc_collisions++;
1908
1909 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
1910 /* Knock off old allocation's size, not the new allocation */
1911 associated_trace->zt_size -= allocation->za_size;
1912 } else if (allocation->za_trace_index != 0) {
1913 /* Slot previously used but not currently in use */
1914 z_alloc_overwrites++;
1915 }
1916
1917 allocation->za_element = addr;
1918 allocation->za_trace_index = trace_index;
1919 allocation->za_size = allocation_size;
1920
1921 z_alloc_recorded++;
1922
1923 if (top_ztrace->zt_size < trace->zt_size)
1924 top_ztrace = trace;
1925
1926 lck_spin_unlock(&zleak_lock);
1927 return TRUE;
1928 }
1929
1930 /*
1931 * Free the allocation record and release the stacktrace.
1932 * This should be as fast as possible because it will be called for every free.
1933 */
1934 static void
1935 zleak_free(uintptr_t addr,
1936 vm_size_t allocation_size)
1937 {
1938 if (addr == (uintptr_t) 0)
1939 return;
1940
1941 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1942
1943 /* Double-checked locking: check to find out if we're interested, lock, check to make
1944 * sure it hasn't changed, then modify it, and release the lock.
1945 */
1946
1947 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
1948 /* if the allocation was the one, grab the lock, check again, then delete it */
1949 lck_spin_lock(&zleak_lock);
1950
1951 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
1952 struct ztrace *trace;
1953
1954 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
1955 if (allocation->za_size != allocation_size) {
1956 				panic("Freeing memory of size %lu that was allocated with size %lu\n",
1957 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
1958 }
1959
1960 trace = &ztraces[allocation->za_trace_index];
1961
1962 /* size of 0 indicates trace bucket is unused */
1963 if (trace->zt_size > 0) {
1964 trace->zt_size -= allocation_size;
1965 }
1966
1967 /* A NULL element means the allocation bucket is unused */
1968 allocation->za_element = 0;
1969 }
1970 lck_spin_unlock(&zleak_lock);
1971 }
1972 }
1973
1974 #endif /* CONFIG_ZLEAKS */
1975
1976 /* These functions live outside of CONFIG_ZLEAKS because they are also used in
1977  * mbuf.c for mbuf leak detection, which is why they lack the z_ prefix.
1978 */
1979
1980 /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
1981 uintptr_t
1982 hash_mix(uintptr_t x)
1983 {
1984 #ifndef __LP64__
1985 x += ~(x << 15);
1986 x ^= (x >> 10);
1987 x += (x << 3 );
1988 x ^= (x >> 6 );
1989 x += ~(x << 11);
1990 x ^= (x >> 16);
1991 #else
1992 x += ~(x << 32);
1993 x ^= (x >> 22);
1994 x += ~(x << 13);
1995 x ^= (x >> 8 );
1996 x += (x << 3 );
1997 x ^= (x >> 15);
1998 x += ~(x << 27);
1999 x ^= (x >> 31);
2000 #endif
2001 return x;
2002 }
2003
2004 uint32_t
2005 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2006 {
2007
2008 uintptr_t hash = 0;
2009 uintptr_t mask = max_size - 1;
2010
2011 while (depth) {
2012 hash += bt[--depth];
2013 }
2014
2015 hash = hash_mix(hash) & mask;
2016
2017 assert(hash < max_size);
2018
2019 return (uint32_t) hash;
2020 }
2021
2022 /*
2023 * TODO: Determine how well distributed this is
2024  * max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0x0FFFF, which makes an ideal bitmask
2025 */
2026 uint32_t
2027 hashaddr(uintptr_t pt, uint32_t max_size)
2028 {
2029 uintptr_t hash = 0;
2030 uintptr_t mask = max_size - 1;
2031
2032 hash = hash_mix(pt) & mask;
2033
2034 assert(hash < max_size);
2035
2036 return (uint32_t) hash;
2037 }
2038
2039 /* End of all leak-detection code */
2040 #pragma mark -
2041
2042 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
2043 #define ZONE_ALLOC_FRAG_PERCENT(alloc_size, ele_size) (((alloc_size % ele_size) * 100) / alloc_size)
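/*
 * Worked example of the macro above, assuming a hypothetical 1424-byte element and 4K pages:
 *   ZONE_ALLOC_FRAG_PERCENT(4096, 1424)  = ((4096 % 1424)  * 100) / 4096  = (1248 * 100) / 4096  = 30
 *   ZONE_ALLOC_FRAG_PERCENT(12288, 1424) = ((12288 % 1424) * 100) / 12288 = ( 896 * 100) / 12288 = 7
 * The loop in zinit() below walks candidate allocation sizes up to ZONE_MAX_ALLOC_SIZE and keeps
 * whichever yields the lowest percentage.
 */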
2044
2045 /* Used to manage copying in of new zone names */
2046 static vm_offset_t zone_names_start;
2047 static vm_offset_t zone_names_next;
2048
2049 static vm_size_t
2050 compute_element_size(vm_size_t requested_size)
2051 {
2052 vm_size_t element_size = requested_size;
2053
2054 /* Zone elements must fit both a next pointer and a backup pointer */
2055 vm_size_t minimum_element_size = sizeof(vm_offset_t) * 2;
2056 if (element_size < minimum_element_size)
2057 element_size = minimum_element_size;
2058
2059 /*
2060 * Round element size to a multiple of sizeof(pointer)
2061 * This also enforces that allocations will be aligned on pointer boundaries
2062 */
2063 element_size = ((element_size-1) + sizeof(vm_offset_t)) -
2064 ((element_size-1) % sizeof(vm_offset_t));
2065
2066 return element_size;
2067 }
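/*
 * Worked examples for compute_element_size() on LP64, where sizeof(vm_offset_t) == 8:
 *   a 10-byte request is bumped to the 16-byte minimum (room for the next and backup pointers);
 *   a 20-byte request rounds up to ((20 - 1) + 8) - ((20 - 1) % 8) = 27 - 3 = 24 bytes,
 *   i.e. the next multiple of the pointer size.
 */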
2068
2069 /*
2070 * zinit initializes a new zone. The zone data structures themselves
2071 * are stored in a zone, which is initially a static structure that
2072 * is initialized by zone_init.
2073 */
2074
2075 zone_t
2076 zinit(
2077 vm_size_t size, /* the size of an element */
2078 vm_size_t max, /* maximum memory to use */
2079 vm_size_t alloc, /* allocation size */
2080 const char *name) /* a name for the zone */
2081 {
2082 zone_t z;
2083
2084 size = compute_element_size(size);
2085
2086 simple_lock(&all_zones_lock);
2087
2088 assert(num_zones < MAX_ZONES);
2089 assert(num_zones_in_use <= num_zones);
2090
2091 /* If possible, find a previously zdestroy'ed zone in the zone_array that we can reuse instead of initializing a new zone. */
2092 for (int index = bitmap_first(zone_empty_bitmap, MAX_ZONES);
2093 index >= 0 && index < (int)num_zones;
2094 index = bitmap_next(zone_empty_bitmap, index)) {
2095 z = &(zone_array[index]);
2096
2097 /*
2098 * If the zone name and the element size are the same, we can just reuse the old zone struct.
2099 * Otherwise hand out a new zone from the zone_array.
2100 */
2101 if (!strcmp(z->zone_name, name)) {
2102 vm_size_t old_size = z->elem_size;
2103 #if KASAN_ZALLOC
2104 old_size -= z->kasan_redzone * 2;
2105 #endif
2106 if (old_size == size) {
2107 /* Clear the empty bit for this zone, increment num_zones_in_use, and mark the zone as valid again. */
2108 bitmap_clear(zone_empty_bitmap, index);
2109 num_zones_in_use++;
2110 z->zone_valid = TRUE;
2111
2112 /* All other state is already set up since the zone was previously in use. Return early. */
2113 simple_unlock(&all_zones_lock);
2114 return (z);
2115 }
2116 }
2117 }
2118
2119 /* If we're here, it means we didn't find a zone above that we could simply reuse. Set up a new zone. */
2120
2121 /* Clear the empty bit for the new zone */
2122 bitmap_clear(zone_empty_bitmap, num_zones);
2123
2124 z = &(zone_array[num_zones]);
2125 z->index = num_zones;
2126
2127 num_zones++;
2128 num_zones_in_use++;
2129
2130 /*
2131 * Initialize the zone lock here before dropping the all_zones_lock. Otherwise we could race with
2132 * zalloc_async() and try to grab the zone lock before it has been initialized, causing a panic.
2133 */
2134 lock_zone_init(z);
2135
2136 simple_unlock(&all_zones_lock);
2137
2138 #if KASAN_ZALLOC
2139 /* Expand the zone allocation size to include the redzones. For page-multiple
2140 * zones add a full guard page because they likely require alignment. kalloc
2141 	 * and fakestack handle their own KASan state, so ignore those zones. */
2142 /* XXX: remove this when zinit_with_options() is a thing */
2143 const char *kalloc_name = "kalloc.";
2144 const char *fakestack_name = "fakestack.";
2145 if (strncmp(name, kalloc_name, strlen(kalloc_name)) == 0) {
2146 z->kasan_redzone = 0;
2147 } else if (strncmp(name, fakestack_name, strlen(fakestack_name)) == 0) {
2148 z->kasan_redzone = 0;
2149 } else {
2150 if ((size % PAGE_SIZE) != 0) {
2151 z->kasan_redzone = KASAN_GUARD_SIZE;
2152 } else {
2153 z->kasan_redzone = PAGE_SIZE;
2154 }
2155 max = (max / size) * (size + z->kasan_redzone * 2);
2156 size += z->kasan_redzone * 2;
2157 }
2158 #endif
2159
2160 max = round_page(max);
2161
2162 vm_size_t best_alloc = PAGE_SIZE;
2163
2164 if ((size % PAGE_SIZE) == 0) {
2165 /* zero fragmentation by definition */
2166 best_alloc = size;
2167 } else {
2168 vm_size_t alloc_size;
2169 for (alloc_size = (2 * PAGE_SIZE); alloc_size <= ZONE_MAX_ALLOC_SIZE; alloc_size += PAGE_SIZE) {
2170 if (ZONE_ALLOC_FRAG_PERCENT(alloc_size, size) < ZONE_ALLOC_FRAG_PERCENT(best_alloc, size)) {
2171 best_alloc = alloc_size;
2172 }
2173 }
2174 }
2175
2176 alloc = best_alloc;
2177 if (max && (max < alloc))
2178 max = alloc;
2179
2180 z->free_elements = NULL;
2181 queue_init(&z->pages.any_free_foreign);
2182 queue_init(&z->pages.all_free);
2183 queue_init(&z->pages.intermediate);
2184 queue_init(&z->pages.all_used);
2185 z->cur_size = 0;
2186 z->page_count = 0;
2187 z->max_size = max;
2188 z->elem_size = size;
2189 z->alloc_size = alloc;
2190 z->count = 0;
2191 z->countfree = 0;
2192 z->count_all_free_pages = 0;
2193 z->sum_count = 0LL;
2194 z->doing_alloc_without_vm_priv = FALSE;
2195 z->doing_alloc_with_vm_priv = FALSE;
2196 z->exhaustible = FALSE;
2197 z->collectable = TRUE;
2198 z->allows_foreign = FALSE;
2199 z->expandable = TRUE;
2200 z->waiting = FALSE;
2201 z->async_pending = FALSE;
2202 z->caller_acct = TRUE;
2203 z->noencrypt = FALSE;
2204 z->no_callout = FALSE;
2205 z->async_prio_refill = FALSE;
2206 z->gzalloc_exempt = FALSE;
2207 z->alignment_required = FALSE;
2208 z->zone_replenishing = FALSE;
2209 z->prio_refill_watermark = 0;
2210 z->zone_replenish_thread = NULL;
2211 z->zp_count = 0;
2212 z->kasan_quarantine = TRUE;
2213 z->zone_valid = TRUE;
2214
2215 #if CONFIG_ZLEAKS
2216 z->zleak_capture = 0;
2217 z->zleak_on = FALSE;
2218 #endif /* CONFIG_ZLEAKS */
2219
2220 /*
2221 * If the VM is ready to handle kmem_alloc requests, copy the zone name passed in.
2222 *
2223 	 * Otherwise, simply maintain a pointer to the name string. The only zones we'll actually have
2224 	 * to do this for would be the VM-related zones that are created very early on, before any
2225 	 * kexts can be loaded (or unloaded). So we should be fine with just a pointer in this case.
2226 */
2227 if (kmem_alloc_ready) {
2228 size_t len = MIN(strlen(name)+1, MACH_ZONE_NAME_MAX_LEN);
2229
2230 if (zone_names_start == 0 || ((zone_names_next - zone_names_start) + len) > PAGE_SIZE) {
2231 printf("zalloc: allocating memory for zone names buffer\n");
2232 kern_return_t retval = kmem_alloc_kobject(kernel_map, &zone_names_start,
2233 PAGE_SIZE, VM_KERN_MEMORY_OSFMK);
2234 if (retval != KERN_SUCCESS) {
2235 panic("zalloc: zone_names memory allocation failed");
2236 }
2237 bzero((char *)zone_names_start, PAGE_SIZE);
2238 zone_names_next = zone_names_start;
2239 }
2240
2241 strlcpy((char *)zone_names_next, name, len);
2242 z->zone_name = (char *)zone_names_next;
2243 zone_names_next += len;
2244 } else {
2245 z->zone_name = name;
2246 }
2247
2248 /*
2249 	 * Check for and set up zone leak detection if requested via boot-args.  We recognize two
2250 * boot-args:
2251 *
2252 * zlog=<zone_to_log>
2253 * zrecs=<num_records_in_log>
2254 *
2255 * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
2256 * control the size of the log. If zrecs is not specified, a default value is used.
2257 */
2258
2259 if (num_zones_logged < max_num_zones_to_log) {
2260
2261 int i = 1; /* zlog0 isn't allowed. */
2262 boolean_t zone_logging_enabled = FALSE;
2263 char zlog_name[MAX_ZONE_NAME] = ""; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2264
2265 while (i <= max_num_zones_to_log) {
2266
2267 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2268
2269 if (PE_parse_boot_argn(zlog_name, zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2270 if (track_this_zone(z->zone_name, zone_name_to_log)) {
2271 if (z->zone_valid) {
2272 z->zone_logging = TRUE;
2273 zone_logging_enabled = TRUE;
2274 num_zones_logged++;
2275 break;
2276 }
2277 }
2278 }
2279 i++;
2280 }
2281
2282 if (zone_logging_enabled == FALSE) {
2283 /*
2284 * Backwards compat. with the old boot-arg used to specify single zone logging i.e. zlog
2285 * Needs to happen after the newer zlogn checks because the prefix will match all the zlogn
2286 * boot-args.
2287 */
2288 if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2289 if (track_this_zone(z->zone_name, zone_name_to_log)) {
2290 if (z->zone_valid) {
2291 z->zone_logging = TRUE;
2292 zone_logging_enabled = TRUE;
2293 num_zones_logged++;
2294 }
2295 }
2296 }
2297 }
2298
2299 if (log_records_init == FALSE && zone_logging_enabled == TRUE) {
2300 if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {
2301 /*
2302 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2303 * This prevents accidentally hogging too much kernel memory and making the system
2304 * unusable.
2305 */
2306
2307 log_records = MIN(ZRECORDS_MAX, log_records);
2308 log_records_init = TRUE;
2309 } else {
2310 log_records = ZRECORDS_DEFAULT;
2311 log_records_init = TRUE;
2312 }
2313 }
2314
2315 /*
2316 * If we want to log a zone, see if we need to allocate buffer space for the log. Some vm related zones are
2317 * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case. kmem_alloc_ready is set to
2318 * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work. If we want to log one
2319 * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
2320 * later on some other zone. So note we may be allocating a buffer to log a zone other than the one being initialized
2321 * right now.
2322 */
2323 if (kmem_alloc_ready) {
2324
2325 zone_t curr_zone = NULL;
2326 unsigned int max_zones = 0, zone_idx = 0;
2327
2328 simple_lock(&all_zones_lock);
2329 max_zones = num_zones;
2330 simple_unlock(&all_zones_lock);
2331
2332 for (zone_idx = 0; zone_idx < max_zones; zone_idx++) {
2333
2334 curr_zone = &(zone_array[zone_idx]);
2335
2336 if (!curr_zone->zone_valid) {
2337 continue;
2338 }
2339
2340 /*
2341 * We work with the zone unlocked here because we could end up needing the zone lock to
2342 			 * enable logging for this zone, e.g. we need a VM object to allocate memory to enable logging for the
2343 			 * VM objects zone.
2344 			 *
2345 			 * We don't expect these zones to be needed this early in boot, so we take this chance.
2346 */
2347 if (curr_zone->zone_logging && curr_zone->zlog_btlog == NULL) {
2348
2349 curr_zone->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
2350
2351 if (curr_zone->zlog_btlog) {
2352
2353 printf("zone: logging started for zone %s\n", curr_zone->zone_name);
2354 } else {
2355 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2356 curr_zone->zone_logging = FALSE;
2357 }
2358 }
2359
2360 }
2361 }
2362 }
2363
2364 #if CONFIG_GZALLOC
2365 gzalloc_zone_init(z);
2366 #endif
2367
2368 return(z);
2369 }
2370 unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count;
2371
2372 static void zone_replenish_thread(zone_t);
2373
2374 /* High priority VM privileged thread used to asynchronously refill a designated
2375 * zone, such as the reserved VM map entry zone.
2376 */
2377 __attribute__((noreturn))
2378 static void
2379 zone_replenish_thread(zone_t z)
2380 {
2381 vm_size_t free_size;
2382 current_thread()->options |= TH_OPT_VMPRIV;
2383
2384 for (;;) {
2385 lock_zone(z);
2386 assert(z->zone_valid);
2387 z->zone_replenishing = TRUE;
2388 assert(z->prio_refill_watermark != 0);
2389 while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) {
2390 assert(z->doing_alloc_without_vm_priv == FALSE);
2391 assert(z->doing_alloc_with_vm_priv == FALSE);
2392 assert(z->async_prio_refill == TRUE);
2393
2394 unlock_zone(z);
2395 int zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
2396 vm_offset_t space, alloc_size;
2397 kern_return_t kr;
2398
2399 if (vm_pool_low())
2400 alloc_size = round_page(z->elem_size);
2401 else
2402 alloc_size = z->alloc_size;
2403
2404 if (z->noencrypt)
2405 zflags |= KMA_NOENCRYPT;
2406
2407 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
2408 if (is_zone_map_nearing_exhaustion()) {
2409 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2410 }
2411
2412 kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2413
2414 if (kr == KERN_SUCCESS) {
2415 zcram(z, space, alloc_size);
2416 } else if (kr == KERN_RESOURCE_SHORTAGE) {
2417 VM_PAGE_WAIT();
2418 } else if (kr == KERN_NO_SPACE) {
2419 kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2420 if (kr == KERN_SUCCESS) {
2421 zcram(z, space, alloc_size);
2422 } else {
2423 assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
2424 thread_block(THREAD_CONTINUE_NULL);
2425 }
2426 }
2427
2428 lock_zone(z);
2429 assert(z->zone_valid);
2430 zone_replenish_loops++;
2431 }
2432
2433 z->zone_replenishing = FALSE;
2434 /* Signal any potential throttled consumers, terminating
2435 * their timer-bounded waits.
2436 */
2437 thread_wakeup(z);
2438
2439 assert_wait(&z->zone_replenish_thread, THREAD_UNINT);
2440 unlock_zone(z);
2441 thread_block(THREAD_CONTINUE_NULL);
2442 zone_replenish_wakeups++;
2443 }
2444 }
2445
2446 void
2447 zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark) {
2448 z->prio_refill_watermark = low_water_mark;
2449
2450 z->async_prio_refill = TRUE;
2451 OSMemoryBarrier();
2452 kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread);
2453
2454 if (tres != KERN_SUCCESS) {
2455 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
2456 }
2457
2458 thread_deallocate(z->zone_replenish_thread);
2459 }
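/*
 * A minimal usage sketch (kept under #if 0, never compiled), assuming a hypothetical zone
 * "my.reserved.zone" whose element type my_elem_t does not exist in this file.  The
 * watermark is a count of elements, not bytes: the replenish thread refills the zone
 * whenever fewer than that many elements are free, as is done for the reserved VM map
 * entry zone mentioned above.
 */
#if 0
	zone_t my_reserved_zone = zinit(sizeof(my_elem_t),
	                                1024 * sizeof(my_elem_t),	/* max memory */
	                                PAGE_SIZE,			/* allocation size */
	                                "my.reserved.zone");
	zone_prio_refill_configure(my_reserved_zone, 64);		/* refill below 64 free elements */
#endif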
2460
2461 void
2462 zdestroy(zone_t z)
2463 {
2464 unsigned int zindex;
2465
2466 assert(z != NULL);
2467
2468 lock_zone(z);
2469 assert(z->zone_valid);
2470
2471 /* Assert that the zone does not have any allocations in flight */
2472 assert(z->doing_alloc_without_vm_priv == FALSE);
2473 assert(z->doing_alloc_with_vm_priv == FALSE);
2474 assert(z->async_pending == FALSE);
2475 assert(z->waiting == FALSE);
2476 assert(z->async_prio_refill == FALSE);
2477
2478 #if !KASAN_ZALLOC
2479 /*
2480 * Unset the valid bit. We'll hit an assert failure on further operations on this zone, until zinit() is called again.
2481 * Leave the zone valid for KASan as we will see zfree's on quarantined free elements even after the zone is destroyed.
2482 */
2483 z->zone_valid = FALSE;
2484 #endif
2485 unlock_zone(z);
2486
2487 /* Dump all the free elements */
2488 drop_free_elements(z);
2489
2490 #if CONFIG_GZALLOC
2491 /* If the zone is gzalloc managed dump all the elements in the free cache */
2492 gzalloc_empty_free_cache(z);
2493 #endif
2494
2495 lock_zone(z);
2496
2497 #if !KASAN_ZALLOC
2498 /* Assert that all counts are zero */
2499 assert(z->count == 0);
2500 assert(z->countfree == 0);
2501 assert(z->cur_size == 0);
2502 assert(z->page_count == 0);
2503 assert(z->count_all_free_pages == 0);
2504
2505 /* Assert that all queues except the foreign queue are empty. The zone allocator doesn't know how to free up foreign memory. */
2506 assert(queue_empty(&z->pages.all_used));
2507 assert(queue_empty(&z->pages.intermediate));
2508 assert(queue_empty(&z->pages.all_free));
2509 #endif
2510
2511 zindex = z->index;
2512
2513 unlock_zone(z);
2514
2515 simple_lock(&all_zones_lock);
2516
2517 assert(!bitmap_test(zone_empty_bitmap, zindex));
2518 /* Mark the zone as empty in the bitmap */
2519 bitmap_set(zone_empty_bitmap, zindex);
2520 num_zones_in_use--;
2521 assert(num_zones_in_use > 0);
2522
2523 simple_unlock(&all_zones_lock);
2524 }
2525
2526 /* Initialize the metadata for an allocation chunk */
2527 static inline void
2528 zcram_metadata_init(vm_offset_t newmem, vm_size_t size, struct zone_page_metadata *chunk_metadata)
2529 {
2530 struct zone_page_metadata *page_metadata;
2531
2532 /* The first page is the real metadata for this allocation chunk. We mark the others as fake metadata */
2533 size -= PAGE_SIZE;
2534 newmem += PAGE_SIZE;
2535
2536 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2537 page_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
2538 assert(page_metadata != chunk_metadata);
2539 PAGE_METADATA_SET_ZINDEX(page_metadata, MULTIPAGE_METADATA_MAGIC);
2540 page_metadata_set_realmeta(page_metadata, chunk_metadata);
2541 page_metadata->free_count = 0;
2542 }
2543 return;
2544 }
2545
2546
2547 /*
2548 * Boolean Random Number Generator for generating booleans to randomize
2549 * the order of elements in newly zcram()'ed memory. The algorithm is a
2550 * modified version of the KISS RNG proposed in the paper:
2551 * http://stat.fsu.edu/techreports/M802.pdf
2552 * The modifications have been documented in the technical paper
2553  * from UCL:
2554 * http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf
2555 */
2556
2557 static void random_bool_gen_entropy(
2558 int *buffer,
2559 int count)
2560 {
2561
2562 int i, t;
2563 simple_lock(&bool_gen_lock);
2564 for (i = 0; i < count; i++) {
2565 bool_gen_seed[1] ^= (bool_gen_seed[1] << 5);
2566 bool_gen_seed[1] ^= (bool_gen_seed[1] >> 7);
2567 bool_gen_seed[1] ^= (bool_gen_seed[1] << 22);
2568 t = bool_gen_seed[2] + bool_gen_seed[3] + bool_gen_global;
2569 bool_gen_seed[2] = bool_gen_seed[3];
2570 bool_gen_global = t < 0;
2571 		bool_gen_seed[3] = t & 2147483647;
2572 bool_gen_seed[0] += 1411392427;
2573 buffer[i] = (bool_gen_seed[0] + bool_gen_seed[1] + bool_gen_seed[3]);
2574 }
2575 simple_unlock(&bool_gen_lock);
2576 }
2577
2578 static boolean_t random_bool_gen(
2579 int *buffer,
2580 int index,
2581 int bufsize)
2582 {
2583 int valindex, bitpos;
2584 valindex = (index / (8 * sizeof(int))) % bufsize;
2585 bitpos = index % (8 * sizeof(int));
2586 return (boolean_t)(buffer[valindex] & (1 << bitpos));
2587 }
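/*
 * Worked example: with 32-bit ints and index == 37, valindex == (37 / 32) % bufsize == 1
 * (for any bufsize > 1) and bitpos == 37 % 32 == 5, so the call returns bit 5 of buffer[1].
 */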
2588
2589 static void
2590 random_free_to_zone(
2591 zone_t zone,
2592 vm_offset_t newmem,
2593 vm_offset_t first_element_offset,
2594 int element_count,
2595 int *entropy_buffer)
2596 {
2597 vm_offset_t last_element_offset;
2598 vm_offset_t element_addr;
2599 vm_size_t elem_size;
2600 int index;
2601
2602 assert(element_count <= ZONE_CHUNK_MAXELEMENTS);
2603 elem_size = zone->elem_size;
2604 last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size);
2605 for (index = 0; index < element_count; index++) {
2606 assert(first_element_offset <= last_element_offset);
2607 if (
2608 #if DEBUG || DEVELOPMENT
2609 leak_scan_debug_flag || __improbable(zone->tags) ||
2610 #endif /* DEBUG || DEVELOPMENT */
2611 random_bool_gen(entropy_buffer, index, MAX_ENTROPY_PER_ZCRAM)) {
2612 element_addr = newmem + first_element_offset;
2613 first_element_offset += elem_size;
2614 } else {
2615 element_addr = newmem + last_element_offset;
2616 last_element_offset -= elem_size;
2617 }
2618 if (element_addr != (vm_offset_t)zone) {
2619 zone->count++; /* compensate for free_to_zone */
2620 free_to_zone(zone, element_addr, FALSE);
2621 }
2622 zone->cur_size += elem_size;
2623 }
2624 }
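/*
 * Worked example: with four elements and random bits 1, 0, 1, 0, the loop above hands
 * elements to free_to_zone() at offsets 0, 3e, 1e and 2e (e == elem_size) from
 * first_element_offset, so the zone freelist is built in a shuffled order rather than
 * in strictly ascending addresses.
 */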
2625
2626 /*
2627 * Cram the given memory into the specified zone. Update the zone page count accordingly.
2628 */
2629 void
2630 zcram(
2631 zone_t zone,
2632 vm_offset_t newmem,
2633 vm_size_t size)
2634 {
2635 vm_size_t elem_size;
2636 boolean_t from_zm = FALSE;
2637 int element_count;
2638 int entropy_buffer[MAX_ENTROPY_PER_ZCRAM];
2639
2640 /* Basic sanity checks */
2641 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
2642 assert(!zone->collectable || zone->allows_foreign
2643 || (from_zone_map(newmem, size)));
2644
2645 elem_size = zone->elem_size;
2646
2647 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START, zone->index, size);
2648
2649 if (from_zone_map(newmem, size))
2650 from_zm = TRUE;
2651
2652 if (!from_zm) {
2653 /* We cannot support elements larger than page size for foreign memory because we
2654 * put metadata on the page itself for each page of foreign memory. We need to do
2655 * this in order to be able to reach the metadata when any element is freed
2656 */
2657 assert((zone->allows_foreign == TRUE) && (zone->elem_size <= (PAGE_SIZE - sizeof(struct zone_page_metadata))));
2658 }
2659
2660 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM)
2661 kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name,
2662 (unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size);
2663
2664 ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE));
2665
2666 random_bool_gen_entropy(entropy_buffer, MAX_ENTROPY_PER_ZCRAM);
2667
2668 /*
2669 	 * Initialize the metadata for all pages. We don't need the zone lock
2670 	 * here because we are not manipulating any zone-related state yet.
2671 */
2672
2673 struct zone_page_metadata *chunk_metadata;
2674 size_t zone_page_metadata_size = sizeof(struct zone_page_metadata);
2675
2676 assert((newmem & PAGE_MASK) == 0);
2677 assert((size & PAGE_MASK) == 0);
2678
2679 chunk_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
2680 chunk_metadata->pages.next = NULL;
2681 chunk_metadata->pages.prev = NULL;
2682 page_metadata_set_freelist(chunk_metadata, 0);
2683 PAGE_METADATA_SET_ZINDEX(chunk_metadata, zone->index);
2684 chunk_metadata->free_count = 0;
2685 assert((size / PAGE_SIZE) <= ZONE_CHUNK_MAXPAGES);
2686 chunk_metadata->page_count = (unsigned)(size / PAGE_SIZE);
2687
2688 zcram_metadata_init(newmem, size, chunk_metadata);
2689
2690 #if VM_MAX_TAG_ZONES
2691 if (__improbable(zone->tags)) {
2692 assert(from_zm);
2693 ztMemoryAdd(zone, newmem, size);
2694 }
2695 #endif /* VM_MAX_TAG_ZONES */
2696
2697 lock_zone(zone);
2698 assert(zone->zone_valid);
2699 enqueue_tail(&zone->pages.all_used, &(chunk_metadata->pages));
2700
2701 if (!from_zm) {
2702 /* We cannot support elements larger than page size for foreign memory because we
2703 * put metadata on the page itself for each page of foreign memory. We need to do
2704 * this in order to be able to reach the metadata when any element is freed
2705 */
2706
2707 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2708 vm_offset_t first_element_offset = 0;
2709 if (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT == 0){
2710 first_element_offset = zone_page_metadata_size;
2711 } else {
2712 first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT));
2713 }
2714 element_count = (int)((PAGE_SIZE - first_element_offset) / elem_size);
2715 random_free_to_zone(zone, newmem, first_element_offset, element_count, entropy_buffer);
2716 }
2717 } else {
2718 element_count = (int)(size / elem_size);
2719 random_free_to_zone(zone, newmem, 0, element_count, entropy_buffer);
2720 }
2721 unlock_zone(zone);
2722
2723 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zone->index);
2724
2725 }
2726
2727 /*
2728 * Fill a zone with enough memory to contain at least nelem elements.
2729 * Return the number of elements actually put into the zone, which may
2730 * be more than the caller asked for since the memory allocation is
2731 * rounded up to the next zone allocation size.
2732 */
2733 int
2734 zfill(
2735 zone_t zone,
2736 int nelem)
2737 {
2738 kern_return_t kr;
2739 vm_offset_t memory;
2740
2741 vm_size_t alloc_size = zone->alloc_size;
2742 vm_size_t elem_per_alloc = alloc_size / zone->elem_size;
2743 vm_size_t nalloc = (nelem + elem_per_alloc - 1) / elem_per_alloc;
2744
2745 /* Don't mix-and-match zfill with foreign memory */
2746 assert(!zone->allows_foreign);
2747
2748 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
2749 if (is_zone_map_nearing_exhaustion()) {
2750 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2751 }
2752
2753 kr = kernel_memory_allocate(zone_map, &memory, nalloc * alloc_size, 0, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
2754 if (kr != KERN_SUCCESS) {
2755 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
2756 __func__, (unsigned long)(nalloc * alloc_size));
2757 return 0;
2758 }
2759
2760 for (vm_size_t i = 0; i < nalloc; i++) {
2761 zcram(zone, memory + i * alloc_size, alloc_size);
2762 }
2763
2764 return (int)(nalloc * elem_per_alloc);
2765 }
2766
2767 /*
2768 * Initialize the "zone of zones" which uses fixed memory allocated
2769 * earlier in memory initialization. zone_bootstrap is called
2770 * before zone_init.
2771 */
2772 void
2773 zone_bootstrap(void)
2774 {
2775 char temp_buf[16];
2776 unsigned int i;
2777
2778 if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug)))
2779 zalloc_debug = 0;
2780
2781 /* Set up zone element poisoning */
2782 zp_init();
2783
2784 /* Seed the random boolean generator for elements in zone free list */
2785 for (i = 0; i < RANDOM_BOOL_GEN_SEED_COUNT; i++) {
2786 bool_gen_seed[i] = (unsigned int)early_random();
2787 }
2788 simple_lock_init(&bool_gen_lock, 0);
2789
2790 /* should zlog log to debug zone corruption instead of leaks? */
2791 if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
2792 corruption_debug_flag = TRUE;
2793 }
2794
2795 #if DEBUG || DEVELOPMENT
2796 #if VM_MAX_TAG_ZONES
2797 	/* enable tags for zones that ask for it */
2798 if (PE_parse_boot_argn("-zt", temp_buf, sizeof(temp_buf))) {
2799 zone_tagging_on = TRUE;
2800 }
2801 #endif /* VM_MAX_TAG_ZONES */
2802 /* disable element location randomization in a page */
2803 if (PE_parse_boot_argn("-zl", temp_buf, sizeof(temp_buf))) {
2804 leak_scan_debug_flag = TRUE;
2805 }
2806 #endif
2807
2808 simple_lock_init(&all_zones_lock, 0);
2809
2810 num_zones_in_use = 0;
2811 num_zones = 0;
2812 /* Mark all zones as empty */
2813 bitmap_full(zone_empty_bitmap, BITMAP_LEN(MAX_ZONES));
2814 zone_names_next = zone_names_start = 0;
2815
2816 #if DEBUG || DEVELOPMENT
2817 simple_lock_init(&zone_test_lock, 0);
2818 #endif /* DEBUG || DEVELOPMENT */
2819
2820 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
2821
2822 /* initializing global lock group for zones */
2823 lck_grp_attr_setdefault(&zone_locks_grp_attr);
2824 lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr);
2825
2826 lck_attr_setdefault(&zone_metadata_lock_attr);
2827 lck_mtx_init_ext(&zone_metadata_region_lck, &zone_metadata_region_lck_ext, &zone_locks_grp, &zone_metadata_lock_attr);
2828 }
2829
2830 /*
2831 * We're being very conservative here and picking a value of 95%. We might need to lower this if
2832 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
2833 */
2834 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
2835
2836 /*
2837 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
2838 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
2839 */
2840 unsigned int zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
2841
2842 /*
2843 * Returns pid of the task with the largest number of VM map entries.
2844 */
2845 extern pid_t find_largest_process_vm_map_entries(void);
2846
2847 /*
2848 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
2849 * For any other pid we try to kill that process synchronously.
2850 */
2851 boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
2852
2853 void get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
2854 {
2855 *current_size = zone_map->size;
2856 *capacity = vm_map_max(zone_map) - vm_map_min(zone_map);
2857 }
2858
2859 void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
2860 {
2861 zone_t largest_zone = zone_find_largest();
2862 strlcpy(zone_name, largest_zone->zone_name, zone_name_len);
2863 *zone_size = largest_zone->cur_size;
2864 }
2865
2866 boolean_t is_zone_map_nearing_exhaustion(void)
2867 {
2868 uint64_t size = zone_map->size;
2869 uint64_t capacity = vm_map_max(zone_map) - vm_map_min(zone_map);
2870 if (size > ((capacity * zone_map_jetsam_limit) / 100)) {
2871 return TRUE;
2872 }
2873 return FALSE;
2874 }
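/*
 * Worked example with a hypothetical 1 GB zone map and the default 95% limit:
 * is_zone_map_nearing_exhaustion() returns TRUE once zone_map->size exceeds
 * (1073741824 * 95) / 100 bytes, i.e. roughly 972 MB of the map is in use.
 */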
2875
2876 extern zone_t vm_map_entry_zone;
2877 extern zone_t vm_object_zone;
2878
2879 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
2880
2881 /*
2882 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
2883 * to walk through the jetsam priority bands and kill processes.
2884 */
2885 static void kill_process_in_largest_zone(void)
2886 {
2887 pid_t pid = -1;
2888 zone_t largest_zone = zone_find_largest();
2889
2890 printf("zone_map_exhaustion: Zone map size %lld, capacity %lld [jetsam limit %d%%]\n", (uint64_t)zone_map->size,
2891 (uint64_t)(vm_map_max(zone_map) - vm_map_min(zone_map)), zone_map_jetsam_limit);
2892 printf("zone_map_exhaustion: Largest zone %s, size %lu\n", largest_zone->zone_name, (uintptr_t)largest_zone->cur_size);
2893
2894 /*
2895 	 * We want to make sure we don't call this function from a user task's context, or we could end up trying to synchronously
2896 	 * kill the process whose context we're in, causing the system to hang.
2897 */
2898 assert(current_task() == kernel_task);
2899
2900 /*
2901 * If vm_object_zone is the largest, check to see if the number of elements in vm_map_entry_zone is comparable. If so, consider
2902 * vm_map_entry_zone as the largest. This lets us target a specific process to jetsam to quickly recover from the zone map bloat.
2903 */
2904 if (largest_zone == vm_object_zone) {
2905 int vm_object_zone_count = vm_object_zone->count;
2906 int vm_map_entry_zone_count = vm_map_entry_zone->count;
2907 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
2908 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
2909 largest_zone = vm_map_entry_zone;
2910 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n", (uintptr_t)largest_zone->cur_size);
2911 }
2912 }
2913
2914 /* TODO: Extend this to check for the largest process in other zones as well. */
2915 if (largest_zone == vm_map_entry_zone) {
2916 pid = find_largest_process_vm_map_entries();
2917 } else {
2918 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s]. Waking up memorystatus thread.\n", largest_zone->zone_name);
2919 }
2920 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
2921 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
2922 }
2923 }
2924
2925 /* Global initialization of Zone Allocator.
2926 * Runs after zone_bootstrap.
2927 */
2928 void
2929 zone_init(
2930 vm_size_t max_zonemap_size)
2931 {
2932 kern_return_t retval;
2933 vm_offset_t zone_min;
2934 vm_offset_t zone_max;
2935 vm_offset_t zone_metadata_space;
2936 unsigned int zone_pages;
2937 vm_map_kernel_flags_t vmk_flags;
2938
2939 #if VM_MAX_TAG_ZONES
2940 if (zone_tagging_on) ztInit(max_zonemap_size, &zone_locks_grp);
2941 #endif
2942
2943 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
2944 vmk_flags.vmkf_permanent = TRUE;
2945 retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size,
2946 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_ZONE,
2947 &zone_map);
2948
2949 if (retval != KERN_SUCCESS)
2950 panic("zone_init: kmem_suballoc failed");
2951 zone_max = zone_min + round_page(max_zonemap_size);
2952 #if CONFIG_GZALLOC
2953 gzalloc_init(max_zonemap_size);
2954 #endif
2955 /*
2956 * Setup garbage collection information:
2957 */
2958 zone_map_min_address = zone_min;
2959 zone_map_max_address = zone_max;
2960
2961 zone_pages = (unsigned int)atop_kernel(zone_max - zone_min);
2962 zone_metadata_space = round_page(zone_pages * sizeof(struct zone_page_metadata));
2963 retval = kernel_memory_allocate(zone_map, &zone_metadata_region_min, zone_metadata_space,
2964 0, KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_OSFMK);
2965 if (retval != KERN_SUCCESS)
2966 panic("zone_init: zone_metadata_region initialization failed!");
2967 zone_metadata_region_max = zone_metadata_region_min + zone_metadata_space;
2968
2969 #if defined(__LP64__)
2970 /*
2971 * ensure that any vm_page_t that gets created from
2972 * the vm_page zone can be packed properly (see vm_page.h
2973 	 * for the packing requirements)
2974 */
2975 if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_metadata_region_max))) != (vm_page_t)zone_metadata_region_max)
2976 panic("VM_PAGE_PACK_PTR failed on zone_metadata_region_max - %p", (void *)zone_metadata_region_max);
2977
2978 if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address))) != (vm_page_t)zone_map_max_address)
2979 panic("VM_PAGE_PACK_PTR failed on zone_map_max_address - %p", (void *)zone_map_max_address);
2980 #endif
2981
2982 lck_grp_attr_setdefault(&zone_gc_lck_grp_attr);
2983 lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr);
2984 lck_attr_setdefault(&zone_gc_lck_attr);
2985 lck_mtx_init_ext(&zone_gc_lock, &zone_gc_lck_ext, &zone_gc_lck_grp, &zone_gc_lck_attr);
2986
2987 #if CONFIG_ZLEAKS
2988 /*
2989 * Initialize the zone leak monitor
2990 */
2991 zleak_init(max_zonemap_size);
2992 #endif /* CONFIG_ZLEAKS */
2993
2994 #if VM_MAX_TAG_ZONES
2995 if (zone_tagging_on) vm_allocation_zones_init();
2996 #endif
2997
2998 int jetsam_limit_temp = 0;
2999 if (PE_parse_boot_argn("zone_map_jetsam_limit", &jetsam_limit_temp, sizeof (jetsam_limit_temp)) &&
3000 jetsam_limit_temp > 0 && jetsam_limit_temp <= 100)
3001 zone_map_jetsam_limit = jetsam_limit_temp;
3002 }
3003
3004 extern volatile SInt32 kfree_nop_count;
3005
3006 #pragma mark -
3007 #pragma mark zalloc_canblock
3008
3009 extern boolean_t early_boot_complete;
3010
3011 /*
3012 * zalloc returns an element from the specified zone.
3013 */
3014 static void *
3015 zalloc_internal(
3016 zone_t zone,
3017 boolean_t canblock,
3018 boolean_t nopagewait,
3019 vm_size_t
3020 #if !VM_MAX_TAG_ZONES
3021 __unused
3022 #endif
3023 reqsize,
3024 vm_tag_t tag)
3025 {
3026 vm_offset_t addr = 0;
3027 kern_return_t retval;
3028 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
3029 int numsaved = 0;
3030 boolean_t zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE;
3031 thread_t thr = current_thread();
3032 boolean_t check_poison = FALSE;
3033 boolean_t set_doing_alloc_with_vm_priv = FALSE;
3034
3035 #if CONFIG_ZLEAKS
3036 uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */
3037 #endif /* CONFIG_ZLEAKS */
3038
3039 #if KASAN
3040 /*
3041 * KASan uses zalloc() for fakestack, which can be called anywhere. However,
3042 * we make sure these calls can never block.
3043 */
3044 boolean_t irq_safe = FALSE;
3045 const char *fakestack_name = "fakestack.";
3046 if (strncmp(zone->zone_name, fakestack_name, strlen(fakestack_name)) == 0) {
3047 irq_safe = TRUE;
3048 }
3049 #elif MACH_ASSERT
3050 /* In every other case, zalloc() from interrupt context is unsafe. */
3051 const boolean_t irq_safe = FALSE;
3052 #endif
3053
3054 assert(zone != ZONE_NULL);
3055 assert(irq_safe || ml_get_interrupts_enabled() || ml_is_quiescing() || debug_mode_active() || !early_boot_complete);
3056
3057 #if CONFIG_GZALLOC
3058 addr = gzalloc_alloc(zone, canblock);
3059 #endif
3060 /*
3061 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
3062 */
3063 if (__improbable(DO_LOGGING(zone)))
3064 numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH);
3065
3066 #if CONFIG_ZLEAKS
3067 /*
3068 * Zone leak detection: capture a backtrace every zleak_sample_factor
3069 * allocations in this zone.
3070 */
3071 if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
3072 /* Avoid backtracing twice if zone logging is on */
3073 if (numsaved == 0)
3074 zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH);
3075 else
3076 zleak_tracedepth = numsaved;
3077 }
3078 #endif /* CONFIG_ZLEAKS */
3079
3080 #if VM_MAX_TAG_ZONES
3081 if (__improbable(zone->tags)) vm_tag_will_update_zone(tag, zone->tag_zone_index);
3082 #endif /* VM_MAX_TAG_ZONES */
3083
3084 lock_zone(zone);
3085 assert(zone->zone_valid);
3086
3087 if (zone->async_prio_refill && zone->zone_replenish_thread) {
3088 vm_size_t zfreec = (zone->cur_size - (zone->count * zone->elem_size));
3089 vm_size_t zrefillwm = zone->prio_refill_watermark * zone->elem_size;
3090 zone_replenish_wakeup = (zfreec < zrefillwm);
3091 zone_alloc_throttle = (((zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0)) || (zfreec == 0));
3092
3093 do {
3094 if (zone_replenish_wakeup) {
3095 zone_replenish_wakeups_initiated++;
3096 /* Signal the potentially waiting
3097 * refill thread.
3098 */
3099 thread_wakeup(&zone->zone_replenish_thread);
3100
3101 /* We don't want to wait around for zone_replenish_thread to bump up the free count
3102 * if we're in zone_gc(). This keeps us from deadlocking with zone_replenish_thread.
3103 */
3104 if (thr->options & TH_OPT_ZONE_GC)
3105 break;
3106
3107 unlock_zone(zone);
3108 /* Scheduling latencies etc. may prevent
3109 * the refill thread from keeping up
3110 * with demand. Throttle consumers
3111 * when we fall below half the
3112 * watermark, unless VM privileged
3113 */
3114 if (zone_alloc_throttle) {
3115 zone_replenish_throttle_count++;
3116 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
3117 thread_block(THREAD_CONTINUE_NULL);
3118 }
3119 lock_zone(zone);
3120 assert(zone->zone_valid);
3121 }
3122
3123 zfreec = (zone->cur_size - (zone->count * zone->elem_size));
3124 zrefillwm = zone->prio_refill_watermark * zone->elem_size;
3125 zone_replenish_wakeup = (zfreec < zrefillwm);
3126 zone_alloc_throttle = (((zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0)) || (zfreec == 0));
3127
3128 } while (zone_alloc_throttle == TRUE);
3129 }
3130
3131 if (__probable(addr == 0))
3132 addr = try_alloc_from_zone(zone, tag, &check_poison);
3133
3134 /* If we're here because of zone_gc(), we didn't wait for zone_replenish_thread to finish.
3135 * So we need to ensure that we did successfully grab an element. And we only need to assert
3136 * this for zones that have a replenish thread configured (in this case, the Reserved VM map
3137 * entries zone).
3138 */
3139 if (thr->options & TH_OPT_ZONE_GC && zone->async_prio_refill)
3140 assert(addr != 0);
3141
3142 while ((addr == 0) && canblock) {
3143 /*
3144 * zone is empty, try to expand it
3145 *
3146 		 * Note that we now allow up to 2 threads (1 vm_privileged and 1 non-vm_privileged)
3147 * to expand the zone concurrently... this is necessary to avoid stalling
3148 * vm_privileged threads running critical code necessary to continue compressing/swapping
3149 * pages (i.e. making new free pages) from stalling behind non-vm_privileged threads
3150 * waiting to acquire free pages when the vm_page_free_count is below the
3151 * vm_page_free_reserved limit.
3152 */
3153 if ((zone->doing_alloc_without_vm_priv || zone->doing_alloc_with_vm_priv) &&
3154 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->doing_alloc_with_vm_priv)) {
3155 /*
3156 * This is a non-vm_privileged thread and a non-vm_privileged or
3157 * a vm_privileged thread is already expanding the zone...
3158 * OR
3159 * this is a vm_privileged thread and a vm_privileged thread is
3160 * already expanding the zone...
3161 *
3162 * In either case wait for a thread to finish, then try again.
3163 */
3164 zone->waiting = TRUE;
3165 zone_sleep(zone);
3166 } else {
3167 vm_offset_t space;
3168 vm_size_t alloc_size;
3169 int retry = 0;
3170
3171 if ((zone->cur_size + zone->elem_size) >
3172 zone->max_size) {
3173 if (zone->exhaustible)
3174 break;
3175 if (zone->expandable) {
3176 /*
3177 * We're willing to overflow certain
3178 * zones, but not without complaining.
3179 *
3180 * This is best used in conjunction
3181 * with the collectable flag. What we
3182 * want is an assurance we can get the
3183 * memory back, assuming there's no
3184 * leak.
3185 */
3186 zone->max_size += (zone->max_size >> 1);
3187 } else {
3188 unlock_zone(zone);
3189
3190 panic_include_zprint = TRUE;
3191 #if CONFIG_ZLEAKS
3192 if (zleak_state & ZLEAK_STATE_ACTIVE)
3193 panic_include_ztrace = TRUE;
3194 #endif /* CONFIG_ZLEAKS */
3195 panic("zalloc: zone \"%s\" empty.", zone->zone_name);
3196 }
3197 }
3198 /*
3199 * It is possible that a BG thread is refilling/expanding the zone
3200 * and gets pre-empted during that operation. That blocks all other
3201 * threads from making progress leading to a watchdog timeout. To
3202 * avoid that, boost the thread priority using the rwlock boost
3203 */
3204 set_thread_rwlock_boost();
3205
3206 if ((thr->options & TH_OPT_VMPRIV)) {
3207 zone->doing_alloc_with_vm_priv = TRUE;
3208 set_doing_alloc_with_vm_priv = TRUE;
3209 } else {
3210 zone->doing_alloc_without_vm_priv = TRUE;
3211 }
3212 unlock_zone(zone);
3213
3214 for (;;) {
3215 int zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
3216
3217 if (vm_pool_low() || retry >= 1)
3218 alloc_size =
3219 round_page(zone->elem_size);
3220 else
3221 alloc_size = zone->alloc_size;
3222
3223 if (zone->noencrypt)
3224 zflags |= KMA_NOENCRYPT;
3225
3226 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
3227 if (is_zone_map_nearing_exhaustion()) {
3228 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3229 }
3230
3231 retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
3232 if (retval == KERN_SUCCESS) {
3233 #if CONFIG_ZLEAKS
3234 if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) {
3235 if (zone_map->size >= zleak_global_tracking_threshold) {
3236 kern_return_t kr;
3237
3238 kr = zleak_activate();
3239 if (kr != KERN_SUCCESS) {
3240 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3241 }
3242 }
3243 }
3244
3245 if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) {
3246 if (zone->cur_size > zleak_per_zone_tracking_threshold) {
3247 zone->zleak_on = TRUE;
3248 }
3249 }
3250 #endif /* CONFIG_ZLEAKS */
3251 zcram(zone, space, alloc_size);
3252
3253 break;
3254 } else if (retval != KERN_RESOURCE_SHORTAGE) {
3255 retry++;
3256
3257 if (retry == 3) {
3258 panic_include_zprint = TRUE;
3259 #if CONFIG_ZLEAKS
3260 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3261 panic_include_ztrace = TRUE;
3262 }
3263 #endif /* CONFIG_ZLEAKS */
3264 if (retval == KERN_NO_SPACE) {
3265 zone_t zone_largest = zone_find_largest();
3266 panic("zalloc: zone map exhausted while allocating from zone %s, likely due to memory leak in zone %s (%lu total bytes, %d elements allocated)",
3267 zone->zone_name, zone_largest->zone_name,
3268 (unsigned long)zone_largest->cur_size, zone_largest->count);
3269
3270 }
3271 panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count);
3272 }
3273 } else {
3274 break;
3275 }
3276 }
3277 lock_zone(zone);
3278 assert(zone->zone_valid);
3279
3280 if (set_doing_alloc_with_vm_priv == TRUE)
3281 zone->doing_alloc_with_vm_priv = FALSE;
3282 else
3283 zone->doing_alloc_without_vm_priv = FALSE;
3284
3285 if (zone->waiting) {
3286 zone->waiting = FALSE;
3287 zone_wakeup(zone);
3288 }
3289 clear_thread_rwlock_boost();
3290
3291 addr = try_alloc_from_zone(zone, tag, &check_poison);
3292 if (addr == 0 &&
3293 retval == KERN_RESOURCE_SHORTAGE) {
3294 if (nopagewait == TRUE)
3295 break; /* out of the main while loop */
3296 unlock_zone(zone);
3297
3298 VM_PAGE_WAIT();
3299 lock_zone(zone);
3300 assert(zone->zone_valid);
3301 }
3302 }
3303 if (addr == 0)
3304 addr = try_alloc_from_zone(zone, tag, &check_poison);
3305 }
3306
3307 #if CONFIG_ZLEAKS
3308 /* Zone leak detection:
3309 * If we're sampling this allocation, add it to the zleaks hash table.
3310 */
3311 if (addr && zleak_tracedepth > 0) {
3312 /* Sampling can fail if another sample is happening at the same time in a different zone. */
3313 if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
3314 /* If it failed, roll back the counter so we sample the next allocation instead. */
3315 zone->zleak_capture = zleak_sample_factor;
3316 }
3317 }
3318 #endif /* CONFIG_ZLEAKS */
3319
3320
3321 if ((addr == 0) && (!canblock || nopagewait) && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
3322 zone->async_pending = TRUE;
3323 unlock_zone(zone);
3324 thread_call_enter(&call_async_alloc);
3325 lock_zone(zone);
3326 assert(zone->zone_valid);
3327 addr = try_alloc_from_zone(zone, tag, &check_poison);
3328 }
3329
3330 #if VM_MAX_TAG_ZONES
3331 if (__improbable(zone->tags) && addr) {
3332 if (reqsize) reqsize = zone->elem_size - reqsize;
3333 vm_tag_update_zone_size(tag, zone->tag_zone_index, zone->elem_size, reqsize);
3334 }
3335 #endif /* VM_MAX_TAG_ZONES */
3336
3337 unlock_zone(zone);
3338
3339 vm_offset_t inner_size = zone->elem_size;
3340
3341 if (__improbable(DO_LOGGING(zone) && addr)) {
3342 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved);
3343 }
3344
3345 if (__improbable(check_poison && addr)) {
3346 vm_offset_t *element_cursor = ((vm_offset_t *) addr) + 1;
3347 vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *) addr);
3348
3349 for ( ; element_cursor < backup ; element_cursor++)
3350 if (__improbable(*element_cursor != ZP_POISON))
3351 zone_element_was_modified_panic(zone,
3352 addr,
3353 *element_cursor,
3354 ZP_POISON,
3355 ((vm_offset_t)element_cursor) - addr);
3356 }
3357
3358 if (addr) {
3359 /*
3360 * Clear out the old next pointer and backup pointer to avoid leaking the
3361 * cookie, and so that only elements on the freelist carry a valid cookie.
3362 */
3363
3364 vm_offset_t *primary = (vm_offset_t *) addr;
3365 vm_offset_t *backup = get_backup_ptr(inner_size, primary);
3366
3367 *primary = ZP_POISON;
3368 *backup = ZP_POISON;
3369
3370 #if DEBUG || DEVELOPMENT
3371 if (__improbable(leak_scan_debug_flag && !(zone->elem_size & (sizeof(uintptr_t) - 1)))) {
3372 int count, idx;
3373 /* Fill element, from tail, with backtrace in reverse order */
3374 if (numsaved == 0) numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH);
3375 count = (int) (zone->elem_size / sizeof(uintptr_t));
3376 if (count >= numsaved) count = numsaved - 1;
3377 for (idx = 0; idx < count; idx++) ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
3378 }
3379 #endif /* DEBUG || DEVELOPMENT */
3380 }
3381
3382 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);
3383
3384 #if KASAN_ZALLOC
3385 /* Fixup the return address to skip the redzone */
3386 if (zone->kasan_redzone) {
3387 addr = kasan_alloc(addr, zone->elem_size,
3388 zone->elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
3389 }
3390 #endif
3391
3392 return((void *)addr);
3393 }
3394
3395 void *
3396 zalloc(zone_t zone)
3397 {
3398 return (zalloc_internal(zone, TRUE, FALSE, 0, VM_KERN_MEMORY_NONE));
3399 }
3400
3401 void *
3402 zalloc_noblock(zone_t zone)
3403 {
3404 return (zalloc_internal(zone, FALSE, FALSE, 0, VM_KERN_MEMORY_NONE));
3405 }
3406
3407 void *
3408 zalloc_nopagewait(zone_t zone)
3409 {
3410 return (zalloc_internal(zone, TRUE, TRUE, 0, VM_KERN_MEMORY_NONE));
3411 }
3412
3413 void *
3414 zalloc_canblock_tag(zone_t zone, boolean_t canblock, vm_size_t reqsize, vm_tag_t tag)
3415 {
3416 return (zalloc_internal(zone, canblock, FALSE, reqsize, tag));
3417 }
3418
3419 void *
3420 zalloc_canblock(zone_t zone, boolean_t canblock)
3421 {
3422 return (zalloc_internal(zone, canblock, FALSE, 0, VM_KERN_MEMORY_NONE));
3423 }
3424
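/*
 * Illustrative usage sketch (not compiled): a typical client of the blocking
 * front-ends above creates a zone once with zinit() and then pairs
 * zalloc()/zfree() on it. The "widget" type, the zone sizing and the zone
 * name below are hypothetical, chosen only for illustration.
 */
#if 0
struct widget {
	uint64_t	w_id;
	uint64_t	w_flags;
};

static zone_t widget_zone;

static void
widget_zone_init(void)
{
	/* element size, maximum zone size, allocation chunk size, name */
	widget_zone = zinit(sizeof(struct widget),
	    1024 * sizeof(struct widget),
	    PAGE_SIZE, "hypothetical.widgets");
}

static struct widget *
widget_alloc(void)
{
	/*
	 * zalloc() may block while the zone is refilled; callers that cannot
	 * block would use zalloc_noblock() or zget() instead and handle a
	 * NULL return.
	 */
	return ((struct widget *) zalloc(widget_zone));
}

static void
widget_free(struct widget *w)
{
	zfree(widget_zone, w);
}
#endif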
3425
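/*
 * zalloc_async is the thread_call handler queued from zalloc_internal when a
 * non-blocking allocation comes up empty: for each zone that has async_pending
 * set, it allocates and immediately frees one element, which forces the zone
 * to be refilled on behalf of callers that could not block themselves.
 */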
3426 void
3427 zalloc_async(
3428 __unused thread_call_param_t p0,
3429 __unused thread_call_param_t p1)
3430 {
3431 zone_t current_z = NULL;
3432 unsigned int max_zones, i;
3433 void *elt = NULL;
3434 boolean_t pending = FALSE;
3435
3436 simple_lock(&all_zones_lock);
3437 max_zones = num_zones;
3438 simple_unlock(&all_zones_lock);
3439 for (i = 0; i < max_zones; i++) {
3440 current_z = &(zone_array[i]);
3441
3442 if (current_z->no_callout == TRUE) {
3443 /* async_pending will never be set */
3444 continue;
3445 }
3446
3447 lock_zone(current_z);
3448 if (current_z->zone_valid && current_z->async_pending == TRUE) {
3449 current_z->async_pending = FALSE;
3450 pending = TRUE;
3451 }
3452 unlock_zone(current_z);
3453
3454 if (pending == TRUE) {
3455 elt = zalloc_canblock_tag(current_z, TRUE, 0, VM_KERN_MEMORY_OSFMK);
3456 zfree(current_z, elt);
3457 pending = FALSE;
3458 }
3459 }
3460 }
3461
3462 /*
3463 * zget returns an element from the specified zone, or NULL immediately
3464 * if the zone has nothing available; it will not block to refill the zone.
3465 */
3466 void *
3467 zget(
3468 zone_t zone)
3469 {
3470 return zalloc_internal(zone, FALSE, TRUE, 0, VM_KERN_MEMORY_NONE);
3471 }
3472
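/*
 * Illustrative sketch (not compiled): in a context that must not wait for the
 * zone to be refilled, a caller would prefer zget() (or zalloc_noblock()) and
 * must be prepared for a NULL return. The "widget" names are hypothetical.
 */
#if 0
static boolean_t
widget_try_enqueue(uint64_t id)
{
	struct widget *w = (struct widget *) zget(widget_zone);

	if (w == NULL) {
		/* nothing immediately available; caller must retry or defer */
		return FALSE;
	}
	w->w_id = id;
	/* ... hand the widget off ... */
	return TRUE;
}
#endif
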
3473 /* Keep this FALSE by default. Large memory machines run orders of magnitude
3474 slower in debug mode when it is TRUE. Use the debugger to enable if needed. */
3475 /* static */ boolean_t zone_check = FALSE;
3476
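/*
 * With zone_check enabled, zone_check_freelist() walks every freelist in the
 * zone and panics if it finds an element that is not sane for this zone or if
 * it finds `elem' (the element about to be freed) already on a freelist,
 * i.e. a double free.
 */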
3477 static void zone_check_freelist(zone_t zone, vm_offset_t elem)
3478 {
3479 struct zone_free_element *this;
3480 struct zone_page_metadata *thispage;
3481
3482 if (zone->allows_foreign) {
3483 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
3484 !queue_end(&zone->pages.any_free_foreign, &(thispage->pages));
3485 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3486 for (this = page_metadata_get_freelist(thispage);
3487 this != NULL;
3488 this = this->next) {
3489 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
3490 panic("zone_check_freelist");
3491 }
3492 }
3493 }
3494 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
3495 !queue_end(&zone->pages.all_free, &(thispage->pages));
3496 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3497 for (this = page_metadata_get_freelist(thispage);
3498 this != NULL;
3499 this = this->next) {
3500 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
3501 panic("zone_check_freelist");
3502 }
3503 }
3504 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
3505 !queue_end(&zone->pages.intermediate, &(thispage->pages));
3506 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3507 for (this = page_metadata_get_freelist(thispage);
3508 this != NULL;
3509 this = this->next) {
3510 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
3511 panic("zone_check_freelist");
3512 }
3513 }
3514 }
3515
3516 void
3517 zfree(
3518 zone_t zone,
3519 void *addr)
3520 {
3521 vm_offset_t elem = (vm_offset_t) addr;
3522 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */
3523 int numsaved = 0;
3524 boolean_t gzfreed = FALSE;
3525 boolean_t poison = FALSE;
3526 #if VM_MAX_TAG_ZONES
3527 vm_tag_t tag;
3528 #endif /* VM_MAX_TAG_ZONES */
3529
3530 assert(zone != ZONE_NULL);
3531
3532 #if KASAN_ZALLOC
3533 /*
3534 * Resize back to the real allocation size and hand off to the KASan
3535 * quarantine. `addr` may then point to a different allocation.
3536 */
3537 vm_size_t usersz = zone->elem_size - 2 * zone->kasan_redzone;
3538 vm_size_t sz = usersz;
3539 if (addr && zone->kasan_redzone) {
3540 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
3541 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
3542 assert(sz == zone->elem_size);
3543 }
3544 if (addr && zone->kasan_quarantine) {
3545 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, &zone, usersz, true);
3546 if (!addr) {
3547 return;
3548 }
3549 }
3550 elem = (vm_offset_t)addr;
3551 #endif
3552
3553 /*
3554 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
3555 */
3556
3557 if (__improbable(DO_LOGGING(zone) && corruption_debug_flag))
3558 numsaved = OSBacktrace((void *)zbt, MAX_ZTRACE_DEPTH);
3559
3560 #if MACH_ASSERT
3561 /* Basic sanity checks */
3562 if (zone == ZONE_NULL || elem == (vm_offset_t)0)
3563 panic("zfree: NULL");
3564 #endif
3565
3566 #if CONFIG_GZALLOC
3567 gzfreed = gzalloc_free(zone, addr);
3568 #endif
3569
3570 if (!gzfreed) {
3571 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
3572 if (zone != PAGE_METADATA_GET_ZONE(page_meta)) {
3573 panic("Element %p from zone %s caught being freed to wrong zone %s\n", addr, PAGE_METADATA_GET_ZONE(page_meta)->zone_name, zone->zone_name);
3574 }
3575 }
3576
3577 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr);
3578
3579 if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign &&
3580 !from_zone_map(elem, zone->elem_size))) {
3581 panic("zfree: non-allocated memory in collectable zone!");
3582 }
3583
3584 if ((zp_factor != 0 || zp_tiny_zone_limit != 0) && !gzfreed) {
3585 /*
3586 * Poison the memory before it ends up on the freelist to catch
3587 * use-after-free and use of uninitialized memory.
3588 *
3589 * Always poison tiny zones' elements (the limit is 0 if -no-zp is set);
3590 * poison larger elements only periodically.
3591 */
3592
3593 vm_offset_t inner_size = zone->elem_size;
3594
3595 uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);
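/*
 * Purely as an illustration (the tunables' real values are established
 * elsewhere): if zp_factor were 16 and zp_scale were 12, an 8192-byte
 * element would give sample_factor == 16 + (8192 >> 12) == 18, i.e.
 * roughly one in eighteen frees of that size is poisoned, while elements
 * at or below zp_tiny_zone_limit are always poisoned.
 */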
3596
3597 if (inner_size <= zp_tiny_zone_limit)
3598 poison = TRUE;
3599 else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE)
3600 poison = TRUE;
3601
3602 if (__improbable(poison)) {
3603
3604 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
3605 /* Poison everything but primary and backup */
3606 vm_offset_t *element_cursor = ((vm_offset_t *) elem) + 1;
3607 vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *)elem);
3608
3609 for ( ; element_cursor < backup; element_cursor++)
3610 *element_cursor = ZP_POISON;
3611 }
3612 }
3613
3614 /*
3615 * See if we're doing logging on this zone. There are two styles of logging used depending on
3616 * whether we're trying to catch a leak or corruption. See comments above in zalloc for details.
3617 */
3618
3619 if (__improbable(DO_LOGGING(zone))) {
3620 if (corruption_debug_flag) {
3621 /*
3622 * We're logging to catch corruption. Add a record of this zfree operation
3623 * to the log.
3624 */
3625 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved);
3626 } else {
3627 /*
3628 * We're logging to catch a leak. Remove any record we might have for this
3629 * element since it's being freed. Note that we may not find it if the buffer
3630 * overflowed and that's OK. Since the log is of a limited size, old records
3631 * get overwritten if there are more zallocs than zfrees.
3632 */
3633 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
3634 }
3635 }
3636
3637 lock_zone(zone);
3638 assert(zone->zone_valid);
3639
3640 if (zone_check) {
3641 zone_check_freelist(zone, elem);
3642 }
3643
3644 if (__probable(!gzfreed)) {
3645 #if VM_MAX_TAG_ZONES
3646 if (__improbable(zone->tags)) {
3647 tag = (ZTAG(zone, elem)[0] >> 1);
3648 // set the tag with bit 0 clear so the block remains in use
3649 ZTAG(zone, elem)[0] = 0xFFFE;
3650 }
3651 #endif /* VM_MAX_TAG_ZONES */
3652 free_to_zone(zone, elem, poison);
3653 }
3654
3655 #if MACH_ASSERT
3656 if (zone->count < 0)
3657 panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone",
3658 zone->zone_name, addr);
3659 #endif
3660
3661
3662 #if CONFIG_ZLEAKS
3663 /*
3664 * Zone leak detection: un-track the allocation
3665 */
3666 if (zone->zleak_on) {
3667 zleak_free(elem, zone->elem_size);
3668 }
3669 #endif /* CONFIG_ZLEAKS */
3670
3671 #if VM_MAX_TAG_ZONES
3672 if (__improbable(zone->tags) && __probable(!gzfreed)) {
3673 vm_tag_update_zone_size(tag, zone->tag_zone_index, -((int64_t)zone->elem_size), 0);
3674 }
3675 #endif /* VM_MAX_TAG_ZONES */
3676
3677 unlock_zone(zone);
3678 }
3679
3680 /* Change a zone's flags.
3681 * This routine must be called immediately after zinit.
3682 */
3683 void
3684 zone_change(
3685 zone_t zone,
3686 unsigned int item,
3687 boolean_t value)
3688 {
3689 assert( zone != ZONE_NULL );
3690 assert( value == TRUE || value == FALSE );
3691
3692 switch(item){
3693 case Z_NOENCRYPT:
3694 zone->noencrypt = value;
3695 break;
3696 case Z_EXHAUST:
3697 zone->exhaustible = value;
3698 break;
3699 case Z_COLLECT:
3700 zone->collectable = value;
3701 break;
3702 case Z_EXPAND:
3703 zone->expandable = value;
3704 break;
3705 case Z_FOREIGN:
3706 zone->allows_foreign = value;
3707 break;
3708 case Z_CALLERACCT:
3709 zone->caller_acct = value;
3710 break;
3711 case Z_NOCALLOUT:
3712 zone->no_callout = value;
3713 break;
3714 case Z_TAGS_ENABLED:
3715 #if VM_MAX_TAG_ZONES
3716 {
3717 static int tag_zone_index;
3718 zone->tags = TRUE;
3719 zone->tags_inline = (((page_size + zone->elem_size - 1) / zone->elem_size) <= (sizeof(uint32_t) / sizeof(uint16_t)));
3720 zone->tag_zone_index = OSAddAtomic(1, &tag_zone_index);
3721 }
3722 #endif /* VM_MAX_TAG_ZONES */
3723 break;
3724 case Z_GZALLOC_EXEMPT:
3725 zone->gzalloc_exempt = value;
3726 #if CONFIG_GZALLOC
3727 gzalloc_reconfigure(zone);
3728 #endif
3729 break;
3730 case Z_ALIGNMENT_REQUIRED:
3731 zone->alignment_required = value;
3732 #if KASAN_ZALLOC
3733 if (zone->kasan_redzone == KASAN_GUARD_SIZE) {
3734 /* Don't disturb alignment with the redzone for zones with
3735 * specific alignment requirements. */
3736 zone->elem_size -= zone->kasan_redzone * 2;
3737 zone->kasan_redzone = 0;
3738 }
3739 #endif
3740 #if CONFIG_GZALLOC
3741 gzalloc_reconfigure(zone);
3742 #endif
3743 break;
3744 case Z_KASAN_QUARANTINE:
3745 zone->kasan_quarantine = value;
3746 break;
3747 default:
3748 panic("Zone_change: Wrong Item Type!");
3749 /* break; */
3750 }
3751 }
3752
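/*
 * Illustrative sketch (not compiled): zone_change() is meant to be applied
 * right after zinit(), before the zone is used. The zone and the particular
 * flag choices below are hypothetical; the flag names correspond to the cases
 * handled in the switch above.
 */
#if 0
	widget_zone = zinit(sizeof(struct widget),
	    1024 * sizeof(struct widget), PAGE_SIZE, "hypothetical.widgets");
	/* request KMA_NOENCRYPT backing pages for this zone's allocations */
	zone_change(widget_zone, Z_NOENCRYPT, TRUE);
	/* toggle caller accounting for this zone's elements */
	zone_change(widget_zone, Z_CALLERACCT, FALSE);
	/* allow the zone to grow past max_size (see the expansion path in zalloc) */
	zone_change(widget_zone, Z_EXPAND, TRUE);
#endif
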
3753 /*
3754 * Return the expected number of free elements in the zone.
3755 * This calculation will be incorrect if items are zfree'd that
3756 * were never zalloc'd/zget'd. The correct way to stuff memory
3757 * into a zone is by zcram.
3758 */
3759
3760 integer_t
3761 zone_free_count(zone_t zone)
3762 {
3763 integer_t free_count;
3764
3765 lock_zone(zone);
3766 free_count = zone->countfree;
3767 unlock_zone(zone);
3768
3769 assert(free_count >= 0);
3770
3771 return(free_count);
3772 }
3773
3774 /* Drops the elements in the free queue of a zone. Called by zone_gc() on each zone, and when a zone is zdestroy'ed. */
3775 void
3776 drop_free_elements(zone_t z)
3777 {
3778 vm_size_t elt_size, size_freed;
3779 int total_freed_pages = 0;
3780 uint64_t old_all_free_count;
3781 struct zone_page_metadata *page_meta;
3782 queue_head_t page_meta_head;
3783
3784 lock_zone(z);
3785 if (queue_empty(&z->pages.all_free)) {
3786 unlock_zone(z);
3787 return;
3788 }
3789
3790 /*
3791 * Snatch all of the free elements away from the zone.
3792 */
3793 elt_size = z->elem_size;
3794 old_all_free_count = z->count_all_free_pages;
3795 queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages);
3796 queue_init(&z->pages.all_free);
3797 z->count_all_free_pages = 0;
3798 unlock_zone(z);
3799
3800 /* Iterate through the snatched page metadata to total the size and count of the elements we took */
3801 size_freed = 0;
3802 queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) {
3803 assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */
3804 size_freed += elt_size * page_meta->free_count;
3805 }
3806
3807 /* Update the zone size and free element count */
3808 lock_zone(z);
3809 z->cur_size -= size_freed;
3810 z->countfree -= size_freed/elt_size;
3811 unlock_zone(z);
3812
3813 while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) {
3814 vm_address_t free_page_address;
3815 /* Free the pages described by this metadata and account for them */
3816 free_page_address = get_zone_page(page_meta);
3817 ZONE_PAGE_COUNT_DECR(z, page_meta->page_count);
3818 total_freed_pages += page_meta->page_count;
3819 old_all_free_count -= page_meta->page_count;
3820 #if KASAN_ZALLOC
3821 kasan_poison_range(free_page_address, page_meta->page_count * PAGE_SIZE, ASAN_VALID);
3822 #endif
3823 #if VM_MAX_TAG_ZONES
3824 if (z->tags) ztMemoryRemove(z, free_page_address, (page_meta->page_count * PAGE_SIZE));
3825 #endif /* VM_MAX_TAG_ZONES */
3826 kmem_free(zone_map, free_page_address, (page_meta->page_count * PAGE_SIZE));
3827 if (current_thread()->options & TH_OPT_ZONE_GC) {
3828 thread_yield_to_preemption();
3829 }
3830 }
3831
3832 /* We freed all the pages from the all_free list for this zone */
3833 assert(old_all_free_count == 0);
3834
3835 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
3836 kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages);
3837 }
3838
3839 /* Zone garbage collection
3840 *
3841 * zone_gc will walk through all the free elements in all the
3842 * zones that are marked collectable looking for reclaimable
3843 * pages. zone_gc is called by consider_zone_gc when the system
3844 * begins to run out of memory.
3845 *
3846 * We should ensure that zone_gc never blocks.
3847 */
3848 void
3849 zone_gc(boolean_t consider_jetsams)
3850 {
3851 unsigned int max_zones;
3852 zone_t z;
3853 unsigned int i;
3854
3855 if (consider_jetsams) {
3856 kill_process_in_largest_zone();
3857 /*
3858 * If we do end up jetsamming something, we need to do a zone_gc so that
3859 * we can reclaim free zone elements and update the zone map size.
3860 * Fall through.
3861 */
3862 }
3863
3864 lck_mtx_lock(&zone_gc_lock);
3865
3866 current_thread()->options |= TH_OPT_ZONE_GC;
3867
3868 simple_lock(&all_zones_lock);
3869 max_zones = num_zones;
3870 simple_unlock(&all_zones_lock);
3871
3872 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
3873 kprintf("zone_gc() starting...\n");
3874
3875 for (i = 0; i < max_zones; i++) {
3876 z = &(zone_array[i]);
3877 assert(z != ZONE_NULL);
3878
3879 if (!z->collectable) {
3880 continue;
3881 }
3882
3883 if (queue_empty(&z->pages.all_free)) {
3884 continue;
3885 }
3886
3887 drop_free_elements(z);
3888 }
3889
3890 current_thread()->options &= ~TH_OPT_ZONE_GC;
3891
3892 lck_mtx_unlock(&zone_gc_lock);
3893 }
3894
3895 extern vm_offset_t kmapoff_kaddr;
3896 extern unsigned int kmapoff_pgcnt;
3897
3898 /*
3899 * consider_zone_gc:
3900 *
3901 * Called by the pageout daemon when the system needs more free pages.
3902 */
3903
3904 void
3905 consider_zone_gc(boolean_t consider_jetsams)
3906 {
3907 if (kmapoff_kaddr != 0) {
3908 /*
3909 * One-time reclaim of kernel_map resources we allocated in
3910 * early boot.
3911 */
3912 (void) vm_deallocate(kernel_map,
3913 kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64);
3914 kmapoff_kaddr = 0;
3915 }
3916
3917 if (zone_gc_allowed)
3918 zone_gc(consider_jetsams);
3919 }
3920
3921 kern_return_t
3922 task_zone_info(
3923 __unused task_t task,
3924 __unused mach_zone_name_array_t *namesp,
3925 __unused mach_msg_type_number_t *namesCntp,
3926 __unused task_zone_info_array_t *infop,
3927 __unused mach_msg_type_number_t *infoCntp)
3928 {
3929 return KERN_FAILURE;
3930 }
3931
3932 kern_return_t
3933 mach_zone_info(
3934 host_priv_t host,
3935 mach_zone_name_array_t *namesp,
3936 mach_msg_type_number_t *namesCntp,
3937 mach_zone_info_array_t *infop,
3938 mach_msg_type_number_t *infoCntp)
3939 {
3940 return (mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL));
3941 }
3942
3943
3944 kern_return_t
3945 mach_memory_info(
3946 host_priv_t host,
3947 mach_zone_name_array_t *namesp,
3948 mach_msg_type_number_t *namesCntp,
3949 mach_zone_info_array_t *infop,
3950 mach_msg_type_number_t *infoCntp,
3951 mach_memory_info_array_t *memoryInfop,
3952 mach_msg_type_number_t *memoryInfoCntp)
3953 {
3954 mach_zone_name_t *names;
3955 vm_offset_t names_addr;
3956 vm_size_t names_size;
3957
3958 mach_zone_info_t *info;
3959 vm_offset_t info_addr;
3960 vm_size_t info_size;
3961
3962 mach_memory_info_t *memory_info;
3963 vm_offset_t memory_info_addr;
3964 vm_size_t memory_info_size;
3965 vm_size_t memory_info_vmsize;
3966 unsigned int num_info;
3967
3968 unsigned int max_zones, used_zones, i;
3969 zone_t z;
3970 mach_zone_name_t *zn;
3971 mach_zone_info_t *zi;
3972 kern_return_t kr;
3973
3974 vm_size_t used;
3975 vm_map_copy_t copy;
3976 uint64_t zones_collectable_bytes = 0;
3977
3978 if (host == HOST_NULL)
3979 return KERN_INVALID_HOST;
3980 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
3981 if (!PE_i_can_has_debugger(NULL))
3982 return KERN_INVALID_HOST;
3983 #endif
3984
3985 /*
3986 * We assume that zones aren't freed once allocated.
3987 * We won't pick up any zones that are allocated later.
3988 */
3989
3990 simple_lock(&all_zones_lock);
3991 max_zones = (unsigned int)(num_zones);
3992 simple_unlock(&all_zones_lock);
3993
3994 names_size = round_page(max_zones * sizeof *names);
3995 kr = kmem_alloc_pageable(ipc_kernel_map,
3996 &names_addr, names_size, VM_KERN_MEMORY_IPC);
3997 if (kr != KERN_SUCCESS)
3998 return kr;
3999 names = (mach_zone_name_t *) names_addr;
4000
4001 info_size = round_page(max_zones * sizeof *info);
4002 kr = kmem_alloc_pageable(ipc_kernel_map,
4003 &info_addr, info_size, VM_KERN_MEMORY_IPC);
4004 if (kr != KERN_SUCCESS) {
4005 kmem_free(ipc_kernel_map,
4006 names_addr, names_size);
4007 return kr;
4008 }
4009 info = (mach_zone_info_t *) info_addr;
4010
4011 zn = &names[0];
4012 zi = &info[0];
4013
4014 used_zones = max_zones;
4015 for (i = 0; i < max_zones; i++) {
4016 struct zone zcopy;
4017 z = &(zone_array[i]);
4018 assert(z != ZONE_NULL);
4019
4020 lock_zone(z);
4021 if (!z->zone_valid) {
4022 unlock_zone(z);
4023 used_zones--;
4024 continue;
4025 }
4026 zcopy = *z;
4027 unlock_zone(z);
4028
4029 /* assuming here the name data is static */
4030 (void) __nosan_strncpy(zn->mzn_name, zcopy.zone_name,
4031 sizeof zn->mzn_name);
4032 zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
4033
4034 zi->mzi_count = (uint64_t)zcopy.count;
4035 zi->mzi_cur_size = ptoa_64(zcopy.page_count);
4036 zi->mzi_max_size = (uint64_t)zcopy.max_size;
4037 zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
4038 zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
4039 zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
4040 zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
4041 zi->mzi_collectable = (uint64_t)zcopy.collectable;
4042 zones_collectable_bytes += ((uint64_t)zcopy.count_all_free_pages * PAGE_SIZE);
4043 zn++;
4044 zi++;
4045 }
4046
4047 used = used_zones * sizeof *names;
4048 if (used != names_size)
4049 bzero((char *) (names_addr + used), names_size - used);
4050
4051 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
4052 (vm_map_size_t)used, TRUE, &copy);
4053 assert(kr == KERN_SUCCESS);
4054
4055 *namesp = (mach_zone_name_t *) copy;
4056 *namesCntp = used_zones;
4057
4058 used = used_zones * sizeof *info;
4059
4060 if (used != info_size)
4061 bzero((char *) (info_addr + used), info_size - used);
4062
4063 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
4064 (vm_map_size_t)used, TRUE, &copy);
4065 assert(kr == KERN_SUCCESS);
4066
4067 *infop = (mach_zone_info_t *) copy;
4068 *infoCntp = used_zones;
4069
4070 num_info = 0;
4071 memory_info_addr = 0;
4072
4073 if (memoryInfop && memoryInfoCntp)
4074 {
4075 num_info = vm_page_diagnose_estimate();
4076 memory_info_size = num_info * sizeof(*memory_info);
4077 memory_info_vmsize = round_page(memory_info_size);
4078 kr = kmem_alloc_pageable(ipc_kernel_map,
4079 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
4080 if (kr != KERN_SUCCESS) {
4081 kmem_free(ipc_kernel_map,
4082 names_addr, names_size);
4083 kmem_free(ipc_kernel_map,
4084 info_addr, info_size);
4085 return kr;
4086 }
4087
4088 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
4089 VM_PROT_READ|VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
4090 assert(kr == KERN_SUCCESS);
4091
4092 memory_info = (mach_memory_info_t *) memory_info_addr;
4093 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
4094
4095 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
4096 assert(kr == KERN_SUCCESS);
4097
4098 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
4099 (vm_map_size_t)memory_info_size, TRUE, &copy);
4100 assert(kr == KERN_SUCCESS);
4101
4102 *memoryInfop = (mach_memory_info_t *) copy;
4103 *memoryInfoCntp = num_info;
4104 }
4105
4106 return KERN_SUCCESS;
4107 }
4108
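/*
 * Illustrative user-space sketch (not kernel code, not compiled): the zone
 * statistics filled in above can be read via the MIG-generated
 * mach_zone_info() call, much as the zprint tool does. This assumes the
 * prototypes from <mach/mach.h> and that the caller may use its host port.
 */
#if 0
#include <mach/mach.h>
#include <stdio.h>

static void
dump_zone_info(void)
{
	mach_zone_name_array_t	names;
	mach_zone_info_array_t	info;
	mach_msg_type_number_t	nameCnt, infoCnt;
	kern_return_t		kr;
	unsigned int		i;

	kr = mach_zone_info(mach_host_self(), &names, &nameCnt, &info, &infoCnt);
	if (kr != KERN_SUCCESS)
		return;

	for (i = 0; i < infoCnt; i++)
		printf("%s: %llu elements of %llu bytes, %llu bytes mapped\n",
		    names[i].mzn_name, info[i].mzi_count,
		    info[i].mzi_elem_size, info[i].mzi_cur_size);

	/* both arrays come back as out-of-line memory and must be released */
	(void) vm_deallocate(mach_task_self(), (vm_address_t) names,
	    nameCnt * sizeof(*names));
	(void) vm_deallocate(mach_task_self(), (vm_address_t) info,
	    infoCnt * sizeof(*info));
}
#endif
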
4109 uint64_t
4110 get_zones_collectable_bytes(void)
4111 {
4112 zone_t z;
4113 unsigned int i, max_zones;
4114 uint64_t zones_collectable_bytes = 0;
4115
4116 simple_lock(&all_zones_lock);
4117 max_zones = (unsigned int)(num_zones);
4118 simple_unlock(&all_zones_lock);
4119
4120 for (i = 0; i < max_zones; i++) {
4121 z = &(zone_array[i]);
4122 assert(z != ZONE_NULL);
4123
4124 lock_zone(z);
4125 zones_collectable_bytes += ((uint64_t)z->count_all_free_pages * PAGE_SIZE);
4126 unlock_zone(z);
4127 }
4128
4129 return zones_collectable_bytes;
4130 }
4131
4132 #if DEBUG || DEVELOPMENT
4133
4134 kern_return_t
4135 mach_memory_info_check(void)
4136 {
4137 mach_memory_info_t * memory_info;
4138 mach_memory_info_t * info;
4139 zone_t zone;
4140 unsigned int idx, num_info, max_zones;
4141 vm_offset_t memory_info_addr;
4142 kern_return_t kr;
4143 size_t memory_info_size, memory_info_vmsize;
4144 uint64_t top_wired, zonestotal, total;
4145
4146 num_info = vm_page_diagnose_estimate();
4147 memory_info_size = num_info * sizeof(*memory_info);
4148 memory_info_vmsize = round_page(memory_info_size);
4149 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
4150 assert (kr == KERN_SUCCESS);
4151
4152 memory_info = (mach_memory_info_t *) memory_info_addr;
4153 vm_page_diagnose(memory_info, num_info, 0);
4154
4155 simple_lock(&all_zones_lock);
4156 max_zones = num_zones;
4157 simple_unlock(&all_zones_lock);
4158
4159 top_wired = total = zonestotal = 0;
4160 for (idx = 0; idx < max_zones; idx++)
4161 {
4162 zone = &(zone_array[idx]);
4163 assert(zone != ZONE_NULL);
4164 lock_zone(zone);
4165 zonestotal += ptoa_64(zone->page_count);
4166 unlock_zone(zone);
4167 }
4168 for (idx = 0; idx < num_info; idx++)
4169 {
4170 info = &memory_info[idx];
4171 if (!info->size) continue;
4172 if (VM_KERN_COUNT_WIRED == info->site) top_wired = info->size;
4173 if (VM_KERN_SITE_HIDE & info->flags) continue;
4174 if (!(VM_KERN_SITE_WIRED & info->flags)) continue;
4175 total += info->size;
4176 }
4177 total += zonestotal;
4178
4179 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n", total, top_wired, zonestotal, top_wired - total);
4180
4181 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
4182
4183 return (kr);
4184 }
4185
4186 #endif /* DEBUG || DEVELOPMENT */
4187
4188 kern_return_t
4189 mach_zone_force_gc(
4190 host_t host)
4191 {
4192 if (host == HOST_NULL)
4193 return KERN_INVALID_HOST;
4194
4195 #if DEBUG || DEVELOPMENT
4196 consider_zone_gc(FALSE);
4197 #endif /* DEBUG || DEVELOPMENT */
4198 return (KERN_SUCCESS);
4199 }
4200
4201 extern unsigned int stack_total;
4202 extern unsigned long long stack_allocs;
4203
4204 #if defined(__i386__) || defined (__x86_64__)
4205 extern unsigned int inuse_ptepages_count;
4206 extern long long alloc_ptepages_count;
4207 #endif
4208
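/*
 * Return the zone with the largest cur_size. Used above by the zone-map
 * exhaustion panic path to name the most likely leaker.
 */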
4209 zone_t
4210 zone_find_largest(void)
4211 {
4212 unsigned int i;
4213 unsigned int max_zones;
4214 zone_t the_zone;
4215 zone_t zone_largest;
4216
4217 simple_lock(&all_zones_lock);
4218 max_zones = num_zones;
4219 simple_unlock(&all_zones_lock);
4220
4221 zone_largest = &(zone_array[0]);
4222 for (i = 0; i < max_zones; i++) {
4223 the_zone = &(zone_array[i]);
4224 if (the_zone->cur_size > zone_largest->cur_size) {
4225 zone_largest = the_zone;
4226 }
4227 }
4228 return zone_largest;
4229 }
4230
4231 #if ZONE_DEBUG
4232
4233 /* should we care about locks here? */
4234
4235 #define zone_in_use(z) ( z->count || z->free_elements \
4236 || !queue_empty(&z->pages.all_free) \
4237 || !queue_empty(&z->pages.intermediate) \
4238 || (z->allows_foreign && !queue_empty(&z->pages.any_free_foreign)))
4239
4240
4241 #endif /* ZONE_DEBUG */
4242
4243
4244 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
4245
4246 #if DEBUG || DEVELOPMENT
4247
4248 static uintptr_t *
4249 zone_copy_all_allocations_inqueue(zone_t z, queue_head_t * queue, uintptr_t * elems)
4250 {
4251 struct zone_page_metadata *page_meta;
4252 vm_offset_t free, elements;
4253 vm_offset_t idx, numElements, freeCount, bytesAvail, metaSize;
4254
4255 queue_iterate(queue, page_meta, struct zone_page_metadata *, pages)
4256 {
4257 elements = get_zone_page(page_meta);
4258 bytesAvail = ptoa(page_meta->page_count);
4259 freeCount = 0;
4260 if (z->allows_foreign && !from_zone_map(elements, z->elem_size))
4261 {
4262 metaSize = (sizeof(struct zone_page_metadata) + ZONE_ELEMENT_ALIGNMENT - 1) & ~(ZONE_ELEMENT_ALIGNMENT - 1);
4263 bytesAvail -= metaSize;
4264 elements += metaSize;
4265 }
4266 numElements = bytesAvail / z->elem_size;
4267 // construct array of all possible elements
4268 for (idx = 0; idx < numElements; idx++)
4269 {
4270 elems[idx] = INSTANCE_PUT(elements + idx * z->elem_size);
4271 }
4272 // remove from the array all free elements
4273 free = (vm_offset_t)page_metadata_get_freelist(page_meta);
4274 while (free)
4275 {
4276 // find idx of free element
4277 for (idx = 0; (idx < numElements) && (elems[idx] != INSTANCE_PUT(free)); idx++) {}
4278 assert(idx < numElements);
4279 // remove it
4280 bcopy(&elems[idx + 1], &elems[idx], (numElements - (idx + 1)) * sizeof(elems[0]));
4281 numElements--;
4282 freeCount++;
4283 // next free element
4284 vm_offset_t *primary = (vm_offset_t *) free;
4285 free = *primary ^ zp_nopoison_cookie;
4286 }
4287 elems += numElements;
4288 }
4289
4290 return (elems);
4291 }
4292
4293 kern_return_t
4294 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
4295 {
4296 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4297 zone_t zone;
4298 uintptr_t * array;
4299 uintptr_t * next;
4300 uintptr_t element, bt;
4301 uint32_t idx, count, found;
4302 uint32_t btidx, btcount, nobtcount, btfound;
4303 uint32_t elemSize;
4304 uint64_t maxElems;
4305 unsigned int max_zones;
4306 kern_return_t kr;
4307
4308 simple_lock(&all_zones_lock);
4309 max_zones = num_zones;
4310 simple_unlock(&all_zones_lock);
4311
4312 for (idx = 0; idx < max_zones; idx++)
4313 {
4314 if (!strncmp(zoneName, zone_array[idx].zone_name, nameLen)) break;
4315 }
4316 if (idx >= max_zones) return (KERN_INVALID_NAME);
4317 zone = &zone_array[idx];
4318
4319 elemSize = (uint32_t) zone->elem_size;
4320 maxElems = ptoa(zone->page_count) / elemSize;
4321
4322 if ((zone->alloc_size % elemSize)
4323 && !leak_scan_debug_flag) return (KERN_INVALID_CAPABILITY);
4324
4325 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
4326 maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG);
4327 if (KERN_SUCCESS != kr) return (kr);
4328
4329 lock_zone(zone);
4330
4331 next = array;
4332 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.any_free_foreign, next);
4333 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.intermediate, next);
4334 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.all_used, next);
4335 count = (uint32_t)(next - array);
4336
4337 unlock_zone(zone);
4338
4339 zone_leaks_scan(array, count, (uint32_t)zone->elem_size, &found);
4340 assert(found <= count);
4341
4342 for (idx = 0; idx < count; idx++)
4343 {
4344 element = array[idx];
4345 if (kInstanceFlagReferenced & element) continue;
4346 element = INSTANCE_PUT(element) & ~kInstanceFlags;
4347 }
4348
4349 if (zone->zlog_btlog && !corruption_debug_flag)
4350 {
4351 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on the elements it finds
4352 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
4353 }
4354
4355 for (nobtcount = idx = 0; idx < count; idx++)
4356 {
4357 element = array[idx];
4358 if (!element) continue;
4359 if (kInstanceFlagReferenced & element) continue;
4360 element = INSTANCE_PUT(element) & ~kInstanceFlags;
4361
4362 // see if we can find any backtrace left in the element
4363 btcount = (typeof(btcount)) (zone->elem_size / sizeof(uintptr_t));
4364 if (btcount >= MAX_ZTRACE_DEPTH) btcount = MAX_ZTRACE_DEPTH - 1;
4365 for (btfound = btidx = 0; btidx < btcount; btidx++)
4366 {
4367 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
4368 if (!VM_KERNEL_IS_SLID(bt)) break;
4369 zbt[btfound++] = bt;
4370 }
4371 if (btfound) (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
4372 else nobtcount++;
4373 }
4374 if (nobtcount)
4375 {
4376 // report a fake backtrace when we found nothing
4377 zbt[0] = (uintptr_t) &zalloc;
4378 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
4379 }
4380
4381 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
4382
4383 return (KERN_SUCCESS);
4384 }
4385
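/*
 * Debugger (kdp) helper: report whether addr is an element of the zone with
 * the given name, based on zone_element_size()'s zone lookup.
 */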
4386 boolean_t
4387 kdp_is_in_zone(void *addr, const char *zone_name)
4388 {
4389 zone_t z;
4390 return (zone_element_size(addr, &z) && !strcmp(z->zone_name, zone_name));
4391 }
4392
4393 boolean_t
4394 run_zone_test(void)
4395 {
4396 int i = 0, max_iter = 5;
4397 void * test_ptr;
4398 zone_t test_zone;
4399
4400 simple_lock(&zone_test_lock);
4401 if (!zone_test_running) {
4402 zone_test_running = TRUE;
4403 } else {
4404 simple_unlock(&zone_test_lock);
4405 printf("run_zone_test: Test already running.\n");
4406 return FALSE;
4407 }
4408 simple_unlock(&zone_test_lock);
4409
4410 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
4411
4412 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
4413 do {
4414 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
4415 if (test_zone == NULL) {
4416 printf("run_zone_test: zinit() failed\n");
4417 return FALSE;
4418 }
4419
4420 #if KASAN_ZALLOC
4421 if (test_zone_ptr == NULL && zone_free_count(test_zone) != 0) {
4422 #else
4423 if (zone_free_count(test_zone) != 0) {
4424 #endif
4425 printf("run_zone_test: free count is not zero\n");
4426 return FALSE;
4427 }
4428
4429 if (test_zone_ptr == NULL) {
4430 /* Stash the zone pointer returned on the first zinit */
4431 printf("run_zone_test: zone created for the first time\n");
4432 test_zone_ptr = test_zone;
4433 } else if (test_zone != test_zone_ptr) {
4434 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
4435 return FALSE;
4436 }
4437
4438 test_ptr = zalloc(test_zone);
4439 if (test_ptr == NULL) {
4440 printf("run_zone_test: zalloc() failed\n");
4441 return FALSE;
4442 }
4443 zfree(test_zone, test_ptr);
4444
4445 zdestroy(test_zone);
4446 i++;
4447
4448 printf("run_zone_test: Iteration %d successful\n", i);
4449 } while (i < max_iter);
4450
4451 printf("run_zone_test: Test passed\n");
4452
4453 simple_lock(&zone_test_lock);
4454 zone_test_running = FALSE;
4455 simple_unlock(&zone_test_lock);
4456
4457 return TRUE;
4458 }
4459
4460 #endif /* DEBUG || DEVELOPMENT */