[apple/xnu.git] / osfmk / kern / zalloc.c (xnu-4903.270.47)
1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65 #include <zone_debug.h>
66
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach/vm_map.h>
74 #include <mach/sdt.h>
75
76 #include <kern/bits.h>
77 #include <kern/kern_types.h>
78 #include <kern/assert.h>
79 #include <kern/backtrace.h>
80 #include <kern/host.h>
81 #include <kern/macro_help.h>
82 #include <kern/sched.h>
83 #include <kern/locks.h>
84 #include <kern/sched_prim.h>
85 #include <kern/misc_protos.h>
86 #include <kern/thread_call.h>
87 #include <kern/zalloc.h>
88 #include <kern/kalloc.h>
89
90 #include <prng/random.h>
91
92 #include <vm/pmap.h>
93 #include <vm/vm_map.h>
94 #include <vm/vm_kern.h>
95 #include <vm/vm_page.h>
96
97 #include <pexpert/pexpert.h>
98
99 #include <machine/machparam.h>
100 #include <machine/machine_routines.h> /* ml_cpu_get_info */
101
102 #include <libkern/OSDebug.h>
103 #include <libkern/OSAtomic.h>
104 #include <libkern/section_keywords.h>
105 #include <sys/kdebug.h>
106
107 #include <san/kasan.h>
108
109 /*
110 * The zone_locks_grp allows for collecting lock statistics.
111  * All locks are associated with this group in zinit.
112 * Look at tools/lockstat for debugging lock contention.
113 */
114
115 lck_grp_t zone_locks_grp;
116 lck_grp_attr_t zone_locks_grp_attr;
117
118 /*
119 * ZONE_ALIAS_ADDR (deprecated)
120 */
121
122 #define from_zone_map(addr, size) \
123 ((vm_offset_t)(addr) >= zone_map_min_address && \
124 ((vm_offset_t)(addr) + size - 1) < zone_map_max_address )
125
126 /*
127 * Zone Corruption Debugging
128 *
129 * We use three techniques to detect modification of a zone element
130 * after it's been freed.
131 *
132 * (1) Check the freelist next pointer for sanity.
133 * (2) Store a backup of the next pointer at the end of the element,
134 * and compare it to the primary next pointer when the element is allocated
135 * to detect corruption of the freelist due to use-after-free bugs.
136 * The backup pointer is also XORed with a per-boot random cookie.
137 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
138 * and check for that value when the element is being reused to make sure
139 * no part of the element has been modified while it was on the freelist.
140 * This will also help catch read-after-frees, as code will now dereference
141 * 0xdeadbeef instead of a valid but freed pointer.
142 *
143 * (1) and (2) occur for every allocation and free to a zone.
144 * This is done to make it slightly more difficult for an attacker to
145 * manipulate the freelist to behave in a specific way.
146 *
147 * Poisoning (3) occurs periodically for every N frees (counted per-zone)
148 * and on every free for zones smaller than a cacheline. If -zp
149 * is passed as a boot arg, poisoning occurs for every free.
150 *
151 * Performance slowdown is inversely proportional to the frequency of poisoning,
152 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
153 * and higher. You can expect to find a 100% reproducible bug in an average of
154 * N tries, with a standard deviation of about N, but you will want to set
155 * "-zp" to always poison every free if you are attempting to reproduce
156 * a known bug.
157 *
158 * For a more heavyweight, but finer-grained method of detecting misuse
159 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
160 *
161 * Zone Corruption Logging
162 *
163 * You can also track where corruptions come from by using the boot-arguments
164 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
165 * in this document for more implementation and usage information.
166 *
167 * Zone Leak Detection
168 *
169 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
170 * found later in this file via the showtopztrace and showz* macros in kgmacros,
171 * or use zlog without the -zc argument.
172 *
173 */
174
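/*
 * Illustrative sketch (not part of the original source) of how checks (1) and (2)
 * above fit together. When an element whose next free element is `next` is freed,
 * the code later in this file effectively does:
 *
 *      *primary = next ^ zp_nopoison_cookie;                    // obfuscated freelist link
 *      *backup  = next ^ (poison ? zp_poisoned_cookie
 *                                : zp_nopoison_cookie);         // redundant copy at element end
 *
 * On allocation, the element is accepted only if un-xoring the backup with one of
 * the two cookies reproduces the primary's next pointer; any other combination is
 * reported via backup_ptr_mismatch_panic().
 */
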
175 /* Returns TRUE if we rolled over the counter at factor */
176 static inline boolean_t
177 sample_counter(volatile uint32_t * count_p, uint32_t factor)
178 {
179 uint32_t old_count, new_count;
180 boolean_t rolled_over;
181
182 do {
183 new_count = old_count = *count_p;
184
185 if (++new_count >= factor) {
186 rolled_over = TRUE;
187 new_count = 0;
188 } else {
189 rolled_over = FALSE;
190 }
191 } while (!OSCompareAndSwap(old_count, new_count, count_p));
192
193 return rolled_over;
194 }
195
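/*
 * Usage sketch (illustration only): the poisoning path later in this file uses
 * sample_counter() to poison roughly one free in every zp_factor frees, along
 * the lines of:
 *
 *      poison = sample_counter(&zone->zp_count, zp_factor);
 *
 * where zone->zp_count is the per-zone sampling counter used by
 * zfree_poison_element(); the real code also folds zp_scale and the zone's
 * element size into the factor.
 */
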
196 #if defined(__LP64__)
197 #define ZP_POISON 0xdeadbeefdeadbeef
198 #else
199 #define ZP_POISON 0xdeadbeef
200 #endif
201
202 boolean_t zfree_poison_element(zone_t zone, vm_offset_t elem);
203 void zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr);
204
205 #define ZP_DEFAULT_SAMPLING_FACTOR 16
206 #define ZP_DEFAULT_SCALE_FACTOR 4
207
208 /*
209  * A zp_factor of 0 indicates zone poisoning is disabled; however, we still
210  * poison zones smaller than zp_tiny_zone_limit (a cacheline).
211 * Passing the -no-zp boot-arg disables even this behavior.
212 * In all cases, we record and check the integrity of a backup pointer.
213 */
214
215 /* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */
216 #if DEBUG
217 #define DEFAULT_ZP_FACTOR (1)
218 #else
219 #define DEFAULT_ZP_FACTOR (0)
220 #endif
221 uint32_t zp_factor = DEFAULT_ZP_FACTOR;
222
223 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
224 uint32_t zp_scale = 0;
225
226 /* set in zp_init, zero indicates -no-zp boot-arg */
227 vm_size_t zp_tiny_zone_limit = 0;
228
229 /* initialized to a per-boot random value in zp_init */
230 uintptr_t zp_poisoned_cookie = 0;
231 uintptr_t zp_nopoison_cookie = 0;
232
233 #if VM_MAX_TAG_ZONES
234 boolean_t zone_tagging_on;
235 #endif /* VM_MAX_TAG_ZONES */
236
237 SECURITY_READ_ONLY_LATE(boolean_t) copyio_zalloc_check = TRUE;
238 static struct bool_gen zone_bool_gen;
239
240 /*
241 * initialize zone poisoning
242 * called from zone_bootstrap before any allocations are made from zalloc
243 */
244 static inline void
245 zp_init(void)
246 {
247 char temp_buf[16];
248
249 /*
250 * Initialize backup pointer random cookie for poisoned elements
251  * Try not to call early_random() back to back; it may return
252 * the same value if mach_absolute_time doesn't have sufficient time
253 * to tick over between calls. <rdar://problem/11597395>
254 * (This is only a problem on embedded devices)
255 */
256 zp_poisoned_cookie = (uintptr_t) early_random();
257
258 /*
259 * Always poison zones smaller than a cacheline,
260 * because it's pretty close to free
261 */
262 ml_cpu_info_t cpu_info;
263 ml_cpu_get_info(&cpu_info);
264 zp_tiny_zone_limit = (vm_size_t) cpu_info.cache_line_size;
265
266 zp_factor = ZP_DEFAULT_SAMPLING_FACTOR;
267 zp_scale = ZP_DEFAULT_SCALE_FACTOR;
268
269 //TODO: Bigger permutation?
270 /*
271 * Permute the default factor +/- 1 to make it less predictable
272 * This adds or subtracts ~4 poisoned objects per 1000 frees.
273 */
274 if (zp_factor != 0) {
275 uint32_t rand_bits = early_random() & 0x3;
276
277 if (rand_bits == 0x1) {
278 zp_factor += 1;
279 } else if (rand_bits == 0x2) {
280 zp_factor -= 1;
281 }
282 /* if 0x0 or 0x3, leave it alone */
283 }
284
285 /* -zp: enable poisoning for every alloc and free */
286 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
287 zp_factor = 1;
288 }
289
290 /* -no-zp: disable poisoning completely even for tiny zones */
291 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
292 zp_factor = 0;
293 zp_tiny_zone_limit = 0;
294 printf("Zone poisoning disabled\n");
295 }
296
297 /* zp-factor=XXXX: override how often to poison freed zone elements */
298 if (PE_parse_boot_argn("zp-factor", &zp_factor, sizeof(zp_factor))) {
299 printf("Zone poisoning factor override: %u\n", zp_factor);
300 }
301
302 /* zp-scale=XXXX: override how much zone size scales zp-factor by */
303 if (PE_parse_boot_argn("zp-scale", &zp_scale, sizeof(zp_scale))) {
304 printf("Zone poisoning scale factor override: %u\n", zp_scale);
305 }
306
307 /* Initialize backup pointer random cookie for unpoisoned elements */
308 zp_nopoison_cookie = (uintptr_t) early_random();
309
310 #if MACH_ASSERT
311 if (zp_poisoned_cookie == zp_nopoison_cookie) {
312 panic("early_random() is broken: %p and %p are not random\n",
313 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
314 }
315 #endif
316
317 /*
318 * Use the last bit in the backup pointer to hint poisoning state
319 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
320 * the low bits are zero.
321 */
322 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
323 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
324
325 #if defined(__LP64__)
326 /*
327 * Make backup pointers more obvious in GDB for 64 bit
328  * by making 0xFFFFFF... ^ cookie = 0xFACADE...
329 * (0xFACADE = 0xFFFFFF ^ 0x053521)
330 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
331 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
332 * by the sanity check, so it's OK for that part of the cookie to be predictable.
333 *
334 * TODO: Use #defines, xors, and shifts
335 */
336
337 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
338 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
339
340 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
341 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
342 #endif
343 }
344
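/*
 * Worked example of the LP64 cookie tagging above (illustration only, with a
 * made-up element address): zone pointers carry 0xFFFFFF in their top 3 bytes,
 * so for a freed element whose next pointer is 0xFFFFFF80DEADF000 the stored
 * backup pointer reads:
 *
 *      0xFFFFFF80DEADF000 ^ zp_poisoned_cookie  ->  0xFACADE...   (poisoned)
 *      0xFFFFFF80DEADF000 ^ zp_nopoison_cookie  ->  0xC0FFEE...   (not poisoned)
 *
 * because the cookies' top 3 bytes were just forced to 0x053521 and 0x3f0011
 * (0xFFFFFF ^ 0x053521 == 0xFACADE, 0xFFFFFF ^ 0x3f0011 == 0xC0FFEE).
 */
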
345 /*
346 * These macros are used to keep track of the number
347 * of pages being used by the zone currently. The
348 * z->page_count is not protected by the zone lock.
349 */
350 #define ZONE_PAGE_COUNT_INCR(z, count) \
351 { \
352 OSAddAtomic64(count, &(z->page_count)); \
353 }
354
355 #define ZONE_PAGE_COUNT_DECR(z, count) \
356 { \
357 OSAddAtomic64(-count, &(z->page_count)); \
358 }
359
360 vm_map_t zone_map = VM_MAP_NULL;
361
362 /* for is_sane_zone_element and garbage collection */
363
364 vm_offset_t zone_map_min_address = 0; /* initialized in zone_init */
365 vm_offset_t zone_map_max_address = 0;
366
367 /* Globals for random boolean generator for elements in free list */
368 #define MAX_ENTROPY_PER_ZCRAM 4
369
370 /* VM region for all metadata structures */
371 vm_offset_t zone_metadata_region_min = 0;
372 vm_offset_t zone_metadata_region_max = 0;
373 decl_lck_mtx_data(static, zone_metadata_region_lck)
374 lck_attr_t zone_metadata_lock_attr;
375 lck_mtx_ext_t zone_metadata_region_lck_ext;
376
377 /* Helpful for walking through a zone's free element list. */
378 struct zone_free_element {
379 struct zone_free_element *next;
380 /* ... */
381 /* void *backup_ptr; */
382 };
383
384 #if CONFIG_ZCACHE
385
386 #if !CONFIG_GZALLOC
387 bool use_caching = TRUE;
388 #else
389 bool use_caching = FALSE;
390 #endif /* !CONFIG_GZALLOC */
391
392 /*
393 * Decides whether per-cpu zone caching is to be enabled for all zones.
394 * Can be set to TRUE via the boot-arg '-zcache_all'.
395 */
396 bool cache_all_zones = FALSE;
397
398 /*
399 * Specifies a single zone to enable CPU caching for.
400 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
401 */
402 static char cache_zone_name[MAX_ZONE_NAME];
403
404 static inline bool
405 zone_caching_enabled(zone_t z)
406 {
407 return z->cpu_cache_enabled && !z->tags && !z->zleak_on;
408 }
409
410 #endif /* CONFIG_ZCACHE */
411
412 /*
413 * Protects zone_array, num_zones, num_zones_in_use, and zone_empty_bitmap
414 */
415 decl_simple_lock_data(, all_zones_lock)
416 unsigned int num_zones_in_use;
417 unsigned int num_zones;
418
419 #define MAX_ZONES 320
420 struct zone zone_array[MAX_ZONES];
421
422 /* Used to keep track of empty slots in the zone_array */
423 bitmap_t zone_empty_bitmap[BITMAP_LEN(MAX_ZONES)];
424
425 #if DEBUG || DEVELOPMENT
426 /*
427  * Used for the sysctl kern.run_zone_test, which is not thread-safe. Ensure only one thread goes through at a time;
428  * otherwise we can end up with multiple test zones (if a second zinit() comes through before zdestroy()), which could
429  * lead us to run out of zones.
430 */
431 decl_simple_lock_data(, zone_test_lock)
432 static boolean_t zone_test_running = FALSE;
433 static zone_t test_zone_ptr = NULL;
434 #endif /* DEBUG || DEVELOPMENT */
435
436 #define PAGE_METADATA_GET_ZINDEX(page_meta) \
437 (page_meta->zindex)
438
439 #define PAGE_METADATA_GET_ZONE(page_meta) \
440 (&(zone_array[page_meta->zindex]))
441
442 #define PAGE_METADATA_SET_ZINDEX(page_meta, index) \
443 page_meta->zindex = (index);
444
445 struct zone_page_metadata {
446 queue_chain_t pages; /* linkage pointer for metadata lists */
447
448 /* Union for maintaining start of element free list and real metadata (for multipage allocations) */
449 union {
450 /*
451 * The start of the freelist can be maintained as a 32-bit offset instead of a pointer because
452 * the free elements would be at max ZONE_MAX_ALLOC_SIZE bytes away from the metadata. Offset
453 * from start of the allocation chunk to free element list head.
454 */
455 uint32_t freelist_offset;
456 /*
457 * This field is used to lookup the real metadata for multipage allocations, where we mark the
458 * metadata for all pages except the first as "fake" metadata using MULTIPAGE_METADATA_MAGIC.
459 * Offset from this fake metadata to real metadata of allocation chunk (-ve offset).
460 */
461 uint32_t real_metadata_offset;
462 };
463
464 /*
465 * For the first page in the allocation chunk, this represents the total number of free elements in
466 * the chunk.
467 */
468 uint16_t free_count;
469 unsigned zindex : ZINDEX_BITS; /* Zone index within the zone_array */
470 unsigned page_count : PAGECOUNT_BITS; /* Count of pages within the allocation chunk */
471 };
472
473 /* Macro to get page index (within zone_map) of page containing element */
474 #define PAGE_INDEX_FOR_ELEMENT(element) \
475 (((vm_offset_t)trunc_page(element) - zone_map_min_address) / PAGE_SIZE)
476
477 /* Macro to get metadata structure given a page index in zone_map */
478 #define PAGE_METADATA_FOR_PAGE_INDEX(index) \
479 (zone_metadata_region_min + ((index) * sizeof(struct zone_page_metadata)))
480
481 /* Macro to get index (within zone_map) for given metadata */
482 #define PAGE_INDEX_FOR_METADATA(page_meta) \
483 (((vm_offset_t)page_meta - zone_metadata_region_min) / sizeof(struct zone_page_metadata))
484
485 /* Macro to get page for given page index in zone_map */
486 #define PAGE_FOR_PAGE_INDEX(index) \
487 (zone_map_min_address + (PAGE_SIZE * (index)))
488
489 /* Macro to get the actual metadata for a given address */
490 #define PAGE_METADATA_FOR_ELEMENT(element) \
491 (struct zone_page_metadata *)(PAGE_METADATA_FOR_PAGE_INDEX(PAGE_INDEX_FOR_ELEMENT(element)))
492
493 /* Magic value to indicate empty element free list */
494 #define PAGE_METADATA_EMPTY_FREELIST ((uint32_t)(~0))
495
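/*
 * Putting the macros above together (illustrative sketch, not original source):
 * for an element allocated out of the zone_map, its metadata lives at a fixed
 * slot in the metadata region, computed purely from its page index:
 *
 *      idx  = (trunc_page(element) - zone_map_min_address) / PAGE_SIZE;
 *      meta = (struct zone_page_metadata *)
 *                 (zone_metadata_region_min + idx * sizeof(struct zone_page_metadata));
 *
 * Foreign (non-zone_map) elements instead keep their metadata at the start of
 * their own page, which is why get_zone_page_metadata() below special-cases
 * from_zone_map().
 */
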
496 vm_map_copy_t create_vm_map_copy(vm_offset_t start_addr, vm_size_t total_size, vm_size_t used_size);
497 boolean_t get_zone_info(zone_t z, mach_zone_name_t *zn, mach_zone_info_t *zi);
498 boolean_t is_zone_map_nearing_exhaustion(void);
499 extern void vm_pageout_garbage_collect(int collect);
500
501 static inline void *
502 page_metadata_get_freelist(struct zone_page_metadata *page_meta)
503 {
504 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
505 if (page_meta->freelist_offset == PAGE_METADATA_EMPTY_FREELIST) {
506 return NULL;
507 } else {
508 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
509 return (void *)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)) + page_meta->freelist_offset);
510 } else {
511 return (void *)((vm_offset_t)page_meta + page_meta->freelist_offset);
512 }
513 }
514 }
515
516 static inline void
517 page_metadata_set_freelist(struct zone_page_metadata *page_meta, void *addr)
518 {
519 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
520 if (addr == NULL) {
521 page_meta->freelist_offset = PAGE_METADATA_EMPTY_FREELIST;
522 } else {
523 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
524 page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
525 } else {
526 page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - (vm_offset_t)page_meta);
527 }
528 }
529 }
530
531 static inline struct zone_page_metadata *
532 page_metadata_get_realmeta(struct zone_page_metadata *page_meta)
533 {
534 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
535 return (struct zone_page_metadata *)((vm_offset_t)page_meta - page_meta->real_metadata_offset);
536 }
537
538 static inline void
539 page_metadata_set_realmeta(struct zone_page_metadata *page_meta, struct zone_page_metadata *real_meta)
540 {
541 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
542 assert(PAGE_METADATA_GET_ZINDEX(real_meta) != MULTIPAGE_METADATA_MAGIC);
543 assert((vm_offset_t)page_meta > (vm_offset_t)real_meta);
544 vm_offset_t offset = (vm_offset_t)page_meta - (vm_offset_t)real_meta;
545 assert(offset <= UINT32_MAX);
546 page_meta->real_metadata_offset = (uint32_t)offset;
547 }
548
549 /* The backup pointer is stored in the last pointer-sized location in an element. */
550 static inline vm_offset_t *
551 get_backup_ptr(vm_size_t elem_size,
552 vm_offset_t *element)
553 {
554 return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
555 }
556
557 /*
558 * Routine to populate a page backing metadata in the zone_metadata_region.
559 * Must be called without the zone lock held as it might potentially block.
560 */
561 static inline void
562 zone_populate_metadata_page(struct zone_page_metadata *page_meta)
563 {
564 vm_offset_t page_metadata_begin = trunc_page(page_meta);
565 vm_offset_t page_metadata_end = trunc_page((vm_offset_t)page_meta + sizeof(struct zone_page_metadata));
566
567 for (; page_metadata_begin <= page_metadata_end; page_metadata_begin += PAGE_SIZE) {
568 #if !KASAN
569 /*
570 * This can race with another thread doing a populate on the same metadata
571 * page, where we see an updated pmap but unmapped KASan shadow, causing a
572 * fault in the shadow when we first access the metadata page. Avoid this
573 * by always synchronizing on the zone_metadata_region lock with KASan.
574 */
575 if (pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
576 continue;
577 }
578 #endif
579 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
580 lck_mtx_lock(&zone_metadata_region_lck);
581 if (0 == pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
582 kern_return_t __assert_only ret = kernel_memory_populate(zone_map,
583 page_metadata_begin,
584 PAGE_SIZE,
585 KMA_KOBJECT,
586 VM_KERN_MEMORY_OSFMK);
587
588 /* should not fail with the given arguments */
589 assert(ret == KERN_SUCCESS);
590 }
591 lck_mtx_unlock(&zone_metadata_region_lck);
592 }
593 return;
594 }
595
596 static inline uint16_t
597 get_metadata_alloc_count(struct zone_page_metadata *page_meta)
598 {
599 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
600 struct zone *z = PAGE_METADATA_GET_ZONE(page_meta);
601 return (page_meta->page_count * PAGE_SIZE) / z->elem_size;
602 }
603
604 /*
605 * Routine to lookup metadata for any given address.
606 * If init is marked as TRUE, this should be called without holding the zone lock
607 * since the initialization might block.
608 */
609 static inline struct zone_page_metadata *
610 get_zone_page_metadata(struct zone_free_element *element, boolean_t init)
611 {
612 struct zone_page_metadata *page_meta = 0;
613
614 if (from_zone_map(element, sizeof(struct zone_free_element))) {
615 page_meta = (struct zone_page_metadata *)(PAGE_METADATA_FOR_ELEMENT(element));
616 if (init) {
617 zone_populate_metadata_page(page_meta);
618 }
619 } else {
620 page_meta = (struct zone_page_metadata *)(trunc_page((vm_offset_t)element));
621 }
622 if (init) {
623 bzero((char *)page_meta, sizeof(struct zone_page_metadata));
624 }
625 return (PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC) ? page_meta : page_metadata_get_realmeta(page_meta);
626 }
627
628 /* Routine to get the page for a given metadata */
629 static inline vm_offset_t
630 get_zone_page(struct zone_page_metadata *page_meta)
631 {
632 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
633 return (vm_offset_t)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
634 } else {
635 return (vm_offset_t)(trunc_page(page_meta));
636 }
637 }
638
639 /*
640 * ZTAGS
641 */
642
643 #if VM_MAX_TAG_ZONES
644
645 // for zones with tagging enabled:
646
647 // calculate a pointer to the tag base entry,
648 // holding either a uint32_t (the first tag offset for a page in the zone map),
649 // or two uint16_t tags stored inline if the page can only hold one or two elements
650
651 #define ZTAGBASE(zone, element) \
652 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_map_min_address)])
653
654 // pointer to the tag for an element
655 #define ZTAG(zone, element) \
656 ({ \
657 vm_tag_t * result; \
658 if ((zone)->tags_inline) { \
659 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
660 if ((page_mask & element) >= (zone)->elem_size) result++; \
661 } else { \
662 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / (zone)->elem_size]; \
663 } \
664 result; \
665 })
666
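/*
 * Illustration (not part of the original source) of how ZTAG() resolves a tag
 * for an element `elem` of zone `z`:
 *
 *   inline (z->tags_inline):  the per-page uint32_t at ZTAGBASE(z, elem) is
 *       reinterpreted as two vm_tag_t slots; the second slot is used when elem
 *       is the second element on the page.
 *   out of line:              ZTAGBASE(z, elem)[0] is the page's base index into
 *       the zone_tags heap, and the element's slot within the page
 *       (((elem) & page_mask) / z->elem_size) is added to it.
 *
 * Tags are stored shifted left by one so that bit 0 (the ztBlock free bit, which
 * shares this storage) stays clear; see try_alloc_from_zone() and
 * zone_element_info().
 */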
667
668 static vm_offset_t zone_tagbase_min;
669 static vm_offset_t zone_tagbase_max;
670 static vm_offset_t zone_tagbase_map_size;
671 static vm_map_t zone_tagbase_map;
672
673 static vm_offset_t zone_tags_min;
674 static vm_offset_t zone_tags_max;
675 static vm_offset_t zone_tags_map_size;
676 static vm_map_t zone_tags_map;
677
678 // simple heap allocator for allocating the tags for new memory
679
680 decl_lck_mtx_data(, ztLock) /* heap lock */
681 enum{
682 ztFreeIndexCount = 8,
683 ztFreeIndexMax = (ztFreeIndexCount - 1),
684 ztTagsPerBlock = 4
685 };
686
687 struct ztBlock {
688 #if __LITTLE_ENDIAN__
689 uint64_t free:1,
690 next:21,
691 prev:21,
692 size:21;
693 #else
694 // ztBlock needs free bit least significant
695 #error !__LITTLE_ENDIAN__
696 #endif
697 };
698 typedef struct ztBlock ztBlock;
699
700 static ztBlock * ztBlocks;
701 static uint32_t ztBlocksCount;
702 static uint32_t ztBlocksFree;
703
704 static uint32_t
705 ztLog2up(uint32_t size)
706 {
707 if (1 == size) {
708 size = 0;
709 } else {
710 size = 32 - __builtin_clz(size - 1);
711 }
712 return size;
713 }
714
715 static uint32_t
716 ztLog2down(uint32_t size)
717 {
718 size = 31 - __builtin_clz(size);
719 return size;
720 }
721
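/*
 * Quick reference (illustration only):
 *
 *      ztLog2up(1) == 0,   ztLog2up(4) == 2,   ztLog2up(5) == 3
 *      ztLog2down(1) == 0, ztLog2down(4) == 2, ztLog2down(5) == 2
 *
 * i.e. ceiling and floor of log2, used below to pick a free-queue index that is
 * then clamped to ztFreeIndexMax.
 */
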
722 static void
723 ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
724 {
725 vm_map_offset_t addr = (vm_map_offset_t) address;
726 vm_map_offset_t page, end;
727
728 page = trunc_page(addr);
729 end = round_page(addr + size);
730
731 for (; page < end; page += page_size) {
732 if (!pmap_find_phys(kernel_pmap, page)) {
733 kern_return_t __unused
734 ret = kernel_memory_populate(map, page, PAGE_SIZE,
735 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
736 assert(ret == KERN_SUCCESS);
737 }
738 }
739 }
740
741 static boolean_t
742 ztPresent(const void * address, size_t size)
743 {
744 vm_map_offset_t addr = (vm_map_offset_t) address;
745 vm_map_offset_t page, end;
746 boolean_t result;
747
748 page = trunc_page(addr);
749 end = round_page(addr + size);
750 for (result = TRUE; (page < end); page += page_size) {
751 result = pmap_find_phys(kernel_pmap, page);
752 if (!result) {
753 break;
754 }
755 }
756 return result;
757 }
758
759
760 void __unused
761 ztDump(boolean_t sanity);
762 void __unused
763 ztDump(boolean_t sanity)
764 {
765 uint32_t q, cq, p;
766
767 for (q = 0; q <= ztFreeIndexMax; q++) {
768 p = q;
769 do{
770 if (sanity) {
771 cq = ztLog2down(ztBlocks[p].size);
772 if (cq > ztFreeIndexMax) {
773 cq = ztFreeIndexMax;
774 }
775 if (!ztBlocks[p].free
776 || ((p != q) && (q != cq))
777 || (ztBlocks[ztBlocks[p].next].prev != p)
778 || (ztBlocks[ztBlocks[p].prev].next != p)) {
779 kprintf("zterror at %d", p);
780 ztDump(FALSE);
781 kprintf("zterror at %d", p);
782 assert(FALSE);
783 }
784 continue;
785 }
786 kprintf("zt[%03d]%c %d, %d, %d\n",
787 p, ztBlocks[p].free ? 'F' : 'A',
788 ztBlocks[p].next, ztBlocks[p].prev,
789 ztBlocks[p].size);
790 p = ztBlocks[p].next;
791 if (p == q) {
792 break;
793 }
794 }while (p != q);
795 if (!sanity) {
796 printf("\n");
797 }
798 }
799 if (!sanity) {
800 printf("-----------------------\n");
801 }
802 }
803
804
805
806 #define ZTBDEQ(idx) \
807 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
808 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
809
810 static void
811 ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
812 {
813 uint32_t q, w, p, size, merge;
814
815 assert(count);
816 ztBlocksFree += count;
817
818     // merge with the following free block
819 merge = (index + count);
820 if ((merge < ztBlocksCount)
821 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
822 && ztBlocks[merge].free) {
823 ZTBDEQ(merge);
824 count += ztBlocks[merge].size;
825 }
826
827     // merge with the preceding free block
828 merge = (index - 1);
829 if ((merge > ztFreeIndexMax)
830 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
831 && ztBlocks[merge].free) {
832 size = ztBlocks[merge].size;
833 count += size;
834 index -= size;
835 ZTBDEQ(index);
836 }
837
838 q = ztLog2down(count);
839 if (q > ztFreeIndexMax) {
840 q = ztFreeIndexMax;
841 }
842 w = q;
843 // queue in order of size
844 while (TRUE) {
845 p = ztBlocks[w].next;
846 if (p == q) {
847 break;
848 }
849 if (ztBlocks[p].size >= count) {
850 break;
851 }
852 w = p;
853 }
854 ztBlocks[p].prev = index;
855 ztBlocks[w].next = index;
856
857 // fault in first
858 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
859
860 // mark first & last with free flag and size
861 ztBlocks[index].free = TRUE;
862 ztBlocks[index].size = count;
863 ztBlocks[index].prev = w;
864 ztBlocks[index].next = p;
865 if (count > 1) {
866 index += (count - 1);
867 // fault in last
868 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
869 ztBlocks[index].free = TRUE;
870 ztBlocks[index].size = count;
871 }
872 }
873
874 static uint32_t
875 ztAlloc(zone_t zone, uint32_t count)
876 {
877 uint32_t q, w, p, leftover;
878
879 assert(count);
880
881 q = ztLog2up(count);
882 if (q > ztFreeIndexMax) {
883 q = ztFreeIndexMax;
884 }
885 do{
886 w = q;
887 while (TRUE) {
888 p = ztBlocks[w].next;
889 if (p == q) {
890 break;
891 }
892 if (ztBlocks[p].size >= count) {
893 // dequeue, mark both ends allocated
894 ztBlocks[w].next = ztBlocks[p].next;
895 ztBlocks[ztBlocks[p].next].prev = w;
896 ztBlocks[p].free = FALSE;
897 ztBlocksFree -= ztBlocks[p].size;
898 if (ztBlocks[p].size > 1) {
899 ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
900 }
901
902 // fault all the allocation
903 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
904 // mark last as allocated
905 if (count > 1) {
906 ztBlocks[p + count - 1].free = FALSE;
907 }
908 // free remainder
909 leftover = ztBlocks[p].size - count;
910 if (leftover) {
911 ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
912 }
913
914 return p;
915 }
916 w = p;
917 }
918 q++;
919 }while (q <= ztFreeIndexMax);
920
921 return -1U;
922 }
923
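/*
 * Design note and usage sketch (not part of the original source): ztAlloc() and
 * ztFree() implement a small segregated-fit heap over the ztBlocks array. Free
 * runs live on ztFreeIndexCount circular queues indexed by log2(size) clamped to
 * ztFreeIndexMax, kept sorted by size, and both the first and last entry of a
 * run carry the free bit and size so ztFree() can coalesce with either neighbor.
 * ztMemoryAdd()/ztMemoryRemove() below use the heap roughly as:
 *
 *      blocks = (count + ztTagsPerBlock - 1) / ztTagsPerBlock;
 *      block  = ztAlloc(zone, blocks);         // -1U on failure
 *      ...
 *      ztFree(NULL, block, blocks);
 */
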
924 static void
925 ztInit(vm_size_t max_zonemap_size, lck_grp_t * group)
926 {
927 kern_return_t ret;
928 vm_map_kernel_flags_t vmk_flags;
929 uint32_t idx;
930
931 lck_mtx_init(&ztLock, group, LCK_ATTR_NULL);
932
933 // allocate submaps VM_KERN_MEMORY_DIAG
934
935 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
936 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
937 vmk_flags.vmkf_permanent = TRUE;
938 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
939 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
940 &zone_tagbase_map);
941
942 if (ret != KERN_SUCCESS) {
943 panic("zone_init: kmem_suballoc failed");
944 }
945 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
946
947 zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
948 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
949 vmk_flags.vmkf_permanent = TRUE;
950 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
951 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
952 &zone_tags_map);
953
954 if (ret != KERN_SUCCESS) {
955 panic("zone_init: kmem_suballoc failed");
956 }
957 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
958
959 ztBlocks = (ztBlock *) zone_tags_min;
960 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
961
962 // initialize the qheads
963 lck_mtx_lock(&ztLock);
964
965 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
966 for (idx = 0; idx < ztFreeIndexCount; idx++) {
967 ztBlocks[idx].free = TRUE;
968 ztBlocks[idx].next = idx;
969 ztBlocks[idx].prev = idx;
970 ztBlocks[idx].size = 0;
971 }
972 // free remaining space
973 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
974
975 lck_mtx_unlock(&ztLock);
976 }
977
978 static void
979 ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
980 {
981 uint32_t * tagbase;
982 uint32_t count, block, blocks, idx;
983 size_t pages;
984
985 pages = atop(size);
986 tagbase = ZTAGBASE(zone, mem);
987
988 lck_mtx_lock(&ztLock);
989
990 // fault tagbase
991 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
992
993 if (!zone->tags_inline) {
994 // allocate tags
995 count = (uint32_t)(size / zone->elem_size);
996 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
997 block = ztAlloc(zone, blocks);
998 if (-1U == block) {
999 ztDump(false);
1000 }
1001 assert(-1U != block);
1002 }
1003
1004 lck_mtx_unlock(&ztLock);
1005
1006 if (!zone->tags_inline) {
1007 // set tag base for each page
1008 block *= ztTagsPerBlock;
1009 for (idx = 0; idx < pages; idx++) {
1010 tagbase[idx] = block + (uint32_t)((ptoa(idx) + (zone->elem_size - 1)) / zone->elem_size);
1011 }
1012 }
1013 }
1014
1015 static void
1016 ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
1017 {
1018 uint32_t * tagbase;
1019 uint32_t count, block, blocks, idx;
1020 size_t pages;
1021
1022 // set tag base for each page
1023 pages = atop(size);
1024 tagbase = ZTAGBASE(zone, mem);
1025 block = tagbase[0];
1026 for (idx = 0; idx < pages; idx++) {
1027 tagbase[idx] = 0xFFFFFFFF;
1028 }
1029
1030 lck_mtx_lock(&ztLock);
1031 if (!zone->tags_inline) {
1032 count = (uint32_t)(size / zone->elem_size);
1033 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1034 assert(block != 0xFFFFFFFF);
1035 block /= ztTagsPerBlock;
1036 ztFree(NULL /* zone is unlocked */, block, blocks);
1037 }
1038
1039 lck_mtx_unlock(&ztLock);
1040 }
1041
1042 uint32_t
1043 zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
1044 {
1045 zone_t z;
1046 uint32_t idx;
1047
1048 simple_lock(&all_zones_lock, &zone_locks_grp);
1049
1050 for (idx = 0; idx < num_zones; idx++) {
1051 z = &(zone_array[idx]);
1052 if (!z->tags) {
1053 continue;
1054 }
1055 if (tag_zone_index != z->tag_zone_index) {
1056 continue;
1057 }
1058 *elem_size = z->elem_size;
1059 break;
1060 }
1061
1062 simple_unlock(&all_zones_lock);
1063
1064 if (idx == num_zones) {
1065 idx = -1U;
1066 }
1067
1068 return idx;
1069 }
1070
1071 #endif /* VM_MAX_TAG_ZONES */
1072
1073 /* Routine to get the size of a zone-allocated address. If the address doesn't belong to the
1074  * zone_map (and, with CONFIG_GZALLOC, is not a guard-mode allocation), returns 0.
1075 */
1076 vm_size_t
1077 zone_element_size(void *addr, zone_t *z)
1078 {
1079 struct zone *src_zone;
1080 if (from_zone_map(addr, sizeof(void *))) {
1081 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
1082 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
1083 if (z) {
1084 *z = src_zone;
1085 }
1086 return src_zone->elem_size;
1087 } else {
1088 #if CONFIG_GZALLOC
1089 vm_size_t gzsize;
1090 if (gzalloc_element_size(addr, z, &gzsize)) {
1091 return gzsize;
1092 }
1093 #endif /* CONFIG_GZALLOC */
1094
1095 return 0;
1096 }
1097 }
1098
1099 #if DEBUG || DEVELOPMENT
1100
1101 vm_size_t
1102 zone_element_info(void *addr, vm_tag_t * ptag)
1103 {
1104 vm_size_t size = 0;
1105 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1106 struct zone * src_zone;
1107
1108 if (from_zone_map(addr, sizeof(void *))) {
1109 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
1110 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
1111 #if VM_MAX_TAG_ZONES
1112 if (__improbable(src_zone->tags)) {
1113 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1114 }
1115 #endif /* VM_MAX_TAG_ZONES */
1116 size = src_zone->elem_size;
1117 } else {
1118 #if CONFIG_GZALLOC
1119 gzalloc_element_size(addr, NULL, &size);
1120 #endif /* CONFIG_GZALLOC */
1121 }
1122 *ptag = tag;
1123 return size;
1124 }
1125
1126 #endif /* DEBUG || DEVELOPMENT */
1127
1128 /*
1129 * Zone checking helper function.
1130 * A pointer that satisfies these conditions is OK to be a freelist next pointer
1131 * A pointer that doesn't satisfy these conditions indicates corruption
1132 */
1133 static inline boolean_t
1134 is_sane_zone_ptr(zone_t zone,
1135 vm_offset_t addr,
1136 size_t obj_size)
1137 {
1138 /* Must be aligned to pointer boundary */
1139 if (__improbable((addr & (sizeof(vm_offset_t) - 1)) != 0)) {
1140 return FALSE;
1141 }
1142
1143 /* Must be a kernel address */
1144 if (__improbable(!pmap_kernel_va(addr))) {
1145 return FALSE;
1146 }
1147
1148 /* Must be from zone map if the zone only uses memory from the zone_map */
1149 /*
1150 * TODO: Remove the zone->collectable check when every
1151 * zone using foreign memory is properly tagged with allows_foreign
1152 */
1153 if (zone->collectable && !zone->allows_foreign) {
1154 /* check if addr is from zone map */
1155 if (addr >= zone_map_min_address &&
1156 (addr + obj_size - 1) < zone_map_max_address) {
1157 return TRUE;
1158 }
1159
1160 return FALSE;
1161 }
1162
1163 return TRUE;
1164 }
1165
1166 static inline boolean_t
1167 is_sane_zone_page_metadata(zone_t zone,
1168 vm_offset_t page_meta)
1169 {
1170 /* NULL page metadata structures are invalid */
1171 if (page_meta == 0) {
1172 return FALSE;
1173 }
1174 return is_sane_zone_ptr(zone, page_meta, sizeof(struct zone_page_metadata));
1175 }
1176
1177 static inline boolean_t
1178 is_sane_zone_element(zone_t zone,
1179 vm_offset_t addr)
1180 {
1181 /* NULL is OK because it indicates the tail of the list */
1182 if (addr == 0) {
1183 return TRUE;
1184 }
1185 return is_sane_zone_ptr(zone, addr, zone->elem_size);
1186 }
1187
1188 /* Someone wrote to freed memory. */
1189 static inline void
1190 /* noreturn */
1191 zone_element_was_modified_panic(zone_t zone,
1192 vm_offset_t element,
1193 vm_offset_t found,
1194 vm_offset_t expected,
1195 vm_offset_t offset)
1196 {
1197 panic("a freed zone element has been modified in zone %s: expected %p but found %p, bits changed %p, at offset %d of %d in element %p, cookies %p %p",
1198 zone->zone_name,
1199 (void *) expected,
1200 (void *) found,
1201 (void *) (expected ^ found),
1202 (uint32_t) offset,
1203 (uint32_t) zone->elem_size,
1204 (void *) element,
1205 (void *) zp_nopoison_cookie,
1206 (void *) zp_poisoned_cookie);
1207 }
1208
1209 /*
1210 * The primary and backup pointers don't match.
1211 * Determine which one was likely the corrupted pointer, find out what it
1212 * probably should have been, and panic.
1213 * I would like to mark this as noreturn, but panic() isn't marked noreturn.
1214 */
1215 static void
1216 /* noreturn */
1217 backup_ptr_mismatch_panic(zone_t zone,
1218 vm_offset_t element,
1219 vm_offset_t primary,
1220 vm_offset_t backup)
1221 {
1222 vm_offset_t likely_backup;
1223 vm_offset_t likely_primary;
1224
1225 likely_primary = primary ^ zp_nopoison_cookie;
1226 boolean_t sane_backup;
1227 boolean_t sane_primary = is_sane_zone_element(zone, likely_primary);
1228 boolean_t element_was_poisoned = (backup & 0x1) ? TRUE : FALSE;
1229
1230 #if defined(__LP64__)
1231 /* We can inspect the tag in the upper bits for additional confirmation */
1232 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
1233 element_was_poisoned = TRUE;
1234 } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
1235 element_was_poisoned = FALSE;
1236 }
1237 #endif
1238
1239 if (element_was_poisoned) {
1240 likely_backup = backup ^ zp_poisoned_cookie;
1241 sane_backup = is_sane_zone_element(zone, likely_backup);
1242 } else {
1243 likely_backup = backup ^ zp_nopoison_cookie;
1244 sane_backup = is_sane_zone_element(zone, likely_backup);
1245 }
1246
1247 /* The primary is definitely the corrupted one */
1248 if (!sane_primary && sane_backup) {
1249 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1250 }
1251
1252 /* The backup is definitely the corrupted one */
1253 if (sane_primary && !sane_backup) {
1254 zone_element_was_modified_panic(zone, element, backup,
1255 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
1256 zone->elem_size - sizeof(vm_offset_t));
1257 }
1258
1259 /*
1260 * Not sure which is the corrupted one.
1261 * It's less likely that the backup pointer was overwritten with
1262 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1263 * primary pointer has been overwritten with a sane but incorrect address.
1264 */
1265 if (sane_primary && sane_backup) {
1266 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1267 }
1268
1269 /* Neither are sane, so just guess. */
1270 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1271 }
1272
1273 /*
1274 * Adds the element to the head of the zone's free list
1275 * Keeps a backup next-pointer at the end of the element
1276 */
1277 static inline void
1278 free_to_zone(zone_t zone,
1279 vm_offset_t element,
1280 boolean_t poison)
1281 {
1282 vm_offset_t old_head;
1283 struct zone_page_metadata *page_meta;
1284
1285 vm_offset_t *primary = (vm_offset_t *) element;
1286 vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary);
1287
1288 page_meta = get_zone_page_metadata((struct zone_free_element *)element, FALSE);
1289 assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
1290 old_head = (vm_offset_t)page_metadata_get_freelist(page_meta);
1291
1292 if (__improbable(!is_sane_zone_element(zone, old_head))) {
1293 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
1294 (void *) old_head, zone->zone_name);
1295 }
1296
1297 if (__improbable(!is_sane_zone_element(zone, element))) {
1298 panic("zfree: freeing invalid pointer %p to zone %s\n",
1299 (void *) element, zone->zone_name);
1300 }
1301
1302 if (__improbable(old_head == element)) {
1303 panic("zfree: double free of %p to zone %s\n",
1304 (void *) element, zone->zone_name);
1305 }
1306 /*
1307 * Always write a redundant next pointer
1308 * So that it is more difficult to forge, xor it with a random cookie
1309 * A poisoned element is indicated by using zp_poisoned_cookie
1310 * instead of zp_nopoison_cookie
1311 */
1312
1313 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
1314
1315 /*
1316 * Insert this element at the head of the free list. We also xor the
1317 * primary pointer with the zp_nopoison_cookie to make sure a free
1318 * element does not provide the location of the next free element directly.
1319 */
1320 *primary = old_head ^ zp_nopoison_cookie;
1321 page_metadata_set_freelist(page_meta, (struct zone_free_element *)element);
1322 page_meta->free_count++;
1323 if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
1324 if (page_meta->free_count == 1) {
1325 /* first foreign element freed on page, move from all_used */
1326 re_queue_tail(&zone->pages.any_free_foreign, &(page_meta->pages));
1327 } else {
1328 /* no other list transitions */
1329 }
1330 } else if (page_meta->free_count == get_metadata_alloc_count(page_meta)) {
1331                 /* whether the page was on the intermediate or all_used queue, move it to all_free */
1332 re_queue_tail(&zone->pages.all_free, &(page_meta->pages));
1333 zone->count_all_free_pages += page_meta->page_count;
1334 } else if (page_meta->free_count == 1) {
1335 /* first free element on page, move from all_used */
1336 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
1337 }
1338 zone->count--;
1339 zone->countfree++;
1340
1341 #if KASAN_ZALLOC
1342 kasan_poison_range(element, zone->elem_size, ASAN_HEAP_FREED);
1343 #endif
1344 }
1345
1346
1347 /*
1348 * Removes an element from the zone's free list, returning 0 if the free list is empty.
1349 * Verifies that the next-pointer and backup next-pointer are intact,
1350 * and verifies that a poisoned element hasn't been modified.
1351 */
1352 static inline vm_offset_t
1353 try_alloc_from_zone(zone_t zone,
1354 vm_tag_t tag __unused,
1355 boolean_t* check_poison)
1356 {
1357 vm_offset_t element;
1358 struct zone_page_metadata *page_meta;
1359
1360 *check_poison = FALSE;
1361
1362 /* if zone is empty, bail */
1363 if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign)) {
1364 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
1365 } else if (!queue_empty(&zone->pages.intermediate)) {
1366 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
1367 } else if (!queue_empty(&zone->pages.all_free)) {
1368 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
1369 assert(zone->count_all_free_pages >= page_meta->page_count);
1370 zone->count_all_free_pages -= page_meta->page_count;
1371 } else {
1372 return 0;
1373 }
1374 /* Check if page_meta passes is_sane_zone_element */
1375 if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta))) {
1376 panic("zalloc: invalid metadata structure %p for freelist of zone %s\n",
1377 (void *) page_meta, zone->zone_name);
1378 }
1379 assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
1380 element = (vm_offset_t)page_metadata_get_freelist(page_meta);
1381
1382 if (__improbable(!is_sane_zone_ptr(zone, element, zone->elem_size))) {
1383 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
1384 (void *) element, zone->zone_name);
1385 }
1386
1387 vm_offset_t *primary = (vm_offset_t *) element;
1388 vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary);
1389
1390 /*
1391 * Since the primary next pointer is xor'ed with zp_nopoison_cookie
1392 * for obfuscation, retrieve the original value back
1393 */
1394 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
1395 vm_offset_t next_element_primary = *primary;
1396 vm_offset_t next_element_backup = *backup;
1397
1398 /*
1399 * backup_ptr_mismatch_panic will determine what next_element
1400 * should have been, and print it appropriately
1401 */
1402 if (__improbable(!is_sane_zone_element(zone, next_element))) {
1403 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
1404 }
1405
1406 /* Check the backup pointer for the regular cookie */
1407 if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) {
1408 /* Check for the poisoned cookie instead */
1409 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
1410 /* Neither cookie is valid, corruption has occurred */
1411 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
1412 }
1413
1414 /*
1415 * Element was marked as poisoned, so check its integrity before using it.
1416 */
1417 *check_poison = TRUE;
1418 }
1419
1420 /* Make sure the page_meta is at the correct offset from the start of page */
1421 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element, FALSE))) {
1422 panic("zalloc: Incorrect metadata %p found in zone %s page queue. Expected metadata: %p\n",
1423 page_meta, zone->zone_name, get_zone_page_metadata((struct zone_free_element *)element, FALSE));
1424 }
1425
1426 /* Make sure next_element belongs to the same page as page_meta */
1427 if (next_element) {
1428 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element, FALSE))) {
1429 panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n",
1430 (void *)next_element, (void *)element, zone->zone_name);
1431 }
1432 }
1433
1434 /* Remove this element from the free list */
1435 page_metadata_set_freelist(page_meta, (struct zone_free_element *)next_element);
1436 page_meta->free_count--;
1437
1438 if (page_meta->free_count == 0) {
1439 /* move to all used */
1440 re_queue_tail(&zone->pages.all_used, &(page_meta->pages));
1441 } else {
1442 if (!zone->allows_foreign || from_zone_map(element, zone->elem_size)) {
1443 if (get_metadata_alloc_count(page_meta) == page_meta->free_count + 1) {
1444 /* remove from free, move to intermediate */
1445 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
1446 }
1447 }
1448 }
1449 zone->countfree--;
1450 zone->count++;
1451 zone->sum_count++;
1452
1453 #if VM_MAX_TAG_ZONES
1454 if (__improbable(zone->tags)) {
1455 // set the tag with b0 clear so the block remains inuse
1456 ZTAG(zone, element)[0] = (tag << 1);
1457 }
1458 #endif /* VM_MAX_TAG_ZONES */
1459
1460
1461 #if KASAN_ZALLOC
1462 kasan_poison_range(element, zone->elem_size, ASAN_VALID);
1463 #endif
1464
1465 return element;
1466 }
1467
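/*
 * Summary of the per-zone page queue transitions implemented by free_to_zone()
 * and try_alloc_from_zone() above (illustration, not original source):
 *
 *      all_used      --(first element freed)-->      intermediate
 *      intermediate  --(last element freed)-->       all_free
 *      all_free      --(an element allocated)-->     intermediate or all_used
 *      intermediate  --(last free element taken)-->  all_used
 *
 * Foreign pages (zone->allows_foreign, backed by memory outside the zone_map)
 * move only between all_used and any_free_foreign.
 */
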
1468 /*
1469 * End of zone poisoning
1470 */
1471
1472 /*
1473 * Zone info options
1474 */
1475 #define ZINFO_SLOTS MAX_ZONES /* for now */
1476
1477 zone_t zone_find_largest(void);
1478
1479 /*
1480 * Async allocation of zones
1481 * This mechanism allows for bootstrapping an empty zone which is setup with
1482 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
1483 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
1484 * This will prime the zone for the next use.
1485 *
1486  * Currently the thread_call callout function (zalloc_async) will loop through all zones
1487 * looking for any zone with async_pending set and do the work for it.
1488 *
1489 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
1490 * then zalloc_noblock to an empty zone may succeed.
1491 */
1492 void zalloc_async(
1493 thread_call_param_t p0,
1494 thread_call_param_t p1);
1495
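/*
 * Minimal sketch of the priming step described above (illustration only, using
 * the public zalloc()/zfree() interfaces; the real zalloc_async() later in this
 * file walks all zones looking for async_pending and does the equivalent):
 *
 *      void *tmp = zalloc(z);          // may block; grows the zone
 *      if (tmp != NULL) {
 *              zfree(z, tmp);          // return it immediately, leaving the zone primed
 *      }
 */
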
1496 static thread_call_data_t call_async_alloc;
1497
1498 /*
1499 * Align elements that use the zone page list to 32 byte boundaries.
1500 */
1501 #define ZONE_ELEMENT_ALIGNMENT 32
1502
1503 #define zone_wakeup(zone) thread_wakeup((event_t)(zone))
1504 #define zone_sleep(zone) \
1505 (void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN_ALWAYS, (event_t)(zone), THREAD_UNINT);
1506
1507
1508 #define lock_zone_init(zone) \
1509 MACRO_BEGIN \
1510 lck_attr_setdefault(&(zone)->lock_attr); \
1511 lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext, \
1512 &zone_locks_grp, &(zone)->lock_attr); \
1513 MACRO_END
1514
1515 #define lock_try_zone(zone) lck_mtx_try_lock_spin(&zone->lock)
1516
1517 /*
1518 * Exclude more than one concurrent garbage collection
1519 */
1520 decl_lck_mtx_data(, zone_gc_lock)
1521
1522 lck_attr_t zone_gc_lck_attr;
1523 lck_grp_t zone_gc_lck_grp;
1524 lck_grp_attr_t zone_gc_lck_grp_attr;
1525 lck_mtx_ext_t zone_gc_lck_ext;
1526
1527 boolean_t zone_gc_allowed = TRUE;
1528 boolean_t panic_include_zprint = FALSE;
1529
1530 mach_memory_info_t *panic_kext_memory_info = NULL;
1531 vm_size_t panic_kext_memory_size = 0;
1532
1533 #define ZALLOC_DEBUG_ZONEGC 0x00000001
1534 #define ZALLOC_DEBUG_ZCRAM 0x00000002
1535 uint32_t zalloc_debug = 0;
1536
1537 /*
1538 * Zone leak debugging code
1539 *
1540 * When enabled, this code keeps a log to track allocations to a particular zone that have not
1541 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
1542 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
1543 * off by default.
1544 *
1545 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
1546 * is the name of the zone you wish to log.
1547 *
1548 * This code only tracks one zone, so you need to identify which one is leaking first.
1549 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
1550 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
1551 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
1552 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
1553 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
1554 * See the help in the kgmacros for usage info.
1555 *
1556 *
1557 * Zone corruption logging
1558 *
1559 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
1560 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
1561 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
1562 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
1563 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
1564 * corrupted to examine its history. This should lead to the source of the corruption.
1565 */
1566
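/*
 * Example boot-args (illustration only; "kalloc.48" is just a sample zone name):
 *
 *      zlog=kalloc.48                   leak-style logging for one zone
 *      zlog=kalloc.48 -zc               corruption-style logging (allocs and frees)
 *      zlog=kalloc.48 -zc zrecs=8000    same, with a larger log (bounded by ZRECORDS_MAX)
 *
 * A '.' in the zlog value matches a ' ' in the zone name (see track_this_zone()
 * below), since spaces cannot be passed in boot-args.
 */
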
1567 static boolean_t log_records_init = FALSE;
1568 static int log_records; /* size of the log, expressed in number of records */
1569
1570 #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
1571
1572 static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
1573 static int num_zones_logged = 0;
1574
1575 static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging, if any */
1576
1577 /* Log allocations and frees to help debug a zone element corruption */
1578 boolean_t corruption_debug_flag = DEBUG; /* enabled by "-zc" boot-arg */
1579 /* Making pointer scanning leaks detection possible for all zones */
1580
1581 #if DEBUG || DEVELOPMENT
1582 boolean_t leak_scan_debug_flag = FALSE; /* enabled by "-zl" boot-arg */
1583 #endif /* DEBUG || DEVELOPMENT */
1584
1585
1586 /*
1587 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
1588 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
1589 * is the number of stacks suspected of leaking, we don't need many records.
1590 */
1591
1592 #if defined(__LP64__)
1593 #define ZRECORDS_MAX 2560 /* Max records allowed in the log */
1594 #else
1595 #define ZRECORDS_MAX 1536 /* Max records allowed in the log */
1596 #endif
1597 #define ZRECORDS_DEFAULT 1024            /* default records in log if zrecs is not specified in boot-args */
1598
1599 /*
1600 * Each record in the log contains a pointer to the zone element it refers to,
1601 * and a small array to hold the pc's from the stack trace. A
1602 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
1603 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
1604 * If the log fills, old records are replaced as if it were a circular buffer.
1605 */
1606
1607
1608 /*
1609 * Decide if we want to log this zone by doing a string compare between a zone name and the name
1610 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
1611 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
1612 * match a space in the zone name.
1613 */
1614
1615 int
1616 track_this_zone(const char *zonename, const char *logname)
1617 {
1618 unsigned int len;
1619 const char *zc = zonename;
1620 const char *lc = logname;
1621
1622 /*
1623 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1624 */
1625
1626 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1627 /*
1628                  * If the current characters don't match, check for a space
1629                  * in the zone name and a corresponding period in the log name.
1630 * If that's not there, then the strings don't match.
1631 */
1632
1633 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
1634 break;
1635 }
1636
1637 /*
1638 * The strings are equal so far. If we're at the end, then it's a match.
1639 */
1640
1641 if (*zc == '\0') {
1642 return TRUE;
1643 }
1644 }
1645
1646 return FALSE;
1647 }
1648
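/*
 * Example (illustration only; "vm objects" is just a sample zone name). Because
 * a '.' in the boot-arg stands in for a ' ' in the zone name:
 *
 *      track_this_zone("vm objects", "vm.objects")              -> TRUE
 *      track_this_zone("vm objects", "vm objects")              -> TRUE
 *      track_this_zone("vm object hash entries", "vm.objects")  -> FALSE
 */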
1649
1650 /*
1651 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
1652 * the buffer for the records has been allocated.
1653 */
1654
1655 #define DO_LOGGING(z) (z->zone_logging == TRUE && z->zlog_btlog)
1656
1657 extern boolean_t kmem_alloc_ready;
1658
1659 #if CONFIG_ZLEAKS
1660 #pragma mark -
1661 #pragma mark Zone Leak Detection
1662
1663 /*
1664 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
1665 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
1666 * backtrace. Every free, we examine the table and determine if the allocation was being tracked,
1667 * and stop tracking it if it was being tracked.
1668 *
1669 * We track the allocations in the zallocations hash table, which stores the address that was returned from
1670 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
1671 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
1672 * backtraces - we don't store them more than once.
1673 *
1674 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
1675 * a large amount of virtual space.
1676 */
1677 #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
1678 #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
1679 #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
1680 #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
1681 uint32_t zleak_state = 0; /* State of collection, as above */
1682
1683 boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
1684 vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
1685 vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
1686 unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
1687
1688 /*
1689 * Counters for allocation statistics.
1690 */
1691
1692 /* Times two active records want to occupy the same spot */
1693 unsigned int z_alloc_collisions = 0;
1694 unsigned int z_trace_collisions = 0;
1695
1696 /* Times a new record lands on a spot previously occupied by a freed allocation */
1697 unsigned int z_alloc_overwrites = 0;
1698 unsigned int z_trace_overwrites = 0;
1699
1700 /* Times a new alloc or trace is put into the hash table */
1701 unsigned int z_alloc_recorded = 0;
1702 unsigned int z_trace_recorded = 0;
1703
1704 /* Times zleak_log returned false due to not being able to acquire the lock */
1705 unsigned int z_total_conflicts = 0;
1706
1707
1708 #pragma mark struct zallocation
1709 /*
1710 * Structure for keeping track of an allocation
1711 * An allocation bucket is in use if its element is not NULL
1712 */
1713 struct zallocation {
1714 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
1715 vm_size_t za_size; /* how much memory did this allocation take up? */
1716 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
1717 /* TODO: #if this out */
1718 uint32_t za_hit_count; /* for determining effectiveness of hash function */
1719 };
1720
1721 /* Size must be a power of two so the hash functions can just mask off bits instead of taking a modulus */
1722 uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
1723 uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
1724
1725 vm_size_t zleak_max_zonemap_size;
1726
1727 /* Hashmaps of allocations and their corresponding traces */
1728 static struct zallocation* zallocations;
1729 static struct ztrace* ztraces;
1730
1731 /* not static so that panic can see this, see kern/debug.c */
1732 struct ztrace* top_ztrace;
1733
1734 /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
1735 static lck_spin_t zleak_lock;
1736 static lck_attr_t zleak_lock_attr;
1737 static lck_grp_t zleak_lock_grp;
1738 static lck_grp_attr_t zleak_lock_grp_attr;
1739
1740 /*
1741 * Initializes the zone leak monitor. Called from zone_init()
1742 */
1743 static void
1744 zleak_init(vm_size_t max_zonemap_size)
1745 {
1746 char scratch_buf[16];
1747 boolean_t zleak_enable_flag = FALSE;
1748
1749 zleak_max_zonemap_size = max_zonemap_size;
1750 zleak_global_tracking_threshold = max_zonemap_size / 2;
1751 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
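	/*
	 * Worked example (numbers illustrative only): with a 512 MB zone map,
	 * collection is armed once the map is 256 MB full, and an individual
	 * zone is only sampled once it occupies at least 32 MB of virtual space.
	 */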
1752
1753 #if CONFIG_EMBEDDED
1754 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
1755 zleak_enable_flag = TRUE;
1756 printf("zone leak detection enabled\n");
1757 } else {
1758 zleak_enable_flag = FALSE;
1759 printf("zone leak detection disabled\n");
1760 }
1761 #else /* CONFIG_EMBEDDED */
1762 /* -zleakoff (flag to disable zone leak monitor) */
1763 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
1764 zleak_enable_flag = FALSE;
1765 printf("zone leak detection disabled\n");
1766 } else {
1767 zleak_enable_flag = TRUE;
1768 printf("zone leak detection enabled\n");
1769 }
1770 #endif /* CONFIG_EMBEDDED */
1771
1772 /* zfactor=XXXX (override how often to sample the zone allocator) */
1773 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
1774 printf("Zone leak factor override: %u\n", zleak_sample_factor);
1775 }
1776
1777 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
1778 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
1779 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
1780 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1781 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
1782 printf("Override isn't a power of two, bad things might happen!\n");
1783 }
1784 }
1785
1786 /* zleak-traces=XXXX (override number of buckets in ztraces) */
1787 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
1788 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
1789 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1790 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
1791 printf("Override isn't a power of two, bad things might happen!\n");
1792 }
1793 }
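	/*
	 * Illustrative boot-args (values are examples only, not recommendations):
	 *
	 *   zfactor=500 zleak-allocs=32768 zleak-traces=16384
	 *
	 * The bucket counts must remain powers of two so the hash functions
	 * below can mask the hash value instead of taking a modulus.
	 */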
1794
1795 /* allocate the zleak_lock */
1796 lck_grp_attr_setdefault(&zleak_lock_grp_attr);
1797 lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr);
1798 lck_attr_setdefault(&zleak_lock_attr);
1799 lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr);
1800
1801 if (zleak_enable_flag) {
1802 zleak_state = ZLEAK_STATE_ENABLED;
1803 }
1804 }
1805
1806 #if CONFIG_ZLEAKS
1807
1808 /*
1809 * Support for kern.zleak.active sysctl - a simplified
1810 * version of the zleak_state variable.
1811 */
1812 int
1813 get_zleak_state(void)
1814 {
1815 if (zleak_state & ZLEAK_STATE_FAILED) {
1816 return -1;
1817 }
1818 if (zleak_state & ZLEAK_STATE_ACTIVE) {
1819 return 1;
1820 }
1821 return 0;
1822 }
1823
1824 #endif
1825
1826
1827 kern_return_t
1828 zleak_activate(void)
1829 {
1830 kern_return_t retval;
1831 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
1832 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
1833 void *allocations_ptr = NULL;
1834 void *traces_ptr = NULL;
1835
1836 /* Only one thread attempts to activate at a time */
1837 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1838 return KERN_SUCCESS;
1839 }
1840
1841 /* Indicate that we're doing the setup */
1842 lck_spin_lock(&zleak_lock);
1843 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1844 lck_spin_unlock(&zleak_lock);
1845 return KERN_SUCCESS;
1846 }
1847
1848 zleak_state |= ZLEAK_STATE_ACTIVATING;
1849 lck_spin_unlock(&zleak_lock);
1850
1851 /* Allocate and zero tables */
1852 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
1853 if (retval != KERN_SUCCESS) {
1854 goto fail;
1855 }
1856
1857 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
1858 if (retval != KERN_SUCCESS) {
1859 goto fail;
1860 }
1861
1862 bzero(allocations_ptr, z_alloc_size);
1863 bzero(traces_ptr, z_trace_size);
1864
1865 /* Everything's set. Install tables, mark active. */
1866 zallocations = allocations_ptr;
1867 ztraces = traces_ptr;
1868
1869 /*
1870 * Initialize the top_ztrace to the first entry in ztraces,
1871 * so we don't have to check for null in zleak_log
1872 */
1873 top_ztrace = &ztraces[0];
1874
1875 /*
1876 * Note that we do need a barrier between installing
1877 * the tables and setting the active flag, because the zfree()
1878 * path accesses the table without a lock if we're active.
1879 */
1880 lck_spin_lock(&zleak_lock);
1881 zleak_state |= ZLEAK_STATE_ACTIVE;
1882 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1883 lck_spin_unlock(&zleak_lock);
1884
1885 	return KERN_SUCCESS;
1886
1887 fail:
1888 /*
1889 * If we fail to allocate memory, don't further tax
1890 * the system by trying again.
1891 */
1892 lck_spin_lock(&zleak_lock);
1893 zleak_state |= ZLEAK_STATE_FAILED;
1894 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1895 lck_spin_unlock(&zleak_lock);
1896
1897 if (allocations_ptr != NULL) {
1898 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
1899 }
1900
1901 if (traces_ptr != NULL) {
1902 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
1903 }
1904
1905 return retval;
1906 }
1907
1908 /*
1909 * TODO: What about allocations that never get deallocated,
1910 * especially ones with unique backtraces? Should we wait to record
1911 * until after boot has completed?
1912 * (How many persistent zallocs are there?)
1913 */
1914
1915 /*
1916 * This function records the allocation in the allocations table
1917 * and stores the associated backtrace in the traces table
1918 * (or just bumps the trace's refcount if the trace is already recorded).
1919 * If the allocation slot is in use, the old allocation there is replaced with this one, and
1920 * the old allocation's trace has its refcount (bytes outstanding) decremented.
1921 * If the trace slot is occupied by a different trace, the function bails without recording.
1922 * The refcount tracks the amount of memory the trace's outstanding allocations consume.
1923 * The return value indicates whether the caller should try again next time.
1924 */
1925 static boolean_t
1926 zleak_log(uintptr_t* bt,
1927 uintptr_t addr,
1928 uint32_t depth,
1929 vm_size_t allocation_size)
1930 {
1931 /* Quit if there's someone else modifying the hash tables */
1932 if (!lck_spin_try_lock(&zleak_lock)) {
1933 z_total_conflicts++;
1934 return FALSE;
1935 }
1936
1937 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1938
1939 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
1940 struct ztrace* trace = &ztraces[trace_index];
1941
1942 allocation->za_hit_count++;
1943 trace->zt_hit_count++;
1944
1945 /*
1946 * If the allocation bucket we want to be in is occupied, and if the occupier
1947 * has the same trace as us, just bail.
1948 */
1949 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
1950 z_alloc_collisions++;
1951
1952 lck_spin_unlock(&zleak_lock);
1953 return TRUE;
1954 }
1955
1956 /* STEP 1: Store the backtrace in the traces array. */
1957 /* A size of zero indicates that the trace bucket is free. */
1958
1959 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
1960 /*
1961 * Different unique trace with same hash!
1962 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
1963 * and get out of the way for later chances
1964 */
1965 trace->zt_collisions++;
1966 z_trace_collisions++;
1967
1968 lck_spin_unlock(&zleak_lock);
1969 return TRUE;
1970 } else if (trace->zt_size > 0) {
1971 /* Same trace, already added, so increment refcount */
1972 trace->zt_size += allocation_size;
1973 } else {
1974 /* Found an unused trace bucket, record the trace here! */
1975 if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
1976 z_trace_overwrites++;
1977 }
1978
1979 z_trace_recorded++;
1980 trace->zt_size = allocation_size;
1981 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
1982
1983 trace->zt_depth = depth;
1984 trace->zt_collisions = 0;
1985 }
1986
1987 /* STEP 2: Store the allocation record in the allocations array. */
1988
1989 if (allocation->za_element != (uintptr_t) 0) {
1990 /*
1991 * Straight up replace any allocation record that was there. It isn't worth the work
1992 * to preserve the old entry, because we only record a subset of the
1993 * allocations anyway.
1994 */
1995
1996 z_alloc_collisions++;
1997
1998 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
1999 /* Knock off old allocation's size, not the new allocation */
2000 associated_trace->zt_size -= allocation->za_size;
2001 } else if (allocation->za_trace_index != 0) {
2002 /* Slot previously used but not currently in use */
2003 z_alloc_overwrites++;
2004 }
2005
2006 allocation->za_element = addr;
2007 allocation->za_trace_index = trace_index;
2008 allocation->za_size = allocation_size;
2009
2010 z_alloc_recorded++;
2011
2012 if (top_ztrace->zt_size < trace->zt_size) {
2013 top_ztrace = trace;
2014 }
2015
2016 lck_spin_unlock(&zleak_lock);
2017 return TRUE;
2018 }
2019
2020 /*
2021 * Free the allocation record and release the stacktrace.
2022 * This should be as fast as possible because it will be called for every free.
2023 */
2024 static void
2025 zleak_free(uintptr_t addr,
2026 vm_size_t allocation_size)
2027 {
2028 if (addr == (uintptr_t) 0) {
2029 return;
2030 }
2031
2032 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2033
2034 /* Double-checked locking: check to find out if we're interested, lock, check to make
2035 * sure it hasn't changed, then modify it, and release the lock.
2036 */
2037
2038 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2039 /* if the allocation was the one, grab the lock, check again, then delete it */
2040 lck_spin_lock(&zleak_lock);
2041
2042 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2043 struct ztrace *trace;
2044
2045 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
2046 if (allocation->za_size != allocation_size) {
2047 panic("Freeing as size %lu memory that was allocated with size %lu\n",
2048 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
2049 }
2050
2051 trace = &ztraces[allocation->za_trace_index];
2052
2053 /* size of 0 indicates trace bucket is unused */
2054 if (trace->zt_size > 0) {
2055 trace->zt_size -= allocation_size;
2056 }
2057
2058 /* A NULL element means the allocation bucket is unused */
2059 allocation->za_element = 0;
2060 }
2061 lck_spin_unlock(&zleak_lock);
2062 }
2063 }
2064
2065 #endif /* CONFIG_ZLEAKS */
2066
2067 /* These functions live outside of CONFIG_ZLEAKS because they are also used in
2068 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix.
2069 */
2070
2071 /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
2072 uintptr_t
2073 hash_mix(uintptr_t x)
2074 {
2075 #ifndef __LP64__
2076 x += ~(x << 15);
2077 x ^= (x >> 10);
2078 x += (x << 3);
2079 x ^= (x >> 6);
2080 x += ~(x << 11);
2081 x ^= (x >> 16);
2082 #else
2083 x += ~(x << 32);
2084 x ^= (x >> 22);
2085 x += ~(x << 13);
2086 x ^= (x >> 8);
2087 x += (x << 3);
2088 x ^= (x >> 15);
2089 x += ~(x << 27);
2090 x ^= (x >> 31);
2091 #endif
2092 return x;
2093 }
2094
2095 uint32_t
2096 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2097 {
2098 uintptr_t hash = 0;
2099 uintptr_t mask = max_size - 1;
2100
2101 while (depth) {
2102 hash += bt[--depth];
2103 }
2104
2105 hash = hash_mix(hash) & mask;
2106
2107 assert(hash < max_size);
2108
2109 return (uint32_t) hash;
2110 }
2111
2112 /*
2113 * TODO: Determine how well distributed this is
2114 * max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0x0FFFF, which makes a good bitmask
2115 */
2116 uint32_t
2117 hashaddr(uintptr_t pt, uint32_t max_size)
2118 {
2119 uintptr_t hash = 0;
2120 uintptr_t mask = max_size - 1;
2121
2122 hash = hash_mix(pt) & mask;
2123
2124 assert(hash < max_size);
2125
2126 return (uint32_t) hash;
2127 }
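/*
 * Example (illustrative): with max_size == 0x10000 the mask is 0xFFFF, so both
 * hashbacktrace() and hashaddr() return the low 16 bits of the mixed value,
 * which is always a valid bucket index below max_size.
 */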
2128
2129 /* End of all leak-detection code */
2130 #pragma mark -
2131
2132 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
2133 #define ZONE_ALLOC_FRAG_PERCENT(alloc_size, ele_size) (((alloc_size % ele_size) * 100) / alloc_size)
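/*
 * Worked example (illustrative, assuming a 4K page size): for a 1536-byte element,
 * a 2-page (8192-byte) chunk wastes 8192 % 1536 = 512 bytes, so
 * ZONE_ALLOC_FRAG_PERCENT(8192, 1536) = (512 * 100) / 8192 = 6 (percent, truncated).
 */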
2134
2135 /* Used to manage copying in of new zone names */
2136 static vm_offset_t zone_names_start;
2137 static vm_offset_t zone_names_next;
2138
2139 static vm_size_t
2140 compute_element_size(vm_size_t requested_size)
2141 {
2142 vm_size_t element_size = requested_size;
2143
2144 /* Zone elements must fit both a next pointer and a backup pointer */
2145 vm_size_t minimum_element_size = sizeof(vm_offset_t) * 2;
2146 if (element_size < minimum_element_size) {
2147 element_size = minimum_element_size;
2148 }
2149
2150 /*
2151 * Round the element size up to a multiple of sizeof(pointer).
2152 * This also enforces that allocations will be aligned on pointer boundaries.
2153 */
2154 element_size = ((element_size - 1) + sizeof(vm_offset_t)) -
2155 ((element_size - 1) % sizeof(vm_offset_t));
2156
2157 return element_size;
2158 }
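/*
 * compute_element_size() examples (illustrative, assuming sizeof(vm_offset_t) == 8):
 * a 5-byte request is bumped to the 16-byte two-pointer minimum, and a 20-byte
 * request is rounded up to 24 bytes, the next multiple of the pointer size.
 */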
2159
2160 #if KASAN_ZALLOC
2161
2162 /*
2163 * Called from zinit().
2164 *
2165 * Fixes up the zone's element size to incorporate the redzones.
2166 */
2167 static void
2168 kasan_update_element_size_for_redzone(
2169 zone_t zone, /* the zone that needs to be updated */
2170 vm_size_t *size, /* requested zone element size */
2171 vm_size_t *max, /* maximum memory to use */
2172 const char *name) /* zone name */
2173 {
2174 /* Expand the zone allocation size to include the redzones. For page-multiple
2175 * zones add a full guard page because they likely require alignment. kalloc
2176 * and fakestack handle their own KASan state, so ignore those zones. */
2177 /* XXX: remove this when zinit_with_options() is a thing */
2178 const char *kalloc_name = "kalloc.";
2179 const char *fakestack_name = "fakestack.";
2180 if (strncmp(name, kalloc_name, strlen(kalloc_name)) == 0) {
2181 zone->kasan_redzone = 0;
2182 } else if (strncmp(name, fakestack_name, strlen(fakestack_name)) == 0) {
2183 zone->kasan_redzone = 0;
2184 } else {
2185 if ((*size % PAGE_SIZE) != 0) {
2186 zone->kasan_redzone = KASAN_GUARD_SIZE;
2187 } else {
2188 zone->kasan_redzone = PAGE_SIZE;
2189 }
2190 *max = (*max / *size) * (*size + zone->kasan_redzone * 2);
2191 *size += zone->kasan_redzone * 2;
2192 }
2193 }
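/*
 * Worked example (illustrative; the actual KASAN_GUARD_SIZE is platform-defined):
 * if KASAN_GUARD_SIZE were 16 and the caller requested 64-byte elements with
 * max = 6400, the stored element grows to 64 + 2*16 = 96 bytes and max is
 * rescaled to (6400 / 64) * 96 = 9600, so the zone still holds ~100 elements.
 */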
2194
2195 /*
2196 * Called from zalloc_internal() to fix up the address of the newly
2197 * allocated element.
2198 *
2199 * Returns the element address skipping over the redzone on the left.
2200 */
2201 static vm_offset_t
2202 kasan_fixup_allocated_element_address(
2203 zone_t zone, /* the zone the element belongs to */
2204 vm_offset_t addr) /* address of the element, including the redzone */
2205 {
2206 /* Fixup the return address to skip the redzone */
2207 if (zone->kasan_redzone) {
2208 addr = kasan_alloc(addr, zone->elem_size,
2209 zone->elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
2210 }
2211 return addr;
2212 }
2213
2214 /*
2215 * Called from zfree() to add the element being freed to the KASan quarantine.
2216 *
2217 * Returns true if the newly-freed element made it into the quarantine without
2218 * displacing another, false otherwise. In the latter case, addrp points to the
2219 * address of the displaced element, which will be freed by the zone.
2220 */
2221 static bool
2222 kasan_quarantine_freed_element(
2223 zone_t *zonep, /* the zone the element is being freed to */
2224 void **addrp) /* address of the element being freed */
2225 {
2226 zone_t zone = *zonep;
2227 void *addr = *addrp;
2228
2229 /*
2230 * Resize back to the real allocation size and hand off to the KASan
2231 * quarantine. `addr` may then point to a different allocation, if the
2232 * current element replaced another in the quarantine. The zone then
2233 * takes ownership of the swapped out free element.
2234 */
2235 vm_size_t usersz = zone->elem_size - 2 * zone->kasan_redzone;
2236 vm_size_t sz = usersz;
2237
2238 if (addr && zone->kasan_redzone) {
2239 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
2240 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
2241 assert(sz == zone->elem_size);
2242 }
2243 if (addr && zone->kasan_quarantine) {
2244 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
2245 if (!addr) {
2246 return TRUE;
2247 }
2248 }
2249 *addrp = addr;
2250 return FALSE;
2251 }
2252
2253 #endif /* KASAN_ZALLOC */
2254
2255 /*
2256 * zinit initializes a new zone. The zone data structures themselves
2257 * are stored in a zone, which is initially a static structure that
2258 * is initialized by zone_init.
2259 */
2260
2261 zone_t
2262 zinit(
2263 vm_size_t size, /* the size of an element */
2264 vm_size_t max, /* maximum memory to use */
2265 vm_size_t alloc, /* allocation size */
2266 const char *name) /* a name for the zone */
2267 {
2268 zone_t z;
2269
2270 size = compute_element_size(size);
2271
2272 simple_lock(&all_zones_lock, &zone_locks_grp);
2273
2274 assert(num_zones < MAX_ZONES);
2275 assert(num_zones_in_use <= num_zones);
2276
2277 /* If possible, find a previously zdestroy'ed zone in the zone_array that we can reuse instead of initializing a new zone. */
2278 for (int index = bitmap_first(zone_empty_bitmap, MAX_ZONES);
2279 index >= 0 && index < (int)num_zones;
2280 index = bitmap_next(zone_empty_bitmap, index)) {
2281 z = &(zone_array[index]);
2282
2283 /*
2284 * If the zone name and the element size are the same, we can just reuse the old zone struct.
2285 * Otherwise hand out a new zone from the zone_array.
2286 */
2287 if (!strcmp(z->zone_name, name)) {
2288 vm_size_t old_size = z->elem_size;
2289 #if KASAN_ZALLOC
2290 old_size -= z->kasan_redzone * 2;
2291 #endif
2292 if (old_size == size) {
2293 /* Clear the empty bit for this zone, increment num_zones_in_use, and mark the zone as valid again. */
2294 bitmap_clear(zone_empty_bitmap, index);
2295 num_zones_in_use++;
2296 z->zone_valid = TRUE;
2297
2298 /* All other state is already set up since the zone was previously in use. Return early. */
2299 simple_unlock(&all_zones_lock);
2300 return z;
2301 }
2302 }
2303 }
2304
2305 /* If we're here, it means we didn't find a zone above that we could simply reuse. Set up a new zone. */
2306
2307 /* Clear the empty bit for the new zone */
2308 bitmap_clear(zone_empty_bitmap, num_zones);
2309
2310 z = &(zone_array[num_zones]);
2311 z->index = num_zones;
2312
2313 num_zones++;
2314 num_zones_in_use++;
2315
2316 /*
2317 * Initialize the zone lock here before dropping the all_zones_lock. Otherwise we could race with
2318 * zalloc_async() and try to grab the zone lock before it has been initialized, causing a panic.
2319 */
2320 lock_zone_init(z);
2321
2322 simple_unlock(&all_zones_lock);
2323
2324 #if KASAN_ZALLOC
2325 kasan_update_element_size_for_redzone(z, &size, &max, name);
2326 #endif
2327
2328 max = round_page(max);
2329
2330 vm_size_t best_alloc = PAGE_SIZE;
2331
2332 if ((size % PAGE_SIZE) == 0) {
2333 /* zero fragmentation by definition */
2334 best_alloc = size;
2335 } else {
2336 vm_size_t alloc_size;
2337 for (alloc_size = (2 * PAGE_SIZE); alloc_size <= ZONE_MAX_ALLOC_SIZE; alloc_size += PAGE_SIZE) {
2338 if (ZONE_ALLOC_FRAG_PERCENT(alloc_size, size) < ZONE_ALLOC_FRAG_PERCENT(best_alloc, size)) {
2339 best_alloc = alloc_size;
2340 }
2341 }
2342 }
2343
2344 alloc = best_alloc;
2345 if (max && (max < alloc)) {
2346 max = alloc;
2347 }
2348
2349 z->free_elements = NULL;
2350 queue_init(&z->pages.any_free_foreign);
2351 queue_init(&z->pages.all_free);
2352 queue_init(&z->pages.intermediate);
2353 queue_init(&z->pages.all_used);
2354 z->cur_size = 0;
2355 z->page_count = 0;
2356 z->max_size = max;
2357 z->elem_size = size;
2358 z->alloc_size = alloc;
2359 z->count = 0;
2360 z->countfree = 0;
2361 z->count_all_free_pages = 0;
2362 z->sum_count = 0LL;
2363 z->doing_alloc_without_vm_priv = FALSE;
2364 z->doing_alloc_with_vm_priv = FALSE;
2365 z->exhaustible = FALSE;
2366 z->collectable = TRUE;
2367 z->allows_foreign = FALSE;
2368 z->expandable = TRUE;
2369 z->waiting = FALSE;
2370 z->async_pending = FALSE;
2371 z->caller_acct = TRUE;
2372 z->noencrypt = FALSE;
2373 z->no_callout = FALSE;
2374 z->async_prio_refill = FALSE;
2375 z->gzalloc_exempt = FALSE;
2376 z->alignment_required = FALSE;
2377 z->zone_replenishing = FALSE;
2378 z->prio_refill_watermark = 0;
2379 z->zone_replenish_thread = NULL;
2380 z->zp_count = 0;
2381 z->kasan_quarantine = TRUE;
2382 z->zone_valid = TRUE;
2383 z->cpu_cache_enabled = FALSE;
2384
2385 #if CONFIG_ZLEAKS
2386 z->zleak_capture = 0;
2387 z->zleak_on = FALSE;
2388 #endif /* CONFIG_ZLEAKS */
2389
2390 /*
2391 * If the VM is ready to handle kmem_alloc requests, copy the zone name passed in.
2392 *
2393 * Else simply maintain a pointer to the name string. The only zones we'll actually have
2394 * to do this for would be the VM-related zones that are created very early on before any
2395 * kexts can be loaded (or unloaded). So we should be fine with just a pointer in this case.
2396 */
2397 if (kmem_alloc_ready) {
2398 size_t len = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
2399
2400 if (zone_names_start == 0 || ((zone_names_next - zone_names_start) + len) > PAGE_SIZE) {
2401 printf("zalloc: allocating memory for zone names buffer\n");
2402 kern_return_t retval = kmem_alloc_kobject(kernel_map, &zone_names_start,
2403 PAGE_SIZE, VM_KERN_MEMORY_OSFMK);
2404 if (retval != KERN_SUCCESS) {
2405 panic("zalloc: zone_names memory allocation failed");
2406 }
2407 bzero((char *)zone_names_start, PAGE_SIZE);
2408 zone_names_next = zone_names_start;
2409 }
2410
2411 strlcpy((char *)zone_names_next, name, len);
2412 z->zone_name = (char *)zone_names_next;
2413 zone_names_next += len;
2414 } else {
2415 z->zone_name = name;
2416 }
2417
2418 /*
2419 * Check for and set up zone logging if requested via boot-args. We recognize two
2420 * boot-args:
2421 *
2422 * zlog=<zone_to_log>
2423 * zrecs=<num_records_in_log>
2424 *
2425 * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
2426 * control the size of the log. If zrecs is not specified, a default value is used.
2427 */
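/*
 * Illustrative boot-args (the zone name and record count are examples only):
 *
 *   zlog1=kalloc.32 zrecs=8000
 *
 * The zrecs value is clamped to ZRECORDS_MAX below, so asking for more than
 * that simply caps the log size.
 */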
2428
2429 if (num_zones_logged < max_num_zones_to_log) {
2430 int i = 1; /* zlog0 isn't allowed. */
2431 boolean_t zone_logging_enabled = FALSE;
2432 char zlog_name[MAX_ZONE_NAME] = ""; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2433
2434 while (i <= max_num_zones_to_log) {
2435 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2436
2437 if (PE_parse_boot_argn(zlog_name, zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2438 if (track_this_zone(z->zone_name, zone_name_to_log)) {
2439 if (z->zone_valid) {
2440 z->zone_logging = TRUE;
2441 zone_logging_enabled = TRUE;
2442 num_zones_logged++;
2443 break;
2444 }
2445 }
2446 }
2447 i++;
2448 }
2449
2450 if (zone_logging_enabled == FALSE) {
2451 /*
2452 * Backwards compatibility with the old boot-arg used to specify single-zone logging, i.e. zlog.
2453 * This check needs to happen after the newer zlog<n> checks because the "zlog" prefix matches
2454 * all of the zlog<n> boot-args.
2455 */
2456 if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2457 if (track_this_zone(z->zone_name, zone_name_to_log)) {
2458 if (z->zone_valid) {
2459 z->zone_logging = TRUE;
2460 zone_logging_enabled = TRUE;
2461 num_zones_logged++;
2462 }
2463 }
2464 }
2465 }
2466
2467 if (log_records_init == FALSE && zone_logging_enabled == TRUE) {
2468 if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {
2469 /*
2470 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2471 * This prevents accidentally hogging too much kernel memory and making the system
2472 * unusable.
2473 */
2474
2475 log_records = MIN(ZRECORDS_MAX, log_records);
2476 log_records_init = TRUE;
2477 } else {
2478 log_records = ZRECORDS_DEFAULT;
2479 log_records_init = TRUE;
2480 }
2481 }
2482
2483 /*
2484 * If we want to log a zone, see if we need to allocate buffer space for the log. Some vm related zones are
2485 * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case. kmem_alloc_ready is set to
2486 * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work. If we want to log one
2487 * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
2488 * later on some other zone. Note that we may therefore be allocating a buffer to log a zone other than the one being initialized
2489 * right now.
2490 */
2491 if (kmem_alloc_ready) {
2492 zone_t curr_zone = NULL;
2493 unsigned int max_zones = 0, zone_idx = 0;
2494
2495 simple_lock(&all_zones_lock, &zone_locks_grp);
2496 max_zones = num_zones;
2497 simple_unlock(&all_zones_lock);
2498
2499 for (zone_idx = 0; zone_idx < max_zones; zone_idx++) {
2500 curr_zone = &(zone_array[zone_idx]);
2501
2502 if (!curr_zone->zone_valid) {
2503 continue;
2504 }
2505
2506 /*
2507 * We work with the zone unlocked here because we could end up needing the zone lock to
2508 * enable logging for this zone, e.g. we may need a VM object in order to allocate memory to enable logging for the
2509 * VM objects zone.
2510 *
2511 * We don't expect these zones to be needed at this early a time in boot and so take this chance.
2512 */
2513 if (curr_zone->zone_logging && curr_zone->zlog_btlog == NULL) {
2514 curr_zone->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
2515
2516 if (curr_zone->zlog_btlog) {
2517 printf("zone: logging started for zone %s\n", curr_zone->zone_name);
2518 } else {
2519 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2520 curr_zone->zone_logging = FALSE;
2521 }
2522 }
2523 }
2524 }
2525 }
2526
2527 #if CONFIG_GZALLOC
2528 gzalloc_zone_init(z);
2529 #endif
2530
2531 #if CONFIG_ZCACHE
2532 /* Check if boot-arg specified it should have a cache */
2533 if (cache_all_zones || track_this_zone(name, cache_zone_name)) {
2534 zone_change(z, Z_CACHING_ENABLED, TRUE);
2535 }
2536 #endif
2537
2538 return z;
2539 }
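/*
 * Illustrative zinit() usage (the struct and zone name here are hypothetical):
 *
 *   zone_t widget_zone = zinit(sizeof(struct widget),
 *                              1024 * sizeof(struct widget),  (maximum memory)
 *                              PAGE_SIZE,                     (allocation size hint)
 *                              "widget zone");
 *
 * Note that the alloc argument is effectively advisory: zinit() recomputes the
 * allocation chunk size above to minimize per-chunk fragmentation.
 */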
2540 unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count;
2541
2542 static void zone_replenish_thread(zone_t);
2543
2544 /* High priority VM privileged thread used to asynchronously refill a designated
2545 * zone, such as the reserved VM map entry zone.
2546 */
2547 __attribute__((noreturn))
2548 static void
2549 zone_replenish_thread(zone_t z)
2550 {
2551 vm_size_t free_size;
2552 current_thread()->options |= TH_OPT_VMPRIV;
2553
2554 for (;;) {
2555 lock_zone(z);
2556 assert(z->zone_valid);
2557 z->zone_replenishing = TRUE;
2558 assert(z->prio_refill_watermark != 0);
2559 while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) {
2560 assert(z->doing_alloc_without_vm_priv == FALSE);
2561 assert(z->doing_alloc_with_vm_priv == FALSE);
2562 assert(z->async_prio_refill == TRUE);
2563
2564 unlock_zone(z);
2565 int zflags = KMA_KOBJECT | KMA_NOPAGEWAIT;
2566 vm_offset_t space, alloc_size;
2567 kern_return_t kr;
2568
2569 if (vm_pool_low()) {
2570 alloc_size = round_page(z->elem_size);
2571 } else {
2572 alloc_size = z->alloc_size;
2573 }
2574
2575 if (z->noencrypt) {
2576 zflags |= KMA_NOENCRYPT;
2577 }
2578
2579 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
2580 if (is_zone_map_nearing_exhaustion()) {
2581 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2582 }
2583
2584 kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2585
2586 if (kr == KERN_SUCCESS) {
2587 zcram(z, space, alloc_size);
2588 } else if (kr == KERN_RESOURCE_SHORTAGE) {
2589 VM_PAGE_WAIT();
2590 } else if (kr == KERN_NO_SPACE) {
2591 kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2592 if (kr == KERN_SUCCESS) {
2593 zcram(z, space, alloc_size);
2594 } else {
2595 assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
2596 thread_block(THREAD_CONTINUE_NULL);
2597 }
2598 }
2599
2600 lock_zone(z);
2601 assert(z->zone_valid);
2602 zone_replenish_loops++;
2603 }
2604
2605 z->zone_replenishing = FALSE;
2606 /* Signal any potentially throttled consumers, terminating
2607 * their timer-bounded waits.
2608 */
2609 thread_wakeup(z);
2610
2611 assert_wait(&z->zone_replenish_thread, THREAD_UNINT);
2612 unlock_zone(z);
2613 thread_block(THREAD_CONTINUE_NULL);
2614 zone_replenish_wakeups++;
2615 }
2616 }
2617
2618 void
2619 zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark)
2620 {
2621 z->prio_refill_watermark = low_water_mark;
2622
2623 z->async_prio_refill = TRUE;
2624 OSMemoryBarrier();
2625 kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread);
2626
2627 if (tres != KERN_SUCCESS) {
2628 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
2629 }
2630
2631 thread_deallocate(z->zone_replenish_thread);
2632 }
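/*
 * Illustrative usage (zone and watermark are hypothetical): a caller that needs
 * a reserve, such as the reserved VM map entry zone mentioned above, would do
 * something like:
 *
 *   zone_prio_refill_configure(reserved_zone, 128);
 *
 * after which the replenish thread tries to keep roughly 128 elements free.
 */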
2633
2634 void
2635 zdestroy(zone_t z)
2636 {
2637 unsigned int zindex;
2638
2639 assert(z != NULL);
2640
2641 lock_zone(z);
2642 assert(z->zone_valid);
2643
2644 /* Assert that the zone does not have any allocations in flight */
2645 assert(z->doing_alloc_without_vm_priv == FALSE);
2646 assert(z->doing_alloc_with_vm_priv == FALSE);
2647 assert(z->async_pending == FALSE);
2648 assert(z->waiting == FALSE);
2649 assert(z->async_prio_refill == FALSE);
2650
2651 #if !KASAN_ZALLOC
2652 /*
2653 * Unset the valid bit. We'll hit an assert failure on further operations on this zone, until zinit() is called again.
2654 * Leave the zone valid for KASan as we will see zfree's on quarantined free elements even after the zone is destroyed.
2655 */
2656 z->zone_valid = FALSE;
2657 #endif
2658 unlock_zone(z);
2659
2660 #if CONFIG_ZCACHE
2661 /* zdestroy() does not support zones that still have per-cpu caching enabled. */
2662 if (zone_caching_enabled(z)) {
2663 panic("zdestroy: Zone caching enabled for zone %s", z->zone_name);
2664 }
2665 #endif /* CONFIG_ZCACHE */
2666
2667 /* Dump all the free elements */
2668 drop_free_elements(z);
2669
2670 #if CONFIG_GZALLOC
2671 /* If the zone is gzalloc managed dump all the elements in the free cache */
2672 gzalloc_empty_free_cache(z);
2673 #endif
2674
2675 lock_zone(z);
2676
2677 #if !KASAN_ZALLOC
2678 /* Assert that all counts are zero */
2679 assert(z->count == 0);
2680 assert(z->countfree == 0);
2681 assert(z->cur_size == 0);
2682 assert(z->page_count == 0);
2683 assert(z->count_all_free_pages == 0);
2684
2685 /* Assert that all queues except the foreign queue are empty. The zone allocator doesn't know how to free up foreign memory. */
2686 assert(queue_empty(&z->pages.all_used));
2687 assert(queue_empty(&z->pages.intermediate));
2688 assert(queue_empty(&z->pages.all_free));
2689 #endif
2690
2691 zindex = z->index;
2692
2693 unlock_zone(z);
2694
2695 simple_lock(&all_zones_lock, &zone_locks_grp);
2696
2697 assert(!bitmap_test(zone_empty_bitmap, zindex));
2698 /* Mark the zone as empty in the bitmap */
2699 bitmap_set(zone_empty_bitmap, zindex);
2700 num_zones_in_use--;
2701 assert(num_zones_in_use > 0);
2702
2703 simple_unlock(&all_zones_lock);
2704 }
2705
2706 /* Initialize the metadata for an allocation chunk */
2707 static inline void
2708 zcram_metadata_init(vm_offset_t newmem, vm_size_t size, struct zone_page_metadata *chunk_metadata)
2709 {
2710 struct zone_page_metadata *page_metadata;
2711
2712 /* The first page is the real metadata for this allocation chunk. We mark the others as fake metadata */
2713 size -= PAGE_SIZE;
2714 newmem += PAGE_SIZE;
2715
2716 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2717 page_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
2718 assert(page_metadata != chunk_metadata);
2719 PAGE_METADATA_SET_ZINDEX(page_metadata, MULTIPAGE_METADATA_MAGIC);
2720 page_metadata_set_realmeta(page_metadata, chunk_metadata);
2721 page_metadata->free_count = 0;
2722 }
2723 return;
2724 }
2725
2726
2727 static void
2728 random_free_to_zone(
2729 zone_t zone,
2730 vm_offset_t newmem,
2731 vm_offset_t first_element_offset,
2732 int element_count,
2733 unsigned int *entropy_buffer)
2734 {
2735 vm_offset_t last_element_offset;
2736 vm_offset_t element_addr;
2737 vm_size_t elem_size;
2738 int index;
2739
2740 assert(element_count && element_count <= ZONE_CHUNK_MAXELEMENTS);
2741 elem_size = zone->elem_size;
2742 last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size);
2743 for (index = 0; index < element_count; index++) {
2744 assert(first_element_offset <= last_element_offset);
2745 if (
2746 #if DEBUG || DEVELOPMENT
2747 leak_scan_debug_flag || __improbable(zone->tags) ||
2748 #endif /* DEBUG || DEVELOPMENT */
2749 random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
2750 element_addr = newmem + first_element_offset;
2751 first_element_offset += elem_size;
2752 } else {
2753 element_addr = newmem + last_element_offset;
2754 last_element_offset -= elem_size;
2755 }
2756 if (element_addr != (vm_offset_t)zone) {
2757 zone->count++; /* compensate for free_to_zone */
2758 free_to_zone(zone, element_addr, FALSE);
2759 }
2760 zone->cur_size += elem_size;
2761 }
2762 }
2763
2764 /*
2765 * Cram the given memory into the specified zone. Update the zone page count accordingly.
2766 */
2767 void
2768 zcram(
2769 zone_t zone,
2770 vm_offset_t newmem,
2771 vm_size_t size)
2772 {
2773 vm_size_t elem_size;
2774 boolean_t from_zm = FALSE;
2775 int element_count;
2776 unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
2777
2778 /* Basic sanity checks */
2779 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
2780 assert(!zone->collectable || zone->allows_foreign
2781 || (from_zone_map(newmem, size)));
2782
2783 elem_size = zone->elem_size;
2784
2785 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START, zone->index, size);
2786
2787 if (from_zone_map(newmem, size)) {
2788 from_zm = TRUE;
2789 }
2790
2791 if (!from_zm) {
2792 /* We cannot support elements larger than page size for foreign memory because we
2793 * put metadata on the page itself for each page of foreign memory. We need to do
2794 * this in order to be able to reach the metadata when any element is freed
2795 */
2796 assert((zone->allows_foreign == TRUE) && (zone->elem_size <= (PAGE_SIZE - sizeof(struct zone_page_metadata))));
2797 }
2798
2799 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
2800 kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name,
2801 (unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size);
2802 }
2803
2804 ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE));
2805
2806 /*
2807 * Initialize the metadata for all pages. We don't need the zone lock
2808 * here because we are not manipulating any zone related state yet.
2809 */
2810
2811 struct zone_page_metadata *chunk_metadata;
2812 size_t zone_page_metadata_size = sizeof(struct zone_page_metadata);
2813
2814 assert((newmem & PAGE_MASK) == 0);
2815 assert((size & PAGE_MASK) == 0);
2816
2817 chunk_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
2818 chunk_metadata->pages.next = NULL;
2819 chunk_metadata->pages.prev = NULL;
2820 page_metadata_set_freelist(chunk_metadata, 0);
2821 PAGE_METADATA_SET_ZINDEX(chunk_metadata, zone->index);
2822 chunk_metadata->free_count = 0;
2823 assert((size / PAGE_SIZE) <= ZONE_CHUNK_MAXPAGES);
2824 chunk_metadata->page_count = (unsigned)(size / PAGE_SIZE);
2825
2826 zcram_metadata_init(newmem, size, chunk_metadata);
2827
2828 #if VM_MAX_TAG_ZONES
2829 if (__improbable(zone->tags)) {
2830 assert(from_zm);
2831 ztMemoryAdd(zone, newmem, size);
2832 }
2833 #endif /* VM_MAX_TAG_ZONES */
2834
2835 lock_zone(zone);
2836 assert(zone->zone_valid);
2837 enqueue_tail(&zone->pages.all_used, &(chunk_metadata->pages));
2838
2839 if (!from_zm) {
2840 /* Foreign memory keeps its metadata at the start of each page, so carve
2841 * elements out of each page individually, starting just past the per-page
2842 * metadata (rounded up to the element alignment).
2843 */
2844
2845 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2846 vm_offset_t first_element_offset = 0;
2847 if (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT == 0) {
2848 first_element_offset = zone_page_metadata_size;
2849 } else {
2850 first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT));
2851 }
2852 element_count = (unsigned int)((PAGE_SIZE - first_element_offset) / elem_size);
2853 random_free_to_zone(zone, newmem, first_element_offset, element_count, entropy_buffer);
2854 }
2855 } else {
2856 element_count = (unsigned int)(size / elem_size);
2857 random_free_to_zone(zone, newmem, 0, element_count, entropy_buffer);
2858 }
2859 unlock_zone(zone);
2860
2861 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zone->index);
2862 }
2863
2864 /*
2865 * Fill a zone with enough memory to contain at least nelem elements.
2866 * Return the number of elements actually put into the zone, which may
2867 * be more than the caller asked for since the memory allocation is
2868 * rounded up to the next zone allocation size.
2869 */
2870 int
2871 zfill(
2872 zone_t zone,
2873 int nelem)
2874 {
2875 kern_return_t kr;
2876 vm_offset_t memory;
2877
2878 vm_size_t alloc_size = zone->alloc_size;
2879 vm_size_t elem_per_alloc = alloc_size / zone->elem_size;
2880 vm_size_t nalloc = (nelem + elem_per_alloc - 1) / elem_per_alloc;
2881
2882 /* Don't mix-and-match zfill with foreign memory */
2883 assert(!zone->allows_foreign);
2884
2885 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
2886 if (is_zone_map_nearing_exhaustion()) {
2887 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2888 }
2889
2890 kr = kernel_memory_allocate(zone_map, &memory, nalloc * alloc_size, 0, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
2891 if (kr != KERN_SUCCESS) {
2892 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
2893 __func__, (unsigned long)(nalloc * alloc_size));
2894 return 0;
2895 }
2896
2897 for (vm_size_t i = 0; i < nalloc; i++) {
2898 zcram(zone, memory + i * alloc_size, alloc_size);
2899 }
2900
2901 return (int)(nalloc * elem_per_alloc);
2902 }
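/*
 * Worked example (illustrative): for a zone with a 16 KB alloc_size and
 * 256-byte elements, elem_per_alloc is 64, so zfill(zone, 100) rounds up to
 * two 16 KB allocations and returns 128.
 */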
2903
2904 /*
2905 * Initialize the "zone of zones" which uses fixed memory allocated
2906 * earlier in memory initialization. zone_bootstrap is called
2907 * before zone_init.
2908 */
2909 void
2910 zone_bootstrap(void)
2911 {
2912 char temp_buf[16];
2913
2914 if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug))) {
2915 zalloc_debug = 0;
2916 }
2917
2918 /* Set up zone element poisoning */
2919 zp_init();
2920
2921 random_bool_init(&zone_bool_gen);
2922
2923 /* should zlog log to debug zone corruption instead of leaks? */
2924 if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
2925 corruption_debug_flag = TRUE;
2926 }
2927
2928 #if DEBUG || DEVELOPMENT
2929 /* should perform zone element size checking in copyin/copyout? */
2930 if (PE_parse_boot_argn("-no-copyio-zalloc-check", temp_buf, sizeof(temp_buf))) {
2931 copyio_zalloc_check = FALSE;
2932 }
2933 #if VM_MAX_TAG_ZONES
2934 /* enable tags for zones that ask for it */
2935 if (PE_parse_boot_argn("-zt", temp_buf, sizeof(temp_buf))) {
2936 zone_tagging_on = TRUE;
2937 }
2938 #endif /* VM_MAX_TAG_ZONES */
2939 /* disable element location randomization in a page */
2940 if (PE_parse_boot_argn("-zl", temp_buf, sizeof(temp_buf))) {
2941 leak_scan_debug_flag = TRUE;
2942 }
2943 #endif
2944
2945 simple_lock_init(&all_zones_lock, 0);
2946
2947 num_zones_in_use = 0;
2948 num_zones = 0;
2949 /* Mark all zones as empty */
2950 bitmap_full(zone_empty_bitmap, BITMAP_LEN(MAX_ZONES));
2951 zone_names_next = zone_names_start = 0;
2952
2953 #if DEBUG || DEVELOPMENT
2954 simple_lock_init(&zone_test_lock, 0);
2955 #endif /* DEBUG || DEVELOPMENT */
2956
2957 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
2958
2959 /* initializing global lock group for zones */
2960 lck_grp_attr_setdefault(&zone_locks_grp_attr);
2961 lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr);
2962
2963 lck_attr_setdefault(&zone_metadata_lock_attr);
2964 lck_mtx_init_ext(&zone_metadata_region_lck, &zone_metadata_region_lck_ext, &zone_locks_grp, &zone_metadata_lock_attr);
2965
2966 #if CONFIG_ZCACHE
2967 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
2968 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
2969 printf("zcache: caching enabled for zone %s\n", cache_zone_name);
2970 }
2971
2972 /* -zcache_all: enable per-cpu zone caching for all zones, overrides 'zcc_enable_for_zone_name'. */
2973 if (PE_parse_boot_argn("-zcache_all", temp_buf, sizeof(temp_buf))) {
2974 cache_all_zones = TRUE;
2975 printf("zcache: caching enabled for all zones\n");
2976 }
2977 #endif /* CONFIG_ZCACHE */
2978 }
2979
2980 /*
2981 * We're being very conservative here and picking a value of 95%. We might need to lower this if
2982 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
2983 */
2984 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
2985
2986 /*
2987 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
2988 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
2989 */
2990 unsigned int zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
2991
2992 /*
2993 * Returns pid of the task with the largest number of VM map entries.
2994 */
2995 extern pid_t find_largest_process_vm_map_entries(void);
2996
2997 /*
2998 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
2999 * For any other pid we try to kill that process synchronously.
3000 */
3001 boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
3002
3003 void
3004 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
3005 {
3006 *current_size = zone_map->size;
3007 *capacity = vm_map_max(zone_map) - vm_map_min(zone_map);
3008 }
3009
3010 void
3011 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
3012 {
3013 zone_t largest_zone = zone_find_largest();
3014 strlcpy(zone_name, largest_zone->zone_name, zone_name_len);
3015 *zone_size = largest_zone->cur_size;
3016 }
3017
3018 boolean_t
3019 is_zone_map_nearing_exhaustion(void)
3020 {
3021 uint64_t size = zone_map->size;
3022 uint64_t capacity = vm_map_max(zone_map) - vm_map_min(zone_map);
3023 if (size > ((capacity * zone_map_jetsam_limit) / 100)) {
3024 return TRUE;
3025 }
3026 return FALSE;
3027 }
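/*
 * Worked example (illustrative): with the default 95% limit and a 1 GB zone
 * map, jetsams start being considered once more than roughly 972 MB of the
 * map is in use.
 */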
3028
3029 extern zone_t vm_map_entry_zone;
3030 extern zone_t vm_object_zone;
3031
3032 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
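/*
 * Illustrative: with this 98% ratio, if the VM objects zone holds 1,000,000
 * elements, the VM map entries zone is treated as the largest whenever it
 * holds at least 980,000 elements.
 */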
3033
3034 /*
3035 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
3036 * to walk through the jetsam priority bands and kill processes.
3037 */
3038 static void
3039 kill_process_in_largest_zone(void)
3040 {
3041 pid_t pid = -1;
3042 zone_t largest_zone = zone_find_largest();
3043
3044 printf("zone_map_exhaustion: Zone map size %lld, capacity %lld [jetsam limit %d%%]\n", (uint64_t)zone_map->size,
3045 (uint64_t)(vm_map_max(zone_map) - vm_map_min(zone_map)), zone_map_jetsam_limit);
3046 printf("zone_map_exhaustion: Largest zone %s, size %lu\n", largest_zone->zone_name, (uintptr_t)largest_zone->cur_size);
3047
3048 /*
3049 * Make sure we don't call this function from a user task's context, or we could end up trying to synchronously kill the process
3050 * whose context we're in, causing the system to hang.
3051 */
3052 assert(current_task() == kernel_task);
3053
3054 /*
3055 * If vm_object_zone is the largest, check to see if the number of elements in vm_map_entry_zone is comparable. If so, consider
3056 * vm_map_entry_zone as the largest. This lets us target a specific process to jetsam to quickly recover from the zone map bloat.
3057 */
3058 if (largest_zone == vm_object_zone) {
3059 unsigned int vm_object_zone_count = vm_object_zone->count;
3060 unsigned int vm_map_entry_zone_count = vm_map_entry_zone->count;
3061 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3062 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
3063 largest_zone = vm_map_entry_zone;
3064 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n", (uintptr_t)largest_zone->cur_size);
3065 }
3066 }
3067
3068 /* TODO: Extend this to check for the largest process in other zones as well. */
3069 if (largest_zone == vm_map_entry_zone) {
3070 pid = find_largest_process_vm_map_entries();
3071 } else {
3072 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s]. Waking up memorystatus thread.\n", largest_zone->zone_name);
3073 }
3074 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
3075 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
3076 }
3077 }
3078
3079 /* Global initialization of Zone Allocator.
3080 * Runs after zone_bootstrap.
3081 */
3082 void
3083 zone_init(
3084 vm_size_t max_zonemap_size)
3085 {
3086 kern_return_t retval;
3087 vm_offset_t zone_min;
3088 vm_offset_t zone_max;
3089 vm_offset_t zone_metadata_space;
3090 unsigned int zone_pages;
3091 vm_map_kernel_flags_t vmk_flags;
3092
3093 #if VM_MAX_TAG_ZONES
3094 if (zone_tagging_on) {
3095 ztInit(max_zonemap_size, &zone_locks_grp);
3096 }
3097 #endif
3098
3099 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
3100 vmk_flags.vmkf_permanent = TRUE;
3101 retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size,
3102 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_ZONE,
3103 &zone_map);
3104
3105 if (retval != KERN_SUCCESS) {
3106 panic("zone_init: kmem_suballoc failed");
3107 }
3108 zone_max = zone_min + round_page(max_zonemap_size);
3109
3110 #if CONFIG_GZALLOC
3111 gzalloc_init(max_zonemap_size);
3112 #endif
3113
3114 /*
3115 * Setup garbage collection information:
3116 */
3117 zone_map_min_address = zone_min;
3118 zone_map_max_address = zone_max;
3119
3120 zone_pages = (unsigned int)atop_kernel(zone_max - zone_min);
3121 zone_metadata_space = round_page(zone_pages * sizeof(struct zone_page_metadata));
3122 retval = kernel_memory_allocate(zone_map, &zone_metadata_region_min, zone_metadata_space,
3123 0, KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_OSFMK);
3124 if (retval != KERN_SUCCESS) {
3125 panic("zone_init: zone_metadata_region initialization failed!");
3126 }
3127 zone_metadata_region_max = zone_metadata_region_min + zone_metadata_space;
3128
3129 #if defined(__LP64__)
3130 /*
3131 * ensure that any vm_page_t that gets created from
3132 * the vm_page zone can be packed properly (see vm_page.h
3133 * for the packing requirements).
3134 */
3135 if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_metadata_region_max))) != (vm_page_t)zone_metadata_region_max) {
3136 panic("VM_PAGE_PACK_PTR failed on zone_metadata_region_max - %p", (void *)zone_metadata_region_max);
3137 }
3138
3139 if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address))) != (vm_page_t)zone_map_max_address) {
3140 panic("VM_PAGE_PACK_PTR failed on zone_map_max_address - %p", (void *)zone_map_max_address);
3141 }
3142 #endif
3143
3144 lck_grp_attr_setdefault(&zone_gc_lck_grp_attr);
3145 lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr);
3146 lck_attr_setdefault(&zone_gc_lck_attr);
3147 lck_mtx_init_ext(&zone_gc_lock, &zone_gc_lck_ext, &zone_gc_lck_grp, &zone_gc_lck_attr);
3148
3149 #if CONFIG_ZLEAKS
3150 /*
3151 * Initialize the zone leak monitor
3152 */
3153 zleak_init(max_zonemap_size);
3154 #endif /* CONFIG_ZLEAKS */
3155
3156 #if VM_MAX_TAG_ZONES
3157 if (zone_tagging_on) {
3158 vm_allocation_zones_init();
3159 }
3160 #endif
3161
3162 int jetsam_limit_temp = 0;
3163 if (PE_parse_boot_argn("zone_map_jetsam_limit", &jetsam_limit_temp, sizeof(jetsam_limit_temp)) &&
3164 jetsam_limit_temp > 0 && jetsam_limit_temp <= 100) {
3165 zone_map_jetsam_limit = jetsam_limit_temp;
3166 }
3167 }
3168
3169 #pragma mark -
3170 #pragma mark zalloc_canblock
3171
3172 extern boolean_t early_boot_complete;
3173
3174 void
3175 zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr)
3176 {
3177 vm_offset_t inner_size = zone->elem_size;
3178 if (__improbable(check_poison && addr)) {
3179 vm_offset_t *element_cursor = ((vm_offset_t *) addr) + 1;
3180 vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *) addr);
3181
3182 for (; element_cursor < backup; element_cursor++) {
3183 if (__improbable(*element_cursor != ZP_POISON)) {
3184 zone_element_was_modified_panic(zone,
3185 addr,
3186 *element_cursor,
3187 ZP_POISON,
3188 ((vm_offset_t)element_cursor) - addr);
3189 }
3190 }
3191 }
3192
3193 if (addr) {
3194 /*
3195 * Clear out the old next pointer and backup to avoid leaking the cookie
3196 * and so that only values on the freelist have a valid cookie
3197 */
3198
3199 vm_offset_t *primary = (vm_offset_t *) addr;
3200 vm_offset_t *backup = get_backup_ptr(inner_size, primary);
3201
3202 *primary = ZP_POISON;
3203 *backup = ZP_POISON;
3204 }
3205 }
3206
3207 /*
3208 * zalloc returns an element from the specified zone.
3209 */
3210 static void *
3211 zalloc_internal(
3212 zone_t zone,
3213 boolean_t canblock,
3214 boolean_t nopagewait,
3215 vm_size_t
3216 #if !VM_MAX_TAG_ZONES
3217 __unused
3218 #endif
3219 reqsize,
3220 vm_tag_t tag)
3221 {
3222 vm_offset_t addr = 0;
3223 kern_return_t retval;
3224 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
3225 unsigned int numsaved = 0;
3226 boolean_t zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE;
3227 thread_t thr = current_thread();
3228 boolean_t check_poison = FALSE;
3229 boolean_t set_doing_alloc_with_vm_priv = FALSE;
3230
3231 #if CONFIG_ZLEAKS
3232 uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */
3233 #endif /* CONFIG_ZLEAKS */
3234
3235 #if KASAN
3236 /*
3237 * KASan uses zalloc() for fakestack, which can be called anywhere. However,
3238 * we make sure these calls can never block.
3239 */
3240 boolean_t irq_safe = FALSE;
3241 const char *fakestack_name = "fakestack.";
3242 if (strncmp(zone->zone_name, fakestack_name, strlen(fakestack_name)) == 0) {
3243 irq_safe = TRUE;
3244 }
3245 #elif MACH_ASSERT
3246 /* In every other case, zalloc() from interrupt context is unsafe. */
3247 const boolean_t irq_safe = FALSE;
3248 #endif
3249
3250 assert(zone != ZONE_NULL);
3251 assert(irq_safe || ml_get_interrupts_enabled() || ml_is_quiescing() || debug_mode_active() || !early_boot_complete);
3252
3253 #if CONFIG_GZALLOC
3254 addr = gzalloc_alloc(zone, canblock);
3255 #endif
3256 /*
3257 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
3258 */
3259 if (__improbable(DO_LOGGING(zone))) {
3260 numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH);
3261 }
3262
3263 #if CONFIG_ZLEAKS
3264 /*
3265 * Zone leak detection: capture a backtrace every zleak_sample_factor
3266 * allocations in this zone.
3267 */
3268 if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
3269 /* Avoid backtracing twice if zone logging is on */
3270 if (numsaved == 0) {
3271 zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH);
3272 } else {
3273 zleak_tracedepth = numsaved;
3274 }
3275 }
3276 #endif /* CONFIG_ZLEAKS */
3277
3278 #if VM_MAX_TAG_ZONES
3279 if (__improbable(zone->tags)) {
3280 vm_tag_will_update_zone(tag, zone->tag_zone_index);
3281 }
3282 #endif /* VM_MAX_TAG_ZONES */
3283
3284 #if CONFIG_ZCACHE
3285 if (__probable(addr == 0)) {
3286 if (zone_caching_enabled(zone)) {
3287 addr = zcache_alloc_from_cpu_cache(zone);
3288 if (addr) {
3289 #if KASAN_ZALLOC
3290 addr = kasan_fixup_allocated_element_address(zone, addr);
3291 #endif
3292 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
3293 return (void *)addr;
3294 }
3295 }
3296 }
3297 #endif /* CONFIG_ZCACHE */
3298
3299 lock_zone(zone);
3300 assert(zone->zone_valid);
3301
3302 if (zone->async_prio_refill && zone->zone_replenish_thread) {
3303 vm_size_t zfreec = (zone->cur_size - (zone->count * zone->elem_size));
3304 vm_size_t zrefillwm = zone->prio_refill_watermark * zone->elem_size;
3305 zone_replenish_wakeup = (zfreec < zrefillwm);
3306 zone_alloc_throttle = (((zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0)) || (zfreec == 0));
3307
3308 do {
3309 if (zone_replenish_wakeup) {
3310 zone_replenish_wakeups_initiated++;
3311 /* Signal the potentially waiting
3312 * refill thread.
3313 */
3314 thread_wakeup(&zone->zone_replenish_thread);
3315
3316 /* We don't want to wait around for zone_replenish_thread to bump up the free count
3317 * if we're in zone_gc(). This keeps us from deadlocking with zone_replenish_thread.
3318 */
3319 if (thr->options & TH_OPT_ZONE_GC) {
3320 break;
3321 }
3322
3323 unlock_zone(zone);
3324 /* Scheduling latencies etc. may prevent
3325 * the refill thread from keeping up
3326 * with demand. Throttle consumers
3327 * when we fall below half the
3328 * watermark, unless VM privileged
3329 */
3330 if (zone_alloc_throttle) {
3331 zone_replenish_throttle_count++;
3332 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
3333 thread_block(THREAD_CONTINUE_NULL);
3334 }
3335 lock_zone(zone);
3336 assert(zone->zone_valid);
3337 }
3338
3339 zfreec = (zone->cur_size - (zone->count * zone->elem_size));
3340 zrefillwm = zone->prio_refill_watermark * zone->elem_size;
3341 zone_replenish_wakeup = (zfreec < zrefillwm);
3342 zone_alloc_throttle = (((zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0)) || (zfreec == 0));
3343 } while (zone_alloc_throttle == TRUE);
3344 }
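/*
 * Worked example of the refill/throttle thresholds above (illustrative
 * numbers only, not taken from any particular zone): for a zone with
 * prio_refill_watermark = 100 and elem_size = 64, the replenish thread is
 * woken whenever fewer than 100 * 64 = 6400 free bytes remain; callers
 * without TH_OPT_VMPRIV are throttled once free space drops below half of
 * that (3200 bytes), and any caller not in zone_gc() is throttled when it
 * reaches zero, each throttle pass sleeping up to 1 ms before re-checking.
 */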
3345
3346 if (__probable(addr == 0)) {
3347 addr = try_alloc_from_zone(zone, tag, &check_poison);
3348 }
3349
3350 /* If we're here because of zone_gc(), we didn't wait for zone_replenish_thread to finish.
3351 * So we need to ensure that we did successfully grab an element. And we only need to assert
3352 * this for zones that have a replenish thread configured (in this case, the Reserved VM map
3353 * entries zone).
3354 */
3355 if (thr->options & TH_OPT_ZONE_GC && zone->async_prio_refill) {
3356 assert(addr != 0);
3357 }
3358
3359 while ((addr == 0) && canblock) {
3360 /*
3361 * zone is empty, try to expand it
3362 *
3363 * Note that we now allow up to 2 threads (1 vm_privileged and 1 non-vm_privileged)
3364 * to expand the zone concurrently... this is necessary to keep
3365 * vm_privileged threads running critical code needed to continue compressing/swapping
3366 * pages (i.e. making new free pages) from stalling behind non-vm_privileged threads
3367 * waiting to acquire free pages when the vm_page_free_count is below the
3368 * vm_page_free_reserved limit.
3369 */
3370 if ((zone->doing_alloc_without_vm_priv || zone->doing_alloc_with_vm_priv) &&
3371 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->doing_alloc_with_vm_priv)) {
3372 /*
3373 * This is a non-vm_privileged thread and a non-vm_privileged or
3374 * a vm_privileged thread is already expanding the zone...
3375 * OR
3376 * this is a vm_privileged thread and a vm_privileged thread is
3377 * already expanding the zone...
3378 *
3379 * In either case wait for a thread to finish, then try again.
3380 */
3381 zone->waiting = TRUE;
3382 zone_sleep(zone);
3383 } else {
3384 vm_offset_t space;
3385 vm_size_t alloc_size;
3386 int retry = 0;
3387
3388 if ((zone->cur_size + zone->elem_size) >
3389 zone->max_size) {
3390 if (zone->exhaustible) {
3391 break;
3392 }
3393 if (zone->expandable) {
3394 /*
3395 * We're willing to overflow certain
3396 * zones, but not without complaining.
3397 *
3398 * This is best used in conjunction
3399 * with the collectable flag. What we
3400 * want is an assurance we can get the
3401 * memory back, assuming there's no
3402 * leak.
3403 */
3404 zone->max_size += (zone->max_size >> 1);
3405 } else {
3406 unlock_zone(zone);
3407
3408 panic_include_zprint = TRUE;
3409 #if CONFIG_ZLEAKS
3410 if (zleak_state & ZLEAK_STATE_ACTIVE) {
3411 panic_include_ztrace = TRUE;
3412 }
3413 #endif /* CONFIG_ZLEAKS */
3414 panic("zalloc: zone \"%s\" empty.", zone->zone_name);
3415 }
3416 }
3417 /*
3418 * It is possible that a background (BG) thread is refilling/expanding the zone
3419 * and gets pre-empted during that operation. That blocks all other
3420 * threads from making progress, leading to a watchdog timeout. To
3421 * avoid that, boost the thread priority using the rwlock boost.
3422 */
3423 set_thread_rwlock_boost();
3424
3425 if ((thr->options & TH_OPT_VMPRIV)) {
3426 zone->doing_alloc_with_vm_priv = TRUE;
3427 set_doing_alloc_with_vm_priv = TRUE;
3428 } else {
3429 zone->doing_alloc_without_vm_priv = TRUE;
3430 }
3431 unlock_zone(zone);
3432
3433 for (;;) {
3434 int zflags = KMA_KOBJECT | KMA_NOPAGEWAIT;
3435
3436 if (vm_pool_low() || retry >= 1) {
3437 alloc_size =
3438 round_page(zone->elem_size);
3439 } else {
3440 alloc_size = zone->alloc_size;
3441 }
3442
3443 if (zone->noencrypt) {
3444 zflags |= KMA_NOENCRYPT;
3445 }
3446
3447 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
3448 if (is_zone_map_nearing_exhaustion()) {
3449 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3450 }
3451
3452 retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
3453 if (retval == KERN_SUCCESS) {
3454 #if CONFIG_ZLEAKS
3455 if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) {
3456 if (zone_map->size >= zleak_global_tracking_threshold) {
3457 kern_return_t kr;
3458
3459 kr = zleak_activate();
3460 if (kr != KERN_SUCCESS) {
3461 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3462 }
3463 }
3464 }
3465
3466 if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) {
3467 if (zone->cur_size > zleak_per_zone_tracking_threshold) {
3468 zone->zleak_on = TRUE;
3469 }
3470 }
3471 #endif /* CONFIG_ZLEAKS */
3472 zcram(zone, space, alloc_size);
3473
3474 break;
3475 } else if (retval != KERN_RESOURCE_SHORTAGE) {
3476 retry++;
3477
3478 if (retry == 3) {
3479 panic_include_zprint = TRUE;
3480 #if CONFIG_ZLEAKS
3481 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3482 panic_include_ztrace = TRUE;
3483 }
3484 #endif /* CONFIG_ZLEAKS */
3485 if (retval == KERN_NO_SPACE) {
3486 zone_t zone_largest = zone_find_largest();
3487 panic("zalloc: zone map exhausted while allocating from zone %s, likely due to memory leak in zone %s (%lu total bytes, %d elements allocated)",
3488 zone->zone_name, zone_largest->zone_name,
3489 (unsigned long)zone_largest->cur_size, zone_largest->count);
3490 }
3491 panic("zalloc: \"%s\" (%d elements) retry fail %d", zone->zone_name, zone->count, retval);
3492 }
3493 } else {
3494 break;
3495 }
3496 }
3497 lock_zone(zone);
3498 assert(zone->zone_valid);
3499
3500 if (set_doing_alloc_with_vm_priv == TRUE) {
3501 zone->doing_alloc_with_vm_priv = FALSE;
3502 } else {
3503 zone->doing_alloc_without_vm_priv = FALSE;
3504 }
3505
3506 if (zone->waiting) {
3507 zone->waiting = FALSE;
3508 zone_wakeup(zone);
3509 }
3510 clear_thread_rwlock_boost();
3511
3512 addr = try_alloc_from_zone(zone, tag, &check_poison);
3513 if (addr == 0 &&
3514 retval == KERN_RESOURCE_SHORTAGE) {
3515 if (nopagewait == TRUE) {
3516 break; /* out of the main while loop */
3517 }
3518 unlock_zone(zone);
3519
3520 VM_PAGE_WAIT();
3521 lock_zone(zone);
3522 assert(zone->zone_valid);
3523 }
3524 }
3525 if (addr == 0) {
3526 addr = try_alloc_from_zone(zone, tag, &check_poison);
3527 }
3528 }
3529
3530 #if CONFIG_ZLEAKS
3531 /* Zone leak detection:
3532 * If we're sampling this allocation, add it to the zleaks hash table.
3533 */
3534 if (addr && zleak_tracedepth > 0) {
3535 /* Sampling can fail if another sample is happening at the same time in a different zone. */
3536 if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
3537 /* If it failed, roll back the counter so we sample the next allocation instead. */
3538 zone->zleak_capture = zleak_sample_factor;
3539 }
3540 }
3541 #endif /* CONFIG_ZLEAKS */
3542
3543
3544 if ((addr == 0) && (!canblock || nopagewait) && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
3545 zone->async_pending = TRUE;
3546 unlock_zone(zone);
3547 thread_call_enter(&call_async_alloc);
3548 lock_zone(zone);
3549 assert(zone->zone_valid);
3550 addr = try_alloc_from_zone(zone, tag, &check_poison);
3551 }
3552
3553 #if VM_MAX_TAG_ZONES
3554 if (__improbable(zone->tags) && addr) {
3555 if (reqsize) {
3556 reqsize = zone->elem_size - reqsize;
3557 }
3558 vm_tag_update_zone_size(tag, zone->tag_zone_index, zone->elem_size, reqsize);
3559 }
3560 #endif /* VM_MAX_TAG_ZONES */
3561
3562 unlock_zone(zone);
3563
3564 if (__improbable(DO_LOGGING(zone) && addr)) {
3565 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved);
3566 }
3567
3568 zalloc_poison_element(check_poison, zone, addr);
3569
3570 if (addr) {
3571 #if DEBUG || DEVELOPMENT
3572 if (__improbable(leak_scan_debug_flag && !(zone->elem_size & (sizeof(uintptr_t) - 1)))) {
3573 unsigned int count, idx;
3574 /* Fill element, from tail, with backtrace in reverse order */
3575 if (numsaved == 0) {
3576 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH);
3577 }
3578 count = (unsigned int)(zone->elem_size / sizeof(uintptr_t));
3579 if (count >= numsaved) {
3580 count = numsaved - 1;
3581 }
3582 for (idx = 0; idx < count; idx++) {
3583 ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
3584 }
3585 }
3586 #endif /* DEBUG || DEVELOPMENT */
3587 }
3588
3589 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);
3590
3591
3592 #if KASAN_ZALLOC
3593 addr = kasan_fixup_allocated_element_address(zone, addr);
3594 #endif
3595
3596 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
3597
3598 return (void *)addr;
3599 }
3600
3601 void *
3602 zalloc(zone_t zone)
3603 {
3604 return zalloc_internal(zone, TRUE, FALSE, 0, VM_KERN_MEMORY_NONE);
3605 }
3606
3607 void *
3608 zalloc_noblock(zone_t zone)
3609 {
3610 return zalloc_internal(zone, FALSE, FALSE, 0, VM_KERN_MEMORY_NONE);
3611 }
3612
3613 void *
3614 zalloc_nopagewait(zone_t zone)
3615 {
3616 return zalloc_internal(zone, TRUE, TRUE, 0, VM_KERN_MEMORY_NONE);
3617 }
3618
3619 void *
3620 zalloc_canblock_tag(zone_t zone, boolean_t canblock, vm_size_t reqsize, vm_tag_t tag)
3621 {
3622 return zalloc_internal(zone, canblock, FALSE, reqsize, tag);
3623 }
3624
3625 void *
3626 zalloc_canblock(zone_t zone, boolean_t canblock)
3627 {
3628 return zalloc_internal(zone, canblock, FALSE, 0, VM_KERN_MEMORY_NONE);
3629 }
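/*
 * Informal summary of the wrappers above (behavior follows directly from the
 * zalloc_internal() arguments they pass):
 *	zalloc()              - may block, and may wait for free pages (VM_PAGE_WAIT)
 *	zalloc_noblock()      - canblock == FALSE: will not block waiting for zone expansion
 *	zalloc_nopagewait()   - may block, but bails out rather than calling VM_PAGE_WAIT()
 *	zalloc_canblock()     - caller chooses the blocking behavior
 *	zalloc_canblock_tag() - as above, plus a vm_tag and requested size for
 *	                        tagged-zone accounting (VM_MAX_TAG_ZONES)
 */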
3630
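/*
 * zalloc_attempt()/zfree_direct() (descriptive note): minimal entry points
 * that move a single element directly between the caller and the zone's
 * freelists, applying only element poisoning; they skip the logging,
 * throttling, zone expansion and per-CPU cache paths taken by
 * zalloc_internal() and zfree().
 */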
3631 void *
3632 zalloc_attempt(zone_t zone)
3633 {
3634 boolean_t check_poison = FALSE;
3635 vm_offset_t addr = try_alloc_from_zone(zone, VM_KERN_MEMORY_NONE, &check_poison);
3636 zalloc_poison_element(check_poison, zone, addr);
3637 return (void *)addr;
3638 }
3639
3640 void
3641 zfree_direct(zone_t zone, vm_offset_t elem)
3642 {
3643 boolean_t poison = zfree_poison_element(zone, elem);
3644 free_to_zone(zone, elem, poison);
3645 }
3646
3647
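/*
 * zalloc_async() (descriptive note): thread-call handler driven through
 * call_async_alloc (see the thread_call_enter() in zalloc_internal() above).
 * For each zone with async_pending set, it performs a throwaway
 * zalloc_canblock_tag()/zfree() pair; the blocking allocation is what
 * actually expands the zone so that later non-blocking callers can find
 * free elements.
 */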
3648 void
3649 zalloc_async(
3650 __unused thread_call_param_t p0,
3651 __unused thread_call_param_t p1)
3652 {
3653 zone_t current_z = NULL;
3654 unsigned int max_zones, i;
3655 void *elt = NULL;
3656 boolean_t pending = FALSE;
3657
3658 simple_lock(&all_zones_lock, &zone_locks_grp);
3659 max_zones = num_zones;
3660 simple_unlock(&all_zones_lock);
3661 for (i = 0; i < max_zones; i++) {
3662 current_z = &(zone_array[i]);
3663
3664 if (current_z->no_callout == TRUE) {
3665 /* async_pending will never be set */
3666 continue;
3667 }
3668
3669 lock_zone(current_z);
3670 if (current_z->zone_valid && current_z->async_pending == TRUE) {
3671 current_z->async_pending = FALSE;
3672 pending = TRUE;
3673 }
3674 unlock_zone(current_z);
3675
3676 if (pending == TRUE) {
3677 elt = zalloc_canblock_tag(current_z, TRUE, 0, VM_KERN_MEMORY_OSFMK);
3678 zfree(current_z, elt);
3679 pending = FALSE;
3680 }
3681 }
3682 }
3683
3684 /*
3685 * zget returns an element from the specified zone, or NULL immediately
3686 * if none is free; it does not block to expand the zone or wait for free pages.
3687 */
3688 void *
3689 zget(
3690 zone_t zone)
3691 {
3692 return zalloc_internal(zone, FALSE, TRUE, 0, VM_KERN_MEMORY_NONE);
3693 }
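/*
 * Usage sketch (illustrative only; "widget_zone" and struct widget are
 * hypothetical and not defined in this file):
 *
 *	struct widget *w = (struct widget *) zget(widget_zone);
 *	if (w == NULL) {
 *		return KERN_RESOURCE_SHORTAGE;	// hypothetical error path: zget() will not wait
 *	}
 *	// ... use *w ...
 *	zfree(widget_zone, w);
 */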
3694
3695 /* Keep this FALSE by default: large-memory machines run orders of magnitude
3696 * slower in debug mode when it is TRUE. Use the debugger to enable if needed. */
3697 /* static */ boolean_t zone_check = FALSE;
3698
3699 static void
3700 zone_check_freelist(zone_t zone, vm_offset_t elem)
3701 {
3702 struct zone_free_element *this;
3703 struct zone_page_metadata *thispage;
3704
3705 if (zone->allows_foreign) {
3706 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
3707 !queue_end(&zone->pages.any_free_foreign, &(thispage->pages));
3708 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3709 for (this = page_metadata_get_freelist(thispage);
3710 this != NULL;
3711 this = this->next) {
3712 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3713 panic("zone_check_freelist");
3714 }
3715 }
3716 }
3717 }
3718 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
3719 !queue_end(&zone->pages.all_free, &(thispage->pages));
3720 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3721 for (this = page_metadata_get_freelist(thispage);
3722 this != NULL;
3723 this = this->next) {
3724 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3725 panic("zone_check_freelist");
3726 }
3727 }
3728 }
3729 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
3730 !queue_end(&zone->pages.intermediate, &(thispage->pages));
3731 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3732 for (this = page_metadata_get_freelist(thispage);
3733 this != NULL;
3734 this = this->next) {
3735 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3736 panic("zone_check_freelist");
3737 }
3738 }
3739 }
3740 }
3741
3742 boolean_t
3743 zfree_poison_element(zone_t zone, vm_offset_t elem)
3744 {
3745 boolean_t poison = FALSE;
3746 if (zp_factor != 0 || zp_tiny_zone_limit != 0) {
3747 /*
3748 * Poison the memory before it ends up on the freelist to catch
3749 * use-after-free and use of uninitialized memory
3750 *
3751 * Always poison tiny zones' elements (limit is 0 if -no-zp is set)
3752 * Also poison larger elements periodically
3753 */
3754
3755 vm_offset_t inner_size = zone->elem_size;
3756
3757 uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);
3758
3759 if (inner_size <= zp_tiny_zone_limit) {
3760 poison = TRUE;
3761 } else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE) {
3762 poison = TRUE;
3763 }
3764
3765 if (__improbable(poison)) {
3766 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
3767 /* Poison everything but primary and backup */
3768 vm_offset_t *element_cursor = ((vm_offset_t *) elem) + 1;
3769 vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *)elem);
3770
3771 for (; element_cursor < backup; element_cursor++) {
3772 *element_cursor = ZP_POISON;
3773 }
3774 }
3775 }
3776 return poison;
3777 }
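/*
 * Worked example for the sampling above (illustrative; the zp_factor and
 * zp_scale values are assumed, typical boot-args-tunable defaults, not read
 * from this build): with zp_factor = 16 and zp_scale = 4, a 256-byte element
 * gives sample_factor = 16 + (256 >> 4) = 32, so roughly one in every 32
 * frees of that zone has its body (everything except the primary and backup
 * freelist pointers) filled with ZP_POISON. Elements no larger than
 * zp_tiny_zone_limit are poisoned on every free.
 */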
3778 void
3779 (zfree)(
3780 zone_t zone,
3781 void *addr)
3782 {
3783 vm_offset_t elem = (vm_offset_t) addr;
3784 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */
3785 unsigned int numsaved = 0;
3786 boolean_t gzfreed = FALSE;
3787 boolean_t poison = FALSE;
3788 #if VM_MAX_TAG_ZONES
3789 vm_tag_t tag;
3790 #endif /* VM_MAX_TAG_ZONES */
3791
3792 assert(zone != ZONE_NULL);
3793 DTRACE_VM2(zfree, zone_t, zone, void*, addr);
3794 #if KASAN_ZALLOC
3795 if (kasan_quarantine_freed_element(&zone, &addr)) {
3796 return;
3797 }
3798 elem = (vm_offset_t)addr;
3799 #endif
3800
3801 /*
3802 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
3803 */
3804
3805 if (__improbable(DO_LOGGING(zone) && corruption_debug_flag)) {
3806 numsaved = OSBacktrace((void *)zbt, MAX_ZTRACE_DEPTH);
3807 }
3808
3809 #if MACH_ASSERT
3810 /* Basic sanity checks */
3811 if (zone == ZONE_NULL || elem == (vm_offset_t)0) {
3812 panic("zfree: NULL");
3813 }
3814 #endif
3815
3816 #if CONFIG_GZALLOC
3817 gzfreed = gzalloc_free(zone, addr);
3818 #endif
3819
3820 if (!gzfreed) {
3821 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
3822 if (zone != PAGE_METADATA_GET_ZONE(page_meta)) {
3823 panic("Element %p from zone %s caught being freed to wrong zone %s\n", addr, PAGE_METADATA_GET_ZONE(page_meta)->zone_name, zone->zone_name);
3824 }
3825 }
3826
3827 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr);
3828
3829 if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign &&
3830 !from_zone_map(elem, zone->elem_size))) {
3831 panic("zfree: non-allocated memory in collectable zone!");
3832 }
3833
3834 if (!gzfreed) {
3835 poison = zfree_poison_element(zone, elem);
3836 }
3837
3838 /*
3839 * See if we're doing logging on this zone. There are two styles of logging used depending on
3840 * whether we're trying to catch a leak or corruption. See comments above in zalloc for details.
3841 */
3842
3843 if (__improbable(DO_LOGGING(zone))) {
3844 if (corruption_debug_flag) {
3845 /*
3846 * We're logging to catch a corruption. Add a record of this zfree operation
3847 * to log.
3848 */
3849 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved);
3850 } else {
3851 /*
3852 * We're logging to catch a leak. Remove any record we might have for this
3853 * element since it's being freed. Note that we may not find it if the buffer
3854 * overflowed and that's OK. Since the log is of a limited size, old records
3855 * get overwritten if there are more zallocs than zfrees.
3856 */
3857 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
3858 }
3859 }
3860
3861 #if CONFIG_ZCACHE
3862 if (zone_caching_enabled(zone)) {
3863 int __assert_only ret = zcache_free_to_cpu_cache(zone, addr);
3864 assert(ret != FALSE);
3865 return;
3866 }
3867 #endif /* CONFIG_ZCACHE */
3868
3869 lock_zone(zone);
3870 assert(zone->zone_valid);
3871
3872 if (zone_check) {
3873 zone_check_freelist(zone, elem);
3874 }
3875
3876 if (__probable(!gzfreed)) {
3877 #if VM_MAX_TAG_ZONES
3878 if (__improbable(zone->tags)) {
3879 tag = (ZTAG(zone, elem)[0] >> 1);
3880 // set the tag with bit 0 (b0) clear so the block remains in use
3881 ZTAG(zone, elem)[0] = 0xFFFE;
3882 }
3883 #endif /* VM_MAX_TAG_ZONES */
3884 free_to_zone(zone, elem, poison);
3885 }
3886
3887 if (__improbable(zone->count < 0)) {
3888 panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone",
3889 zone->zone_name, addr);
3890 }
3891
3892 #if CONFIG_ZLEAKS
3893 /*
3894 * Zone leak detection: un-track the allocation
3895 */
3896 if (zone->zleak_on) {
3897 zleak_free(elem, zone->elem_size);
3898 }
3899 #endif /* CONFIG_ZLEAKS */
3900
3901 #if VM_MAX_TAG_ZONES
3902 if (__improbable(zone->tags) && __probable(!gzfreed)) {
3903 vm_tag_update_zone_size(tag, zone->tag_zone_index, -((int64_t)zone->elem_size), 0);
3904 }
3905 #endif /* VM_MAX_TAG_ZONES */
3906
3907 unlock_zone(zone);
3908 }
3909
3910 /* Change a zone's flags.
3911 * This routine must be called immediately after zinit.
3912 */
3913 void
3914 zone_change(
3915 zone_t zone,
3916 unsigned int item,
3917 boolean_t value)
3918 {
3919 assert( zone != ZONE_NULL );
3920 assert( value == TRUE || value == FALSE );
3921
3922 switch (item) {
3923 case Z_NOENCRYPT:
3924 zone->noencrypt = value;
3925 break;
3926 case Z_EXHAUST:
3927 zone->exhaustible = value;
3928 break;
3929 case Z_COLLECT:
3930 zone->collectable = value;
3931 break;
3932 case Z_EXPAND:
3933 zone->expandable = value;
3934 break;
3935 case Z_FOREIGN:
3936 zone->allows_foreign = value;
3937 break;
3938 case Z_CALLERACCT:
3939 zone->caller_acct = value;
3940 break;
3941 case Z_NOCALLOUT:
3942 zone->no_callout = value;
3943 break;
3944 case Z_TAGS_ENABLED:
3945 #if VM_MAX_TAG_ZONES
3946 {
3947 static int tag_zone_index;
3948 zone->tags = TRUE;
3949 zone->tags_inline = (((page_size + zone->elem_size - 1) / zone->elem_size) <= (sizeof(uint32_t) / sizeof(uint16_t)));
3950 zone->tag_zone_index = OSAddAtomic(1, &tag_zone_index);
3951 }
3952 #endif /* VM_MAX_TAG_ZONES */
3953 break;
3954 case Z_GZALLOC_EXEMPT:
3955 zone->gzalloc_exempt = value;
3956 #if CONFIG_GZALLOC
3957 gzalloc_reconfigure(zone);
3958 #endif
3959 break;
3960 case Z_ALIGNMENT_REQUIRED:
3961 zone->alignment_required = value;
3962 #if KASAN_ZALLOC
3963 if (zone->kasan_redzone == KASAN_GUARD_SIZE) {
3964 /* Don't disturb alignment with the redzone for zones with
3965 * specific alignment requirements. */
3966 zone->elem_size -= zone->kasan_redzone * 2;
3967 zone->kasan_redzone = 0;
3968 }
3969 #endif
3970 #if CONFIG_GZALLOC
3971 gzalloc_reconfigure(zone);
3972 #endif
3973 break;
3974 case Z_KASAN_QUARANTINE:
3975 zone->kasan_quarantine = value;
3976 break;
3977 case Z_CACHING_ENABLED:
3978 #if CONFIG_ZCACHE
3979 if (value == TRUE && use_caching) {
3980 if (zcache_ready()) {
3981 zcache_init(zone);
3982 } else {
3983 zone->cpu_cache_enable_when_ready = TRUE;
3984 }
3985 }
3986 #endif
3987 break;
3988 default:
3989 panic("Zone_change: Wrong Item Type!");
3990 /* break; */
3991 }
3992 }
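/*
 * Usage sketch (illustrative; "widget_zone" and struct widget are
 * hypothetical): zone_change() is intended to be called right after zinit(),
 * before the zone is used, e.g.:
 *
 *	widget_zone = zinit(sizeof(struct widget), 64 * sizeof(struct widget),
 *	    sizeof(struct widget), "widgets");
 *	zone_change(widget_zone, Z_NOENCRYPT, TRUE);
 *	zone_change(widget_zone, Z_NOCALLOUT, TRUE);
 */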
3993
3994 /*
3995 * Return the expected number of free elements in the zone.
3996 * This calculation will be incorrect if items are zfree'd that
3997 * were never zalloc'd/zget'd. The correct way to stuff memory
3998 * into a zone is by zcram.
3999 */
4000
4001 integer_t
4002 zone_free_count(zone_t zone)
4003 {
4004 integer_t free_count;
4005
4006 lock_zone(zone);
4007 free_count = zone->countfree;
4008 unlock_zone(zone);
4009
4010 assert(free_count >= 0);
4011
4012 return free_count;
4013 }
4014
4015 /* Drops the elements in the free queue of a zone. Called by zone_gc() on each zone, and when a zone is zdestroy'ed. */
4016 void
4017 drop_free_elements(zone_t z)
4018 {
4019 vm_size_t elt_size, size_freed;
4020 unsigned int total_freed_pages = 0;
4021 uint64_t old_all_free_count;
4022 struct zone_page_metadata *page_meta;
4023 queue_head_t page_meta_head;
4024
4025 lock_zone(z);
4026 if (queue_empty(&z->pages.all_free)) {
4027 unlock_zone(z);
4028 return;
4029 }
4030
4031 /*
4032 * Snatch all of the free elements away from the zone.
4033 */
4034 elt_size = z->elem_size;
4035 old_all_free_count = z->count_all_free_pages;
4036 queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages);
4037 queue_init(&z->pages.all_free);
4038 z->count_all_free_pages = 0;
4039 unlock_zone(z);
4040
4041 /* Walk the pages we snatched to total up the size (and thus the count) of the free elements they hold */
4042 size_freed = 0;
4043 queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) {
4044 assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */
4045 size_freed += elt_size * page_meta->free_count;
4046 }
4047
4048 /* Update the zone size and free element count */
4049 lock_zone(z);
4050 z->cur_size -= size_freed;
4051 z->countfree -= size_freed / elt_size;
4052 unlock_zone(z);
4053
4054 while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) {
4055 vm_address_t free_page_address;
4056 /* Free the pages for metadata and account for them */
4057 free_page_address = get_zone_page(page_meta);
4058 ZONE_PAGE_COUNT_DECR(z, page_meta->page_count);
4059 total_freed_pages += page_meta->page_count;
4060 old_all_free_count -= page_meta->page_count;
4061 #if KASAN_ZALLOC
4062 kasan_poison_range(free_page_address, page_meta->page_count * PAGE_SIZE, ASAN_VALID);
4063 #endif
4064 #if VM_MAX_TAG_ZONES
4065 if (z->tags) {
4066 ztMemoryRemove(z, free_page_address, (page_meta->page_count * PAGE_SIZE));
4067 }
4068 #endif /* VM_MAX_TAG_ZONES */
4069 kmem_free(zone_map, free_page_address, (page_meta->page_count * PAGE_SIZE));
4070 if (current_thread()->options & TH_OPT_ZONE_GC) {
4071 thread_yield_to_preemption();
4072 }
4073 }
4074
4075 /* We freed all the pages from the all_free list for this zone */
4076 assert(old_all_free_count == 0);
4077
4078 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
4079 kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed / elt_size, total_freed_pages);
4080 }
4081 }
4082
4083 /* Zone garbage collection
4084 *
4085 * zone_gc will walk through all the free elements in all the
4086 * zones that are marked collectable looking for reclaimable
4087 * pages. zone_gc is called by consider_zone_gc when the system
4088 * begins to run out of memory.
4089 *
4090 * We should ensure that zone_gc never blocks.
4091 */
4092 void
4093 zone_gc(boolean_t consider_jetsams)
4094 {
4095 unsigned int max_zones;
4096 zone_t z;
4097 unsigned int i;
4098
4099 if (consider_jetsams) {
4100 kill_process_in_largest_zone();
4101 /*
4102 * If we do end up jetsamming something, we need to do a zone_gc so that
4103 * we can reclaim free zone elements and update the zone map size.
4104 * Fall through.
4105 */
4106 }
4107
4108 lck_mtx_lock(&zone_gc_lock);
4109
4110 current_thread()->options |= TH_OPT_ZONE_GC;
4111
4112 simple_lock(&all_zones_lock, &zone_locks_grp);
4113 max_zones = num_zones;
4114 simple_unlock(&all_zones_lock);
4115
4116 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
4117 kprintf("zone_gc() starting...\n");
4118 }
4119
4120 for (i = 0; i < max_zones; i++) {
4121 z = &(zone_array[i]);
4122 assert(z != ZONE_NULL);
4123
4124 if (!z->collectable) {
4125 continue;
4126 }
4127 #if CONFIG_ZCACHE
4128 if (zone_caching_enabled(z)) {
4129 zcache_drain_depot(z);
4130 }
4131 #endif /* CONFIG_ZCACHE */
4132 if (queue_empty(&z->pages.all_free)) {
4133 continue;
4134 }
4135
4136 drop_free_elements(z);
4137 }
4138
4139 current_thread()->options &= ~TH_OPT_ZONE_GC;
4140
4141 lck_mtx_unlock(&zone_gc_lock);
4142 }
4143
4144 extern vm_offset_t kmapoff_kaddr;
4145 extern unsigned int kmapoff_pgcnt;
4146
4147 /*
4148 * consider_zone_gc:
4149 *
4150 * Called by the pageout daemon when the system needs more free pages.
4151 */
4152
4153 void
4154 consider_zone_gc(boolean_t consider_jetsams)
4155 {
4156 if (kmapoff_kaddr != 0) {
4157 /*
4158 * One-time reclaim of kernel_map resources we allocated in
4159 * early boot.
4160 */
4161 (void) vm_deallocate(kernel_map,
4162 kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64);
4163 kmapoff_kaddr = 0;
4164 }
4165
4166 if (zone_gc_allowed) {
4167 zone_gc(consider_jetsams);
4168 }
4169 }
4170
4171 /*
4172 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
4173 * requesting zone information.
4174 * Frees unused pages towards the end of the region, and zeroes out unused
4175 * space on the last page.
4176 */
4177 vm_map_copy_t
4178 create_vm_map_copy(
4179 vm_offset_t start_addr,
4180 vm_size_t total_size,
4181 vm_size_t used_size)
4182 {
4183 kern_return_t kr;
4184 vm_offset_t end_addr;
4185 vm_size_t free_size;
4186 vm_map_copy_t copy;
4187
4188 if (used_size != total_size) {
4189 end_addr = start_addr + used_size;
4190 free_size = total_size - (round_page(end_addr) - start_addr);
4191
4192 if (free_size >= PAGE_SIZE) {
4193 kmem_free(ipc_kernel_map,
4194 round_page(end_addr), free_size);
4195 }
4196 bzero((char *) end_addr, round_page(end_addr) - end_addr);
4197 }
4198
4199 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
4200 (vm_map_size_t)used_size, TRUE, &copy);
4201 assert(kr == KERN_SUCCESS);
4202
4203 return copy;
4204 }
4205
4206 boolean_t
4207 get_zone_info(
4208 zone_t z,
4209 mach_zone_name_t *zn,
4210 mach_zone_info_t *zi)
4211 {
4212 struct zone zcopy;
4213
4214 assert(z != ZONE_NULL);
4215 lock_zone(z);
4216 if (!z->zone_valid) {
4217 unlock_zone(z);
4218 return FALSE;
4219 }
4220 zcopy = *z;
4221 unlock_zone(z);
4222
4223 if (zn != NULL) {
4224 /* assuming here the name data is static */
4225 (void) __nosan_strlcpy(zn->mzn_name, zcopy.zone_name,
4226 strlen(zcopy.zone_name) + 1);
4227 }
4228
4229 if (zi != NULL) {
4230 zi->mzi_count = (uint64_t)zcopy.count;
4231 zi->mzi_cur_size = ptoa_64(zcopy.page_count);
4232 zi->mzi_max_size = (uint64_t)zcopy.max_size;
4233 zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
4234 zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
4235 zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
4236 zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
4237 zi->mzi_collectable = 0;
4238 if (zcopy.collectable) {
4239 SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable, ((uint64_t)zcopy.count_all_free_pages * PAGE_SIZE));
4240 SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
4241 }
4242 }
4243
4244 return TRUE;
4245 }
4246
4247 kern_return_t
4248 task_zone_info(
4249 __unused task_t task,
4250 __unused mach_zone_name_array_t *namesp,
4251 __unused mach_msg_type_number_t *namesCntp,
4252 __unused task_zone_info_array_t *infop,
4253 __unused mach_msg_type_number_t *infoCntp)
4254 {
4255 return KERN_FAILURE;
4256 }
4257
4258 kern_return_t
4259 mach_zone_info(
4260 host_priv_t host,
4261 mach_zone_name_array_t *namesp,
4262 mach_msg_type_number_t *namesCntp,
4263 mach_zone_info_array_t *infop,
4264 mach_msg_type_number_t *infoCntp)
4265 {
4266 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
4267 }
4268
4269
4270 kern_return_t
4271 mach_memory_info(
4272 host_priv_t host,
4273 mach_zone_name_array_t *namesp,
4274 mach_msg_type_number_t *namesCntp,
4275 mach_zone_info_array_t *infop,
4276 mach_msg_type_number_t *infoCntp,
4277 mach_memory_info_array_t *memoryInfop,
4278 mach_msg_type_number_t *memoryInfoCntp)
4279 {
4280 mach_zone_name_t *names;
4281 vm_offset_t names_addr;
4282 vm_size_t names_size;
4283
4284 mach_zone_info_t *info;
4285 vm_offset_t info_addr;
4286 vm_size_t info_size;
4287
4288 mach_memory_info_t *memory_info;
4289 vm_offset_t memory_info_addr;
4290 vm_size_t memory_info_size;
4291 vm_size_t memory_info_vmsize;
4292 unsigned int num_info;
4293
4294 unsigned int max_zones, used_zones, i;
4295 mach_zone_name_t *zn;
4296 mach_zone_info_t *zi;
4297 kern_return_t kr;
4298
4299 uint64_t zones_collectable_bytes = 0;
4300
4301 if (host == HOST_NULL) {
4302 return KERN_INVALID_HOST;
4303 }
4304 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4305 if (!PE_i_can_has_debugger(NULL)) {
4306 return KERN_INVALID_HOST;
4307 }
4308 #endif
4309
4310 /*
4311 * We assume that zones aren't freed once allocated.
4312 * We won't pick up any zones that are allocated later.
4313 */
4314
4315 simple_lock(&all_zones_lock, &zone_locks_grp);
4316 max_zones = (unsigned int)(num_zones);
4317 simple_unlock(&all_zones_lock);
4318
4319 names_size = round_page(max_zones * sizeof *names);
4320 kr = kmem_alloc_pageable(ipc_kernel_map,
4321 &names_addr, names_size, VM_KERN_MEMORY_IPC);
4322 if (kr != KERN_SUCCESS) {
4323 return kr;
4324 }
4325 names = (mach_zone_name_t *) names_addr;
4326
4327 info_size = round_page(max_zones * sizeof *info);
4328 kr = kmem_alloc_pageable(ipc_kernel_map,
4329 &info_addr, info_size, VM_KERN_MEMORY_IPC);
4330 if (kr != KERN_SUCCESS) {
4331 kmem_free(ipc_kernel_map,
4332 names_addr, names_size);
4333 return kr;
4334 }
4335 info = (mach_zone_info_t *) info_addr;
4336
4337 zn = &names[0];
4338 zi = &info[0];
4339
4340 used_zones = max_zones;
4341 for (i = 0; i < max_zones; i++) {
4342 if (!get_zone_info(&(zone_array[i]), zn, zi)) {
4343 used_zones--;
4344 continue;
4345 }
4346 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
4347 zn++;
4348 zi++;
4349 }
4350
4351 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
4352 *namesCntp = used_zones;
4353
4354 *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
4355 *infoCntp = used_zones;
4356
4357 num_info = 0;
4358 memory_info_addr = 0;
4359
4360 if (memoryInfop && memoryInfoCntp) {
4361 vm_map_copy_t copy;
4362 num_info = vm_page_diagnose_estimate();
4363 memory_info_size = num_info * sizeof(*memory_info);
4364 memory_info_vmsize = round_page(memory_info_size);
4365 kr = kmem_alloc_pageable(ipc_kernel_map,
4366 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
4367 if (kr != KERN_SUCCESS) {
4368 return kr;
4369 }
4370
4371 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
4372 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
4373 assert(kr == KERN_SUCCESS);
4374
4375 memory_info = (mach_memory_info_t *) memory_info_addr;
4376 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
4377
4378 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
4379 assert(kr == KERN_SUCCESS);
4380
4381 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
4382 (vm_map_size_t)memory_info_size, TRUE, &copy);
4383 assert(kr == KERN_SUCCESS);
4384
4385 *memoryInfop = (mach_memory_info_t *) copy;
4386 *memoryInfoCntp = num_info;
4387 }
4388
4389 return KERN_SUCCESS;
4390 }
4391
4392 kern_return_t
4393 mach_zone_info_for_zone(
4394 host_priv_t host,
4395 mach_zone_name_t name,
4396 mach_zone_info_t *infop)
4397 {
4398 unsigned int max_zones, i;
4399 zone_t zone_ptr;
4400
4401 if (host == HOST_NULL) {
4402 return KERN_INVALID_HOST;
4403 }
4404 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4405 if (!PE_i_can_has_debugger(NULL)) {
4406 return KERN_INVALID_HOST;
4407 }
4408 #endif
4409
4410 if (infop == NULL) {
4411 return KERN_INVALID_ARGUMENT;
4412 }
4413
4414 simple_lock(&all_zones_lock, &zone_locks_grp);
4415 max_zones = (unsigned int)(num_zones);
4416 simple_unlock(&all_zones_lock);
4417
4418 zone_ptr = ZONE_NULL;
4419 for (i = 0; i < max_zones; i++) {
4420 zone_t z = &(zone_array[i]);
4421 assert(z != ZONE_NULL);
4422
4423 /* Find the requested zone by name */
4424 if (track_this_zone(z->zone_name, name.mzn_name)) {
4425 zone_ptr = z;
4426 break;
4427 }
4428 }
4429
4430 /* No zones found with the requested zone name */
4431 if (zone_ptr == ZONE_NULL) {
4432 return KERN_INVALID_ARGUMENT;
4433 }
4434
4435 if (get_zone_info(zone_ptr, NULL, infop)) {
4436 return KERN_SUCCESS;
4437 }
4438 return KERN_FAILURE;
4439 }
4440
4441 kern_return_t
4442 mach_zone_info_for_largest_zone(
4443 host_priv_t host,
4444 mach_zone_name_t *namep,
4445 mach_zone_info_t *infop)
4446 {
4447 if (host == HOST_NULL) {
4448 return KERN_INVALID_HOST;
4449 }
4450 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4451 if (!PE_i_can_has_debugger(NULL)) {
4452 return KERN_INVALID_HOST;
4453 }
4454 #endif
4455
4456 if (namep == NULL || infop == NULL) {
4457 return KERN_INVALID_ARGUMENT;
4458 }
4459
4460 if (get_zone_info(zone_find_largest(), namep, infop)) {
4461 return KERN_SUCCESS;
4462 }
4463 return KERN_FAILURE;
4464 }
4465
4466 uint64_t
4467 get_zones_collectable_bytes(void)
4468 {
4469 unsigned int i, max_zones;
4470 uint64_t zones_collectable_bytes = 0;
4471 mach_zone_info_t zi;
4472
4473 simple_lock(&all_zones_lock, &zone_locks_grp);
4474 max_zones = (unsigned int)(num_zones);
4475 simple_unlock(&all_zones_lock);
4476
4477 for (i = 0; i < max_zones; i++) {
4478 if (get_zone_info(&(zone_array[i]), NULL, &zi)) {
4479 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
4480 }
4481 }
4482
4483 return zones_collectable_bytes;
4484 }
4485
4486 kern_return_t
4487 mach_zone_get_zlog_zones(
4488 host_priv_t host,
4489 mach_zone_name_array_t *namesp,
4490 mach_msg_type_number_t *namesCntp)
4491 {
4492 #if DEBUG || DEVELOPMENT
4493 unsigned int max_zones, logged_zones, i;
4494 kern_return_t kr;
4495 zone_t zone_ptr;
4496 mach_zone_name_t *names;
4497 vm_offset_t names_addr;
4498 vm_size_t names_size;
4499
4500 if (host == HOST_NULL) {
4501 return KERN_INVALID_HOST;
4502 }
4503
4504 if (namesp == NULL || namesCntp == NULL) {
4505 return KERN_INVALID_ARGUMENT;
4506 }
4507
4508 simple_lock(&all_zones_lock, &zone_locks_grp);
4509 max_zones = (unsigned int)(num_zones);
4510 simple_unlock(&all_zones_lock);
4511
4512 names_size = round_page(max_zones * sizeof *names);
4513 kr = kmem_alloc_pageable(ipc_kernel_map,
4514 &names_addr, names_size, VM_KERN_MEMORY_IPC);
4515 if (kr != KERN_SUCCESS) {
4516 return kr;
4517 }
4518 names = (mach_zone_name_t *) names_addr;
4519
4520 zone_ptr = ZONE_NULL;
4521 logged_zones = 0;
4522 for (i = 0; i < max_zones; i++) {
4523 zone_t z = &(zone_array[i]);
4524 assert(z != ZONE_NULL);
4525
4526 /* Copy out the zone name if zone logging is enabled */
4527 if (z->zlog_btlog) {
4528 get_zone_info(z, &names[logged_zones], NULL);
4529 logged_zones++;
4530 }
4531 }
4532
4533 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
4534 *namesCntp = logged_zones;
4535
4536 return KERN_SUCCESS;
4537
4538 #else /* DEBUG || DEVELOPMENT */
4539 #pragma unused(host, namesp, namesCntp)
4540 return KERN_FAILURE;
4541 #endif /* DEBUG || DEVELOPMENT */
4542 }
4543
4544 kern_return_t
4545 mach_zone_get_btlog_records(
4546 host_priv_t host,
4547 mach_zone_name_t name,
4548 zone_btrecord_array_t *recsp,
4549 mach_msg_type_number_t *recsCntp)
4550 {
4551 #if DEBUG || DEVELOPMENT
4552 unsigned int max_zones, i, numrecs = 0;
4553 zone_btrecord_t *recs;
4554 kern_return_t kr;
4555 zone_t zone_ptr;
4556 vm_offset_t recs_addr;
4557 vm_size_t recs_size;
4558
4559 if (host == HOST_NULL) {
4560 return KERN_INVALID_HOST;
4561 }
4562
4563 if (recsp == NULL || recsCntp == NULL) {
4564 return KERN_INVALID_ARGUMENT;
4565 }
4566
4567 simple_lock(&all_zones_lock, &zone_locks_grp);
4568 max_zones = (unsigned int)(num_zones);
4569 simple_unlock(&all_zones_lock);
4570
4571 zone_ptr = ZONE_NULL;
4572 for (i = 0; i < max_zones; i++) {
4573 zone_t z = &(zone_array[i]);
4574 assert(z != ZONE_NULL);
4575
4576 /* Find the requested zone by name */
4577 if (track_this_zone(z->zone_name, name.mzn_name)) {
4578 zone_ptr = z;
4579 break;
4580 }
4581 }
4582
4583 /* No zones found with the requested zone name */
4584 if (zone_ptr == ZONE_NULL) {
4585 return KERN_INVALID_ARGUMENT;
4586 }
4587
4588 /* Logging not turned on for the requested zone */
4589 if (!DO_LOGGING(zone_ptr)) {
4590 return KERN_FAILURE;
4591 }
4592
4593 /* Allocate memory for btlog records */
4594 numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
4595 recs_size = round_page(numrecs * sizeof *recs);
4596
4597 kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
4598 if (kr != KERN_SUCCESS) {
4599 return kr;
4600 }
4601
4602 /*
4603 * We will call get_btlog_records() below which populates this region while holding a spinlock
4604 * (the btlog lock). So these pages need to be wired.
4605 */
4606 kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
4607 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
4608 assert(kr == KERN_SUCCESS);
4609
4610 recs = (zone_btrecord_t *)recs_addr;
4611 get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
4612
4613 kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
4614 assert(kr == KERN_SUCCESS);
4615
4616 *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
4617 *recsCntp = numrecs;
4618
4619 return KERN_SUCCESS;
4620
4621 #else /* DEBUG || DEVELOPMENT */
4622 #pragma unused(host, name, recsp, recsCntp)
4623 return KERN_FAILURE;
4624 #endif /* DEBUG || DEVELOPMENT */
4625 }
4626
4627
4628 #if DEBUG || DEVELOPMENT
4629
4630 kern_return_t
4631 mach_memory_info_check(void)
4632 {
4633 mach_memory_info_t * memory_info;
4634 mach_memory_info_t * info;
4635 zone_t zone;
4636 unsigned int idx, num_info, max_zones;
4637 vm_offset_t memory_info_addr;
4638 kern_return_t kr;
4639 size_t memory_info_size, memory_info_vmsize;
4640 uint64_t top_wired, zonestotal, total;
4641
4642 num_info = vm_page_diagnose_estimate();
4643 memory_info_size = num_info * sizeof(*memory_info);
4644 memory_info_vmsize = round_page(memory_info_size);
4645 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
4646 assert(kr == KERN_SUCCESS);
4647
4648 memory_info = (mach_memory_info_t *) memory_info_addr;
4649 vm_page_diagnose(memory_info, num_info, 0);
4650
4651 simple_lock(&all_zones_lock, &zone_locks_grp);
4652 max_zones = num_zones;
4653 simple_unlock(&all_zones_lock);
4654
4655 top_wired = total = zonestotal = 0;
4656 for (idx = 0; idx < max_zones; idx++) {
4657 zone = &(zone_array[idx]);
4658 assert(zone != ZONE_NULL);
4659 lock_zone(zone);
4660 zonestotal += ptoa_64(zone->page_count);
4661 unlock_zone(zone);
4662 }
4663 for (idx = 0; idx < num_info; idx++) {
4664 info = &memory_info[idx];
4665 if (!info->size) {
4666 continue;
4667 }
4668 if (VM_KERN_COUNT_WIRED == info->site) {
4669 top_wired = info->size;
4670 }
4671 if (VM_KERN_SITE_HIDE & info->flags) {
4672 continue;
4673 }
4674 if (!(VM_KERN_SITE_WIRED & info->flags)) {
4675 continue;
4676 }
4677 total += info->size;
4678 }
4679 total += zonestotal;
4680
4681 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n", total, top_wired, zonestotal, top_wired - total);
4682
4683 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
4684
4685 return kr;
4686 }
4687
4688 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
4689
4690 #endif /* DEBUG || DEVELOPMENT */
4691
4692 kern_return_t
4693 mach_zone_force_gc(
4694 host_t host)
4695 {
4696 if (host == HOST_NULL) {
4697 return KERN_INVALID_HOST;
4698 }
4699
4700 #if DEBUG || DEVELOPMENT
4701 /* Callout to buffer cache GC to drop elements in the apfs zones */
4702 if (consider_buffer_cache_collect != NULL) {
4703 (void)(*consider_buffer_cache_collect)(0);
4704 }
4705 consider_zone_gc(FALSE);
4706 #endif /* DEBUG || DEVELOPMENT */
4707 return KERN_SUCCESS;
4708 }
4709
4710 extern unsigned int stack_total;
4711 extern unsigned long long stack_allocs;
4712
4713 #if defined(__i386__) || defined (__x86_64__)
4714 extern unsigned int inuse_ptepages_count;
4715 extern long long alloc_ptepages_count;
4716 #endif
4717
4718 zone_t
4719 zone_find_largest(void)
4720 {
4721 unsigned int i;
4722 unsigned int max_zones;
4723 zone_t the_zone;
4724 zone_t zone_largest;
4725
4726 simple_lock(&all_zones_lock, &zone_locks_grp);
4727 max_zones = num_zones;
4728 simple_unlock(&all_zones_lock);
4729
4730 zone_largest = &(zone_array[0]);
4731 for (i = 0; i < max_zones; i++) {
4732 the_zone = &(zone_array[i]);
4733 if (the_zone->cur_size > zone_largest->cur_size) {
4734 zone_largest = the_zone;
4735 }
4736 }
4737 return zone_largest;
4738 }
4739
4740 #if ZONE_DEBUG
4741
4742 /* should we care about locks here? */
4743
4744 #define zone_in_use(z) ( z->count || z->free_elements \
4745 || !queue_empty(&z->pages.all_free) \
4746 || !queue_empty(&z->pages.intermediate) \
4747 || (z->allows_foreign && !queue_empty(&z->pages.any_free_foreign)))
4748
4749
4750 #endif /* ZONE_DEBUG */
4751
4752
4753 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
4754
4755 #if DEBUG || DEVELOPMENT
4756
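/*
 * zone_copy_all_allocations_inqueue() (descriptive note): for every page of
 * metadata on 'queue', record each possible element slot on that page into
 * 'elems', then walk the page's freelist and strike the free slots back out
 * of the array. Freelist 'next' pointers are stored XORed with
 * zp_nopoison_cookie, hence the un-obfuscation when following them. Returns
 * the array cursor advanced past the in-use elements that were recorded.
 */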
4757 static uintptr_t *
4758 zone_copy_all_allocations_inqueue(zone_t z, queue_head_t * queue, uintptr_t * elems)
4759 {
4760 struct zone_page_metadata *page_meta;
4761 vm_offset_t free, elements;
4762 vm_offset_t idx, numElements, freeCount, bytesAvail, metaSize;
4763
4764 queue_iterate(queue, page_meta, struct zone_page_metadata *, pages)
4765 {
4766 elements = get_zone_page(page_meta);
4767 bytesAvail = ptoa(page_meta->page_count);
4768 freeCount = 0;
4769 if (z->allows_foreign && !from_zone_map(elements, z->elem_size)) {
4770 metaSize = (sizeof(struct zone_page_metadata) + ZONE_ELEMENT_ALIGNMENT - 1) & ~(ZONE_ELEMENT_ALIGNMENT - 1);
4771 bytesAvail -= metaSize;
4772 elements += metaSize;
4773 }
4774 numElements = bytesAvail / z->elem_size;
4775 // construct array of all possible elements
4776 for (idx = 0; idx < numElements; idx++) {
4777 elems[idx] = INSTANCE_PUT(elements + idx * z->elem_size);
4778 }
4779 // remove from the array all free elements
4780 free = (vm_offset_t)page_metadata_get_freelist(page_meta);
4781 while (free) {
4782 // find idx of free element
4783 for (idx = 0; (idx < numElements) && (elems[idx] != INSTANCE_PUT(free)); idx++) {
4784 }
4785 assert(idx < numElements);
4786 // remove it
4787 bcopy(&elems[idx + 1], &elems[idx], (numElements - (idx + 1)) * sizeof(elems[0]));
4788 numElements--;
4789 freeCount++;
4790 // next free element
4791 vm_offset_t *primary = (vm_offset_t *) free;
4792 free = *primary ^ zp_nopoison_cookie;
4793 }
4794 elems += numElements;
4795 }
4796
4797 return elems;
4798 }
4799
4800 kern_return_t
4801 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
4802 {
4803 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4804 zone_t zone;
4805 uintptr_t * array;
4806 uintptr_t * next;
4807 uintptr_t element, bt;
4808 uint32_t idx, count, found;
4809 uint32_t btidx, btcount, nobtcount, btfound;
4810 uint32_t elemSize;
4811 uint64_t maxElems;
4812 unsigned int max_zones;
4813 kern_return_t kr;
4814
4815 simple_lock(&all_zones_lock, &zone_locks_grp);
4816 max_zones = num_zones;
4817 simple_unlock(&all_zones_lock);
4818
4819 for (idx = 0; idx < max_zones; idx++) {
4820 if (!strncmp(zoneName, zone_array[idx].zone_name, nameLen)) {
4821 break;
4822 }
4823 }
4824 if (idx >= max_zones) {
4825 return KERN_INVALID_NAME;
4826 }
4827 zone = &zone_array[idx];
4828
4829 elemSize = (uint32_t) zone->elem_size;
4830 maxElems = ptoa(zone->page_count) / elemSize;
4831
4832 if ((zone->alloc_size % elemSize)
4833 && !leak_scan_debug_flag) {
4834 return KERN_INVALID_CAPABILITY;
4835 }
4836
4837 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
4838 maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG);
4839 if (KERN_SUCCESS != kr) {
4840 return kr;
4841 }
4842
4843 lock_zone(zone);
4844
4845 next = array;
4846 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.any_free_foreign, next);
4847 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.intermediate, next);
4848 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.all_used, next);
4849 count = (uint32_t)(next - array);
4850
4851 unlock_zone(zone);
4852
4853 zone_leaks_scan(array, count, (uint32_t)zone->elem_size, &found);
4854 assert(found <= count);
4855
4856 for (idx = 0; idx < count; idx++) {
4857 element = array[idx];
4858 if (kInstanceFlagReferenced & element) {
4859 continue;
4860 }
4861 element = INSTANCE_PUT(element) & ~kInstanceFlags;
4862 }
4863
4864 if (zone->zlog_btlog && !corruption_debug_flag) {
4865 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
4866 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
4867 }
4868
4869 for (nobtcount = idx = 0; idx < count; idx++) {
4870 element = array[idx];
4871 if (!element) {
4872 continue;
4873 }
4874 if (kInstanceFlagReferenced & element) {
4875 continue;
4876 }
4877 element = INSTANCE_PUT(element) & ~kInstanceFlags;
4878
4879 // see if we can find any backtrace left in the element
4880 btcount = (typeof(btcount))(zone->elem_size / sizeof(uintptr_t));
4881 if (btcount >= MAX_ZTRACE_DEPTH) {
4882 btcount = MAX_ZTRACE_DEPTH - 1;
4883 }
4884 for (btfound = btidx = 0; btidx < btcount; btidx++) {
4885 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
4886 if (!VM_KERNEL_IS_SLID(bt)) {
4887 break;
4888 }
4889 zbt[btfound++] = bt;
4890 }
4891 if (btfound) {
4892 (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
4893 } else {
4894 nobtcount++;
4895 }
4896 }
4897 if (nobtcount) {
4898 // fake backtrace when we found nothing
4899 zbt[0] = (uintptr_t) &zalloc;
4900 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
4901 }
4902
4903 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
4904
4905 return KERN_SUCCESS;
4906 }
4907
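/*
 * kdp_is_in_zone() (descriptive note): debugger-side helper that reports
 * whether 'addr' is an element of the zone named 'zone_name', by resolving
 * the address to its owning zone via zone_element_size() and comparing names.
 */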
4908 boolean_t
4909 kdp_is_in_zone(void *addr, const char *zone_name)
4910 {
4911 zone_t z;
4912 return zone_element_size(addr, &z) && !strcmp(z->zone_name, zone_name);
4913 }
4914
4915 boolean_t
4916 run_zone_test(void)
4917 {
4918 unsigned int i = 0, max_iter = 5;
4919 void * test_ptr;
4920 zone_t test_zone;
4921
4922 simple_lock(&zone_test_lock, &zone_locks_grp);
4923 if (!zone_test_running) {
4924 zone_test_running = TRUE;
4925 } else {
4926 simple_unlock(&zone_test_lock);
4927 printf("run_zone_test: Test already running.\n");
4928 return FALSE;
4929 }
4930 simple_unlock(&zone_test_lock);
4931
4932 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
4933
4934 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
4935 do {
4936 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
4937 if (test_zone == NULL) {
4938 printf("run_zone_test: zinit() failed\n");
4939 return FALSE;
4940 }
4941
4942 #if KASAN_ZALLOC
4943 if (test_zone_ptr == NULL && zone_free_count(test_zone) != 0) {
4944 #else
4945 if (zone_free_count(test_zone) != 0) {
4946 #endif
4947 printf("run_zone_test: free count is not zero\n");
4948 return FALSE;
4949 }
4950
4951 if (test_zone_ptr == NULL) {
4952 /* Stash the zone pointer returned on the first zinit */
4953 printf("run_zone_test: zone created for the first time\n");
4954 test_zone_ptr = test_zone;
4955 } else if (test_zone != test_zone_ptr) {
4956 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
4957 return FALSE;
4958 }
4959
4960 test_ptr = zalloc(test_zone);
4961 if (test_ptr == NULL) {
4962 printf("run_zone_test: zalloc() failed\n");
4963 return FALSE;
4964 }
4965 zfree(test_zone, test_ptr);
4966
4967 zdestroy(test_zone);
4968 i++;
4969
4970 printf("run_zone_test: Iteration %d successful\n", i);
4971 } while (i < max_iter);
4972
4973 printf("run_zone_test: Test passed\n");
4974
4975 simple_lock(&zone_test_lock, &zone_locks_grp);
4976 zone_test_running = FALSE;
4977 simple_unlock(&zone_test_lock);
4978
4979 return TRUE;
4980 }
4981
4982 #endif /* DEBUG || DEVELOPMENT */