osfmk/kern/zalloc.c

   1 /*
   2  * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   kern/zalloc.c
  60  *      Author: Avadis Tevanian, Jr.
  61  *
  62  *      Zone-based memory allocator.  A zone is a collection of fixed size
  63  *      data blocks for which quick allocation/deallocation is possible.
  64  */
  65 #include <zone_debug.h>
  66
  67 #include <mach/mach_types.h>
  68 #include <mach/vm_param.h>
  69 #include <mach/kern_return.h>
  70 #include <mach/mach_host_server.h>
  71 #include <mach/task_server.h>
  72 #include <mach/machine/vm_types.h>
  73 #include <mach_debug/zone_info.h>
  74 #include <mach/vm_map.h>
  75
  76 #include <kern/kern_types.h>
  77 #include <kern/assert.h>
  78 #include <kern/backtrace.h>
  79 #include <kern/host.h>
  80 #include <kern/macro_help.h>
  81 #include <kern/sched.h>
  82 #include <kern/locks.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/misc_protos.h>
  85 #include <kern/thread_call.h>
  86 #include <kern/zalloc.h>
  87 #include <kern/kalloc.h>
  88
  89 #include <vm/pmap.h>
  90 #include <vm/vm_map.h>
  91 #include <vm/vm_kern.h>
  92 #include <vm/vm_page.h>
  93
  94 #include <pexpert/pexpert.h>
  95
  96 #include <machine/machparam.h>
  97 #include <machine/machine_routines.h>  /* ml_cpu_get_info */
  98
  99 #include <libkern/OSDebug.h>
 100 #include <libkern/OSAtomic.h>
 101 #include <sys/kdebug.h>
 102
 103 /*
 104  *  ZONE_ALIAS_ADDR (deprecated)
 105  */
 106
 107 #define from_zone_map(addr, size) \
 108         ((vm_offset_t)(addr)             >= zone_map_min_address && \
 109         ((vm_offset_t)(addr) + size - 1) <  zone_map_max_address )
 110
 111 /*
 112  * Zone Corruption Debugging
 113  *
 114  * We use three techniques to detect modification of a zone element
 115  * after it's been freed.
 116  *
 117  * (1) Check the freelist next pointer for sanity.
 118  * (2) Store a backup of the next pointer at the end of the element,
 119  *     and compare it to the primary next pointer when the element is allocated
 120  *     to detect corruption of the freelist due to use-after-free bugs.
 121  *     The backup pointer is also XORed with a per-boot random cookie.
 122  * (3) Poison the freed element by overwriting it with 0xdeadbeef,
 123  *     and check for that value when the element is being reused to make sure
 124  *     no part of the element has been modified while it was on the freelist.
 125  *     This will also help catch read-after-frees, as code will now dereference
 126  *     0xdeadbeef instead of a valid but freed pointer.
 127  *
 128  * (1) and (2) occur for every allocation and free to a zone.
 129  * This is done to make it slightly more difficult for an attacker to
 130  * manipulate the freelist to behave in a specific way.
 131  *
 132  * Poisoning (3) occurs periodically for every N frees (counted per-zone)
 133  * and on every free for zones smaller than a cacheline.  If -zp
 134  * is passed as a boot arg, poisoning occurs for every free.
 135  *
 * Performance slowdown is proportional to the frequency of poisoning (that is,
 * inversely proportional to N), with a 4-5% hit around N=1, down to ~0.3% at
 * N=16 and just "noise" at N=32
 138  * and higher. You can expect to find a 100% reproducible bug in an average of
 139  * N tries, with a standard deviation of about N, but you will want to set
 140  * "-zp" to always poison every free if you are attempting to reproduce
 141  * a known bug.
 142  *
 143  * For a more heavyweight, but finer-grained method of detecting misuse
 144  * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
 145  *
 146  * Zone Corruption Logging
 147  *
 148  * You can also track where corruptions come from by using the boot-arguments
 149  * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
 150  * in this document for more implementation and usage information.
 151  *
 152  * Zone Leak Detection
 153  *
 154  * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
 155  * found later in this file via the showtopztrace and showz* macros in kgmacros,
 156  * or use zlog without the -zc argument.
 157  *
 158  */
 159
 160 /* Returns TRUE if we rolled over the counter at factor */
 161 static inline boolean_t
 162 sample_counter(volatile uint32_t * count_p, uint32_t factor)
 163 {
 164         uint32_t old_count, new_count;
 165         boolean_t rolled_over;
 166
 167         do {
 168                 new_count = old_count = *count_p;
 169
 170                 if (++new_count >= factor) {
 171                         rolled_over = TRUE;
 172                         new_count = 0;
 173                 } else {
 174                         rolled_over = FALSE;
 175                 }
 176
 177         } while (!OSCompareAndSwap(old_count, new_count, count_p));
 178
 179         return rolled_over;
 180 }
 181
/*
 * Value written over freed elements when they are poisoned:
 * a 64-bit pattern on LP64 builds, a 32-bit pattern otherwise.
 */
#if defined(__LP64__)
#define ZP_POISON       0xdeadbeefdeadbeef
#else
#define ZP_POISON       0xdeadbeef
#endif

/* Defaults assigned to zp_factor and zp_scale by zp_init(), before boot-arg overrides */
#define ZP_DEFAULT_SAMPLING_FACTOR 16
#define ZP_DEFAULT_SCALE_FACTOR 4
 190
/*
 *  A zp_factor of 0 indicates zone poisoning is disabled,
 *  however, we still poison zones smaller than zp_tiny_zone_limit (a cacheline).
 *  Passing the -no-zp boot-arg disables even this behavior.
 *  In all cases, we record and check the integrity of a backup pointer.
 *
 *  All of the variables below are zero until zp_init() runs.
 */

/*
 * set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled;
 * zp_init() also sets it to 1 for -zp and to 0 for -no-zp
 */
uint32_t        zp_factor               = 0;

/* set by zp-scale=N boot arg, scales zp_factor by zone size */
uint32_t        zp_scale                = 0;

/* set in zp_init from the CPU cache line size, zero indicates -no-zp boot-arg */
vm_size_t       zp_tiny_zone_limit      = 0;

/*
 * initialized to a per-boot random value in zp_init;
 * the backup next-pointer of a free element is XORed with one of these
 */
uintptr_t       zp_poisoned_cookie      = 0;
uintptr_t       zp_nopoison_cookie      = 0;
 210
 211
 212 /*
 213  * initialize zone poisoning
 214  * called from zone_bootstrap before any allocations are made from zalloc
 215  */
 216 static inline void
 217 zp_init(void)
 218 {
 219         char temp_buf[16];
 220
 221         /*
 222          * Initialize backup pointer random cookie for poisoned elements
 223          * Try not to call early_random() back to back, it may return
 224          * the same value if mach_absolute_time doesn't have sufficient time
 225          * to tick over between calls.  <rdar://problem/11597395>
 226          * (This is only a problem on embedded devices)
 227          */
 228         zp_poisoned_cookie = (uintptr_t) early_random();
 229
 230         /*
 231          * Always poison zones smaller than a cacheline,
 232          * because it's pretty close to free
 233          */
 234         ml_cpu_info_t cpu_info;
 235         ml_cpu_get_info(&cpu_info);
 236         zp_tiny_zone_limit = (vm_size_t) cpu_info.cache_line_size;
 237
 238         zp_factor = ZP_DEFAULT_SAMPLING_FACTOR;
 239         zp_scale  = ZP_DEFAULT_SCALE_FACTOR;
 240
 241         //TODO: Bigger permutation?
 242         /*
 243          * Permute the default factor +/- 1 to make it less predictable
 244          * This adds or subtracts ~4 poisoned objects per 1000 frees.
 245          */
 246         if (zp_factor != 0) {
 247                 uint32_t rand_bits = early_random() & 0x3;
 248
 249                 if (rand_bits == 0x1)
 250                         zp_factor += 1;
 251                 else if (rand_bits == 0x2)
 252                         zp_factor -= 1;
 253                 /* if 0x0 or 0x3, leave it alone */
 254         }
 255
 256         /* -zp: enable poisoning for every alloc and free */
 257         if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
 258                 zp_factor = 1;
 259         }
 260
 261         /* -no-zp: disable poisoning completely even for tiny zones */
 262         if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
 263                 zp_factor          = 0;
 264                 zp_tiny_zone_limit = 0;
 265                 printf("Zone poisoning disabled\n");
 266         }
 267
 268         /* zp-factor=XXXX: override how often to poison freed zone elements */
 269         if (PE_parse_boot_argn("zp-factor", &zp_factor, sizeof(zp_factor))) {
 270                 printf("Zone poisoning factor override: %u\n", zp_factor);
 271         }
 272
 273         /* zp-scale=XXXX: override how much zone size scales zp-factor by */
 274         if (PE_parse_boot_argn("zp-scale", &zp_scale, sizeof(zp_scale))) {
 275                 printf("Zone poisoning scale factor override: %u\n", zp_scale);
 276         }
 277
 278         /* Initialize backup pointer random cookie for unpoisoned elements */
 279         zp_nopoison_cookie = (uintptr_t) early_random();
 280
 281 #if MACH_ASSERT
 282         if (zp_poisoned_cookie == zp_nopoison_cookie)
 283                 panic("early_random() is broken: %p and %p are not random\n",
 284                       (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
 285 #endif
 286
 287         /*
 288          * Use the last bit in the backup pointer to hint poisoning state
 289          * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
 290          * the low bits are zero.
 291          */
 292         zp_poisoned_cookie |=   (uintptr_t)0x1ULL;
 293         zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
 294
 295 #if defined(__LP64__)
 296         /*
 297          * Make backup pointers more obvious in GDB for 64 bit
 298          * by making OxFFFFFF... ^ cookie = 0xFACADE...
 299          * (0xFACADE = 0xFFFFFF ^ 0x053521)
 300          * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
 301          * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
 302          * by the sanity check, so it's OK for that part of the cookie to be predictable.
 303          *
 304          * TODO: Use #defines, xors, and shifts
 305          */
 306
 307         zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
 308         zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
 309
 310         zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
 311         zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
 312 #endif
 313 }
 314
 315 /*
 316  * These macros are used to keep track of the number
 317  * of pages being used by the zone currently. The
 318  * z->page_count is protected by the zone lock.
 319  */
 320 #define ZONE_PAGE_COUNT_INCR(z, count)          \
 321 {                                               \
 322         OSAddAtomic64(count, &(z->page_count)); \
 323 }
 324
 325 #define ZONE_PAGE_COUNT_DECR(z, count)                  \
 326 {                                                       \
 327         OSAddAtomic64(-count, &(z->page_count));        \
 328 }
 329
/* VM map handed to kernel_memory_populate() when zone metadata pages are populated */
vm_map_t        zone_map = VM_MAP_NULL;

/* for is_sane_zone_element and garbage collection */

/* Bounds tested by from_zone_map(); an address is "from the zone map" when min <= addr < max */
vm_offset_t     zone_map_min_address = 0;  /* initialized in zone_init */
vm_offset_t     zone_map_max_address = 0;

/* Globals for random boolean generator for elements in free list */
#define MAX_ENTROPY_PER_ZCRAM           4
#define RANDOM_BOOL_GEN_SEED_COUNT      4
static unsigned int bool_gen_seed[RANDOM_BOOL_GEN_SEED_COUNT];
static unsigned int bool_gen_global = 0;
/* NOTE(review): presumably guards the two bool_gen_* variables above -- confirm at the use sites */
decl_simple_lock_data(, bool_gen_lock)

/*
 * VM region for all metadata structures.
 * PAGE_METADATA_FOR_PAGE_INDEX() indexes it from zone_metadata_region_min,
 * one struct zone_page_metadata per page index.
 */
vm_offset_t     zone_metadata_region_min = 0;
vm_offset_t     zone_metadata_region_max = 0;
/* Held while zone_populate_metadata_page() populates a page of the metadata region */
decl_lck_mtx_data(static ,zone_metadata_region_lck)
lck_attr_t      zone_metadata_lock_attr;
lck_mtx_ext_t   zone_metadata_region_lck_ext;
 350
/*
 * Helpful for walking through a zone's free element list.
 *
 * Overlay for an element while it is free: the first pointer-sized word is
 * the link to the next free element (free_to_zone() stores it XORed with
 * zp_nopoison_cookie), and the last pointer-sized word of the element holds
 * the backup pointer (see get_backup_ptr()).
 */
struct zone_free_element {
	struct zone_free_element *next;
	/* ... */
	/* void *backup_ptr; */
};
 357
/*
 *      Protects num_zones and zone_array
 */
decl_simple_lock_data(, all_zones_lock)
unsigned int            num_zones;

/*
 * Table of all zones.  Page metadata refers to its zone by index into this
 * array (see PAGE_METADATA_GET_ZONE and the zindex field).
 */
#define MAX_ZONES       256
struct zone             zone_array[MAX_ZONES];
 366
 367 #define MULTIPAGE_METADATA_MAGIC                (0xff)
 368
 369 #define PAGE_METADATA_GET_ZINDEX(page_meta)                     \
 370         (page_meta->zindex)
 371
 372 #define PAGE_METADATA_GET_ZONE(page_meta)                               \
 373         (&(zone_array[page_meta->zindex]))
 374
 375 #define PAGE_METADATA_SET_ZINDEX(page_meta, index)              \
 376         page_meta->zindex = (index);
 377
/*
 * Per-page bookkeeping for zone memory.
 *
 * For addresses inside the zone map the metadata lives in the metadata
 * region and is located by page index (PAGE_METADATA_FOR_ELEMENT); for other
 * addresses get_zone_page_metadata() uses the start of the element's own page.
 */
struct zone_page_metadata {
	queue_chain_t           pages; /* linkage pointer for metadata lists */

	/* Union for maintaining start of element free list and real metadata (for multipage allocations) */
	union {
		/*
		 * The start of the freelist can be maintained as a 32-bit offset instead of a pointer because
		 * the free elements would be at max ZONE_MAX_ALLOC_SIZE bytes away from the metadata. Offset
		 * from start of the allocation chunk to free element list head.
		 * PAGE_METADATA_EMPTY_FREELIST marks an empty list.
		 */
		uint32_t                freelist_offset;
		/*
		 * This field is used to lookup the real metadata for multipage allocations, where we mark the
		 * metadata for all pages except the first as "fake" metadata using MULTIPAGE_METADATA_MAGIC.
		 * Offset from this fake metadata to real metadata of allocation chunk (-ve offset).
		 * Stored as a positive byte distance that page_metadata_get_realmeta() subtracts.
		 */
		uint32_t                real_metadata_offset;
	};

	/*
	 * For the first page in the allocation chunk, this represents the total number of free elements in
	 * the chunk.
	 * For all other pages, it represents the number of free elements on that page (used
	 * for garbage collection of zones with large multipage allocation size)
	 */
	uint16_t                        free_count;
	/*
	 * Zone index within the zone_array, or MULTIPAGE_METADATA_MAGIC for fake metadata.
	 * NOTE(review): MAX_ZONES is 256 while the magic is 0xff, so zone index 255
	 * would be indistinguishable from the magic -- confirm that index is never handed out.
	 */
	uint8_t                         zindex;
	uint8_t                         page_count; /* Count of pages within the allocation chunk */
};
 407
 408 /* Macro to get page index (within zone_map) of page containing element */
 409 #define PAGE_INDEX_FOR_ELEMENT(element)                         \
 410         (((vm_offset_t)trunc_page(element) - zone_map_min_address) / PAGE_SIZE)
 411
 412 /* Macro to get metadata structure given a page index in zone_map */
 413 #define PAGE_METADATA_FOR_PAGE_INDEX(index)                     \
 414         (zone_metadata_region_min + ((index) * sizeof(struct zone_page_metadata)))
 415
 416 /* Macro to get index (within zone_map) for given metadata */
 417 #define PAGE_INDEX_FOR_METADATA(page_meta)                      \
 418         (((vm_offset_t)page_meta - zone_metadata_region_min) / sizeof(struct zone_page_metadata))
 419
 420 /* Macro to get page for given page index in zone_map */
 421 #define PAGE_FOR_PAGE_INDEX(index)                              \
 422         (zone_map_min_address + (PAGE_SIZE * (index)))
 423
 424 /* Macro to get the actual metadata for a given address */
 425 #define PAGE_METADATA_FOR_ELEMENT(element)              \
 426         (struct zone_page_metadata *)(PAGE_METADATA_FOR_PAGE_INDEX(PAGE_INDEX_FOR_ELEMENT(element)))
 427
 428 /* Magic value to indicate empty element free list */
 429 #define PAGE_METADATA_EMPTY_FREELIST            ((uint32_t)(~0))
 430
 431 static inline void *
 432 page_metadata_get_freelist(struct zone_page_metadata *page_meta)
 433 {
 434         assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
 435         if (page_meta->freelist_offset == PAGE_METADATA_EMPTY_FREELIST)
 436                 return NULL;
 437         else {
 438                 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata)))
 439                         return (void *)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)) + page_meta->freelist_offset);
 440                 else
 441                         return (void *)((vm_offset_t)page_meta + page_meta->freelist_offset);
 442         }
 443 }
 444
 445 static inline void
 446 page_metadata_set_freelist(struct zone_page_metadata *page_meta, void *addr)
 447 {
 448         assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
 449         if (addr == NULL)
 450                 page_meta->freelist_offset = PAGE_METADATA_EMPTY_FREELIST;
 451         else {
 452                 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata)))
 453                         page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
 454                 else
 455                         page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - (vm_offset_t)page_meta);
 456         }
 457 }
 458
 459 static inline struct zone_page_metadata *
 460 page_metadata_get_realmeta(struct zone_page_metadata *page_meta)
 461 {
 462         assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
 463         return (struct zone_page_metadata *)((vm_offset_t)page_meta - page_meta->real_metadata_offset);
 464 }
 465
 466 static inline void
 467 page_metadata_set_realmeta(struct zone_page_metadata *page_meta, struct zone_page_metadata *real_meta)
 468 {
 469                 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
 470                 assert(PAGE_METADATA_GET_ZINDEX(real_meta) != MULTIPAGE_METADATA_MAGIC);
 471                 assert((vm_offset_t)page_meta > (vm_offset_t)real_meta);
 472                 vm_offset_t offset = (vm_offset_t)page_meta - (vm_offset_t)real_meta;
 473                 assert(offset <= UINT32_MAX);
 474                 page_meta->real_metadata_offset = (uint32_t)offset;
 475 }
 476
 477 /* The backup pointer is stored in the last pointer-sized location in an element. */
 478 static inline vm_offset_t *
 479 get_backup_ptr(vm_size_t  elem_size,
 480                vm_offset_t *element)
 481 {
 482         return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
 483 }
 484
 485 /*
 486  * Routine to populate a page backing metadata in the zone_metadata_region.
 487  * Must be called without the zone lock held as it might potentially block.
 488  */
 489 static inline void
 490 zone_populate_metadata_page(struct zone_page_metadata *page_meta)
 491 {
 492         vm_offset_t page_metadata_begin = trunc_page(page_meta);
 493         vm_offset_t page_metadata_end = trunc_page((vm_offset_t)page_meta + sizeof(struct zone_page_metadata));
 494
 495         for(;page_metadata_begin <= page_metadata_end; page_metadata_begin += PAGE_SIZE) {
 496                 if (pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin))
 497                         continue;
 498                 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
 499                 lck_mtx_lock(&zone_metadata_region_lck);
 500                 if (0 == pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
 501                         kernel_memory_populate(zone_map,
 502                                        page_metadata_begin,
 503                                        PAGE_SIZE,
 504                                        KMA_KOBJECT,
 505                                        VM_KERN_MEMORY_OSFMK);
 506                 }
 507                 lck_mtx_unlock(&zone_metadata_region_lck);
 508         }
 509         return;
 510 }
 511
 512 static inline uint16_t
 513 get_metadata_alloc_count(struct zone_page_metadata *page_meta)
 514 {
 515                 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
 516                 struct zone *z = PAGE_METADATA_GET_ZONE(page_meta);
 517                 return ((page_meta->page_count * PAGE_SIZE) / z->elem_size);
 518 }
 519
 520 /*
 521  * Routine to lookup metadata for any given address.
 522  * If init is marked as TRUE, this should be called without holding the zone lock
 523  * since the initialization might block.
 524  */
 525 static inline struct zone_page_metadata *
 526 get_zone_page_metadata(struct zone_free_element *element, boolean_t init)
 527 {
 528         struct zone_page_metadata *page_meta = 0;
 529
 530         if (from_zone_map(element, sizeof(struct zone_free_element))) {
 531                 page_meta = (struct zone_page_metadata *)(PAGE_METADATA_FOR_ELEMENT(element));
 532                 if (init)
 533                         zone_populate_metadata_page(page_meta);
 534         } else {
 535                 page_meta = (struct zone_page_metadata *)(trunc_page((vm_offset_t)element));
 536         }
 537         if (init)
 538                 bzero((char *)page_meta, sizeof(struct zone_page_metadata));
 539         return ((PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC) ? page_meta : page_metadata_get_realmeta(page_meta));
 540 }
 541
 542 /* Routine to get the page for a given metadata */
 543 static inline vm_offset_t
 544 get_zone_page(struct zone_page_metadata *page_meta)
 545 {
 546         if (from_zone_map(page_meta, sizeof(struct zone_page_metadata)))
 547                 return (vm_offset_t)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
 548         else
 549                 return (vm_offset_t)(trunc_page(page_meta));
 550 }
 551
 552 /* Routine to get the size of a zone allocated address. If the address doesnt belong to the
 553  * zone_map, returns 0.
 554  */
 555 vm_size_t
 556 zone_element_size(void *addr, zone_t *z)
 557 {
 558         struct zone *src_zone;
 559         if (from_zone_map(addr, sizeof(void *))) {
 560                 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
 561                 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
 562                 if (z) {
 563                         *z = src_zone;
 564                 }
 565                 return (src_zone->elem_size);
 566         } else {
 567 #if CONFIG_GZALLOC
 568                 vm_size_t gzsize;
 569                 if (gzalloc_element_size(addr, z, &gzsize)) {
 570                         return gzsize;
 571                 }
 572 #endif /* CONFIG_GZALLOC */
 573
 574                 return 0;
 575         }
 576 }
 577
 578 /*
 579  * Zone checking helper function.
 580  * A pointer that satisfies these conditions is OK to be a freelist next pointer
 581  * A pointer that doesn't satisfy these conditions indicates corruption
 582  */
 583 static inline boolean_t
 584 is_sane_zone_ptr(zone_t         zone,
 585                  vm_offset_t    addr,
 586                  size_t         obj_size)
 587 {
 588         /*  Must be aligned to pointer boundary */
 589         if (__improbable((addr & (sizeof(vm_offset_t) - 1)) != 0))
 590                 return FALSE;
 591
 592         /*  Must be a kernel address */
 593         if (__improbable(!pmap_kernel_va(addr)))
 594                 return FALSE;
 595
 596         /*  Must be from zone map if the zone only uses memory from the zone_map */
 597         /*
 598          *  TODO: Remove the zone->collectable check when every
 599          *  zone using foreign memory is properly tagged with allows_foreign
 600          */
 601         if (zone->collectable && !zone->allows_foreign) {
 602                 /*  check if addr is from zone map */
 603                 if (addr                 >= zone_map_min_address &&
 604                    (addr + obj_size - 1) <  zone_map_max_address )
 605                         return TRUE;
 606
 607                 return FALSE;
 608         }
 609
 610         return TRUE;
 611 }
 612
 613 static inline boolean_t
 614 is_sane_zone_page_metadata(zone_t       zone,
 615                            vm_offset_t  page_meta)
 616 {
 617         /* NULL page metadata structures are invalid */
 618         if (page_meta == 0)
 619                 return FALSE;
 620         return is_sane_zone_ptr(zone, page_meta, sizeof(struct zone_page_metadata));
 621 }
 622
 623 static inline boolean_t
 624 is_sane_zone_element(zone_t      zone,
 625                      vm_offset_t addr)
 626 {
 627         /*  NULL is OK because it indicates the tail of the list */
 628         if (addr == 0)
 629                 return TRUE;
 630         return is_sane_zone_ptr(zone, addr, zone->elem_size);
 631 }
 632
 633 /* Someone wrote to freed memory. */
 634 static inline void /* noreturn */
 635 zone_element_was_modified_panic(zone_t        zone,
 636                                 vm_offset_t   element,
 637                                 vm_offset_t   found,
 638                                 vm_offset_t   expected,
 639                                 vm_offset_t   offset)
 640 {
 641         panic("a freed zone element has been modified in zone %s: expected %p but found %p, bits changed %p, at offset %d of %d in element %p, cookies %p %p",
 642                          zone->zone_name,
 643               (void *)   expected,
 644               (void *)   found,
 645               (void *)   (expected ^ found),
 646               (uint32_t) offset,
 647               (uint32_t) zone->elem_size,
 648               (void *)   element,
 649               (void *)   zp_nopoison_cookie,
 650               (void *)   zp_poisoned_cookie);
 651 }
 652
 653 /*
 654  * The primary and backup pointers don't match.
 655  * Determine which one was likely the corrupted pointer, find out what it
 656  * probably should have been, and panic.
 657  * I would like to mark this as noreturn, but panic() isn't marked noreturn.
 658  */
 659 static void /* noreturn */
 660 backup_ptr_mismatch_panic(zone_t        zone,
 661                           vm_offset_t   element,
 662                           vm_offset_t   primary,
 663                           vm_offset_t   backup)
 664 {
 665         vm_offset_t likely_backup;
 666         vm_offset_t likely_primary;
 667
 668         likely_primary = primary ^ zp_nopoison_cookie;
 669         boolean_t   sane_backup;
 670         boolean_t   sane_primary = is_sane_zone_element(zone, likely_primary);
 671         boolean_t   element_was_poisoned = (backup & 0x1) ? TRUE : FALSE;
 672
 673 #if defined(__LP64__)
 674         /* We can inspect the tag in the upper bits for additional confirmation */
 675         if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000)
 676                 element_was_poisoned = TRUE;
 677         else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000)
 678                 element_was_poisoned = FALSE;
 679 #endif
 680
 681         if (element_was_poisoned) {
 682                 likely_backup = backup ^ zp_poisoned_cookie;
 683                 sane_backup = is_sane_zone_element(zone, likely_backup);
 684         } else {
 685                 likely_backup = backup ^ zp_nopoison_cookie;
 686                 sane_backup = is_sane_zone_element(zone, likely_backup);
 687         }
 688
 689         /* The primary is definitely the corrupted one */
 690         if (!sane_primary && sane_backup)
 691                 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
 692
 693         /* The backup is definitely the corrupted one */
 694         if (sane_primary && !sane_backup)
 695                 zone_element_was_modified_panic(zone, element, backup,
 696                                                 (primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
 697                                                 zone->elem_size - sizeof(vm_offset_t));
 698
 699         /*
 700          * Not sure which is the corrupted one.
 701          * It's less likely that the backup pointer was overwritten with
 702          * ( (sane address) ^ (valid cookie) ), so we'll guess that the
 703          * primary pointer has been overwritten with a sane but incorrect address.
 704          */
 705         if (sane_primary && sane_backup)
 706                 zone_element_was_modified_panic(zone, element, primary, likely_backup, 0);
 707
 708         /* Neither are sane, so just guess. */
 709         zone_element_was_modified_panic(zone, element, primary, likely_backup, 0);
 710 }
 711
 712 /*
 713  * Adds the element to the head of the zone's free list
 714  * Keeps a backup next-pointer at the end of the element
 715  */
 716 static inline void
 717 free_to_zone(zone_t      zone,
 718              vm_offset_t element,
 719              boolean_t   poison)
 720 {
 721         vm_offset_t old_head;
 722         struct zone_page_metadata *page_meta;
 723
 724         vm_offset_t *primary  = (vm_offset_t *) element;
 725         vm_offset_t *backup   = get_backup_ptr(zone->elem_size, primary);
 726
 727         page_meta = get_zone_page_metadata((struct zone_free_element *)element, FALSE);
 728         assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
 729         old_head = (vm_offset_t)page_metadata_get_freelist(page_meta);
 730
 731 #if MACH_ASSERT
 732         if (__improbable(!is_sane_zone_element(zone, old_head)))
 733                 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
 734                       (void *) old_head, zone->zone_name);
 735 #endif
 736
 737         if (__improbable(!is_sane_zone_element(zone, element)))
 738                 panic("zfree: freeing invalid pointer %p to zone %s\n",
 739                       (void *) element, zone->zone_name);
 740
 741         /*
 742          * Always write a redundant next pointer
 743          * So that it is more difficult to forge, xor it with a random cookie
 744          * A poisoned element is indicated by using zp_poisoned_cookie
 745          * instead of zp_nopoison_cookie
 746          */
 747
 748         *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
 749
 750         /*
 751          * Insert this element at the head of the free list. We also xor the
 752          * primary pointer with the zp_nopoison_cookie to make sure a free
 753          * element does not provide the location of the next free element directly.
 754          */
 755         *primary             = old_head ^ zp_nopoison_cookie;
 756         page_metadata_set_freelist(page_meta, (struct zone_free_element *)element);
 757         page_meta->free_count++;
 758         if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
 759                 if (page_meta->free_count == 1) {
 760                         /* first foreign element freed on page, move from all_used */
 761                         re_queue_tail(&zone->pages.any_free_foreign, &(page_meta->pages));
 762                 } else {
 763                         /* no other list transitions */
 764                 }
 765         } else if (page_meta->free_count == get_metadata_alloc_count(page_meta)) {
 766                 /* whether the page was on the intermediate or all_used, queue, move it to free */
 767                 re_queue_tail(&zone->pages.all_free, &(page_meta->pages));
 768                 zone->count_all_free_pages += page_meta->page_count;
 769         } else if (page_meta->free_count == 1) {
 770                 /* first free element on page, move from all_used */
 771                 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
 772         }
 773         zone->count--;
 774         zone->countfree++;
 775 }
 776
 777
 778 /*
 779  * Removes an element from the zone's free list, returning 0 if the free list is empty.
 780  * Verifies that the next-pointer and backup next-pointer are intact,
 781  * and verifies that a poisoned element hasn't been modified.
 782  */
 783 static inline vm_offset_t
 784 try_alloc_from_zone(zone_t zone,
 785                     boolean_t* check_poison)
 786 {
 787         vm_offset_t  element;
 788         struct zone_page_metadata *page_meta;
 789
 790         *check_poison = FALSE;
 791
 792         /* if zone is empty, bail */
 793         if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign))
 794                 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
 795         else if (!queue_empty(&zone->pages.intermediate))
 796                 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
 797         else if (!queue_empty(&zone->pages.all_free)) {
 798                 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
 799                 assert(zone->count_all_free_pages >= page_meta->page_count);
 800                 zone->count_all_free_pages -= page_meta->page_count;
 801         } else {
 802                 return 0;
 803         }
 804         /* Check if page_meta passes is_sane_zone_element */
 805         if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta)))
 806                 panic("zalloc: invalid metadata structure %p for freelist of zone %s\n",
 807                         (void *) page_meta, zone->zone_name);
 808         assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
 809         element = (vm_offset_t)page_metadata_get_freelist(page_meta);
 810
 811         if (__improbable(!is_sane_zone_ptr(zone, element, zone->elem_size)))
 812                 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
 813                       (void *) element, zone->zone_name);
 814
 815         vm_offset_t *primary = (vm_offset_t *) element;
 816         vm_offset_t *backup  = get_backup_ptr(zone->elem_size, primary);
 817
 818         /*
 819          * Since the primary next pointer is xor'ed with zp_nopoison_cookie
 820          * for obfuscation, retrieve the original value back
 821          */
 822         vm_offset_t  next_element          = *primary ^ zp_nopoison_cookie;
 823         vm_offset_t  next_element_primary  = *primary;
 824         vm_offset_t  next_element_backup   = *backup;
 825
 826         /*
 827          * backup_ptr_mismatch_panic will determine what next_element
 828          * should have been, and print it appropriately
 829          */
 830         if (__improbable(!is_sane_zone_element(zone, next_element)))
 831                 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
 832
 833         /* Check the backup pointer for the regular cookie */
 834         if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) {
 835
 836                 /* Check for the poisoned cookie instead */
 837                 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie)))
 838                         /* Neither cookie is valid, corruption has occurred */
 839                         backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
 840
 841                 /*
 842                  * Element was marked as poisoned, so check its integrity before using it.
 843                  */
 844                 *check_poison = TRUE;
 845         }
 846
 847         /* Make sure the page_meta is at the correct offset from the start of page */
 848         if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element, FALSE)))
 849                 panic("zalloc: Incorrect metadata %p found in zone %s page queue. Expected metadata: %p\n",
 850                         page_meta, zone->zone_name, get_zone_page_metadata((struct zone_free_element *)element, FALSE));
 851
 852         /* Make sure next_element belongs to the same page as page_meta */
 853         if (next_element) {
 854                 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element, FALSE)))
 855                         panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n",
 856                                 (void *)next_element, (void *)element, zone->zone_name);
 857         }
 858
 859         /* Remove this element from the free list */
 860         page_metadata_set_freelist(page_meta, (struct zone_free_element *)next_element);
 861         page_meta->free_count--;
 862
 863         if (page_meta->free_count == 0) {
 864                 /* move to all used */
 865                 re_queue_tail(&zone->pages.all_used, &(page_meta->pages));
 866         } else {
 867                 if (!zone->allows_foreign || from_zone_map(element, zone->elem_size)) {
 868                         if (get_metadata_alloc_count(page_meta) == page_meta->free_count + 1) {
 869                                 /* remove from free, move to intermediate */
 870                                 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
 871                         }
 872                 }
 873         }
 874         zone->countfree--;
 875         zone->count++;
 876         zone->sum_count++;
 877
 878         return element;
 879 }
 880
 881 /*
 882  * End of zone poisoning
 883  */
 884
/*
 * Zone info options
 *
 * ZINFO_SLOTS is, for the time being, simply an alias for MAX_ZONES.
 */
#define ZINFO_SLOTS     MAX_ZONES               /* for now */

/* Prototypes only; the definitions are not in this part of the file. */
void            zone_display_zprint(void);

zone_t          zone_find_largest(void);
 893
/*
 * Async allocation of zones
 *
 * This mechanism allows bootstrapping an empty zone that is set up with
 * non-blocking flags.  The first call to zalloc_noblock() kicks off a
 * thread_call to zalloc_async, which performs a zalloc() (which may block)
 * followed by an immediate free.  This primes the zone for the next use.
 *
 * Currently the thread_callout function (zalloc_async) loops through all
 * zones looking for any zone with async_pending set and does the work for it.
 *
 * NOTE: If the calling thread for zalloc_noblock is lower priority than
 * thread_call, then zalloc_noblock to an empty zone may succeed.
 */
void            zalloc_async(
                                thread_call_param_t     p0,
                                thread_call_param_t     p1);

/* NOTE(review): presumably the thread_call used to schedule zalloc_async -- confirm at its setup site. */
static thread_call_data_t call_async_alloc;
 912
/*
 * Align elements that use the zone page list to 32 byte boundaries.
 */
#define ZONE_ELEMENT_ALIGNMENT 32

/*
 * zone_wakeup / zone_sleep use the zone's own address as the wait event.
 *
 * zone_sleep() goes through lck_mtx_sleep() on the zone's lock with
 * LCK_SLEEP_SPIN and THREAD_UNINT, and discards the result.
 *
 * NOTE(review): the zone_sleep() expansion already ends with ';', so writing
 * "zone_sleep(z);" yields an extra empty statement.  Do not use it as the
 * unbraced body of an if that has an else.
 */
#define zone_wakeup(zone) thread_wakeup((event_t)(zone))
#define zone_sleep(zone)                                \
        (void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN, (event_t)(zone), THREAD_UNINT);
 921
/*
 *      The zone_locks_grp allows for collecting lock statistics.
 *      All locks are associated to this group in zinit.
 *      Look at tools/lockstat for debugging lock contention.
 */

lck_grp_t       zone_locks_grp;
lck_grp_attr_t  zone_locks_grp_attr;

/*
 *      lock_zone_init resets the zone's lock attributes to their defaults and
 *      then initializes the zone's mutex (with its lock_ext storage) as a
 *      member of zone_locks_grp.
 */
#define lock_zone_init(zone)                            \
MACRO_BEGIN                                             \
        lck_attr_setdefault(&(zone)->lock_attr);                        \
        lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext,              \
            &zone_locks_grp, &(zone)->lock_attr);                       \
MACRO_END

/*
 *      lock_try_zone expands to a lck_mtx_try_lock_spin() call on the zone lock.
 *      NOTE(review): 'zone' is not parenthesized in the expansion, so pass a
 *      plain identifier rather than a compound expression.
 */
#define lock_try_zone(zone)     lck_mtx_try_lock_spin(&zone->lock)
 939
/*
 *      Exclude more than one concurrent garbage collection
 */
decl_lck_mtx_data(, zone_gc_lock)

/* Attribute, group and extended-lock storage that go with zone_gc_lock. */
lck_attr_t      zone_gc_lck_attr;
lck_grp_t       zone_gc_lck_grp;
lck_grp_attr_t  zone_gc_lck_grp_attr;
lck_mtx_ext_t   zone_gc_lck_ext;

/* Global switches: garbage collection starts out allowed, zprint-on-panic starts out off. */
boolean_t zone_gc_allowed = TRUE;
boolean_t panic_include_zprint = FALSE;

/* NOTE(review): presumably the address and size of kext memory info reported at panic time -- confirm against the writer. */
vm_offset_t panic_kext_memory_info = 0;
vm_size_t panic_kext_memory_size = 0;

/* NOTE(review): the ZALLOC_DEBUG_* values look like bit flags meant for zalloc_debug -- confirm at the use sites. */
#define ZALLOC_DEBUG_ZONEGC             0x00000001
#define ZALLOC_DEBUG_ZCRAM              0x00000002
uint32_t zalloc_debug = 0;
 959
/*
 * Zone leak debugging code
 *
 * When enabled, this code keeps a log to track allocations to a particular zone that have not
 * yet been freed.  Examining this log will reveal the source of a zone leak.  The log is allocated
 * only when logging is enabled, so there is no effect on the system when it's turned off.  Logging is
 * off by default.
 *
 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
 * is the name of the zone you wish to log.
 *
 * This code only tracks one zone, so you need to identify which one is leaking first.
 * (NOTE(review): MAX_NUM_ZONES_ALLOWED_LOGGING below allows up to 5 zones, so the
 * "one zone" statement may be out of date -- confirm.)
 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
 * garbage collector.  Note that the zone name printed in the panic message is not necessarily the one
 * containing the leak.  So do a zprint from gdb and locate the zone with the bloated size.  This
 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test.  The
 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
 * See the help in the kgmacros for usage info.
 *
 *
 * Zone corruption logging
 *
 * Logging can also be used to help identify the source of a zone corruption.  First, identify the zone
 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args.  When -zc is used in conjunction
 * with zlog, it changes the logging style to track both allocations and frees to the zone.  So when the
 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
 * and freed any particular element in the zone.  Use the findelem kgmacro with the address of the element that's been
 * corrupted to examine its history.  This should lead to the source of the corruption.
 */

static boolean_t log_records_init = FALSE;
static int log_records; /* size of the log, expressed in number of records */

#define MAX_NUM_ZONES_ALLOWED_LOGGING   5 /* Maximum 5 zones can be logged at once */

/* Logging budget (starts at the compile-time maximum) and the number of zones logged so far. */
static int  max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
static int  num_zones_logged = 0;

#define MAX_ZONE_NAME   32      /* max length of a zone name we can take from the boot-args */

static char zone_name_to_log[MAX_ZONE_NAME] = "";       /* the zone name we're logging, if any */

/* Log allocations and frees to help debug a zone element corruption */
boolean_t       corruption_debug_flag    = FALSE;    /* enabled by "-zc" boot-arg */

/* Makes pointer-scanning leak detection possible for all zones (DEBUG / DEVELOPMENT kernels only) */
#if DEBUG || DEVELOPMENT
boolean_t       leak_scan_debug_flag     = FALSE;    /* enabled by "-zl" boot-arg */
#endif /* DEBUG || DEVELOPMENT */
1009
1010
/*
 * The number of records in the log is configurable via the zrecs parameter in boot-args.  Set this to
 * the number of records you want in the log.  For example, "zrecs=10" sets it to 10 records. Since this
 * is the number of stacks suspected of leaking, we don't need many records.
 *
 * The upper bound is larger on LP64 builds than on 32-bit builds.
 */

#if     defined(__LP64__)
#define ZRECORDS_MAX            2560            /* Max records allowed in the log */
#else
#define ZRECORDS_MAX            1536            /* Max records allowed in the log */
#endif
#define ZRECORDS_DEFAULT        1024            /* default records in log if zrecs is not specified in boot-args */
1023
/*
 * Each record in the log contains a pointer to the zone element it refers to,
 * and a small array to hold the pc's from the stack trace.  A record is added
 * to the log each time a zalloc() is done in the zone_of_interest.  For leak
 * debugging, the record is cleared when a zfree() is done.  For corruption
 * debugging, the log tracks both allocs and frees.
 * If the log fills, old records are replaced as if it were a circular buffer.
 */


/*
 * Opcodes for the btlog operation field:
 * ZOP_ALLOC tags an allocation record, ZOP_FREE tags a free record.
 */

#define ZOP_ALLOC       1
#define ZOP_FREE        0
1039
1040 /*
1041  * Decide if we want to log this zone by doing a string compare between a zone name and the name
1042  * of the zone to log. Return true if the strings are equal, false otherwise.  Because it's not
1043  * possible to include spaces in strings passed in via the boot-args, a period in the logname will
1044  * match a space in the zone name.
1045  */
1046
1047 static int
1048 log_this_zone(const char *zonename, const char *logname)
1049 {
1050         int len;
1051         const char *zc = zonename;
1052         const char *lc = logname;
1053
1054         /*
1055          * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
1056          */
1057
1058         for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1059
1060                 /*
1061                  * If the current characters don't match, check for a space in
1062                  * in the zone name and a corresponding period in the log name.
1063                  * If that's not there, then the strings don't match.
1064                  */
1065
1066                 if (*zc != *lc && !(*zc == ' ' && *lc == '.'))
1067                         break;
1068
1069                 /*
1070                  * The strings are equal so far.  If we're at the end, then it's a match.
1071                  */
1072
1073                 if (*zc == '\0')
1074                         return TRUE;
1075         }
1076
1077         return FALSE;
1078 }
1079
1080
/*
 * Test if we want to log this zalloc/zfree event.  We log if this is the zone we're interested in
 * (zone_logging is TRUE) and the buffer for the records has been allocated (zlog_btlog is non-NULL).
 *
 * NOTE(review): 'z' is not parenthesized in the expansion, so pass a plain
 * identifier rather than a compound expression.
 */

#define DO_LOGGING(z)           (z->zone_logging == TRUE && z->zlog_btlog)

/* Defined outside this file. */
extern boolean_t kmem_alloc_ready;
1089
1090 #if CONFIG_ZLEAKS
1091 #pragma mark -
1092 #pragma mark Zone Leak Detection
1093
/*
 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
 * allocations made by the zone allocator.  Every zleak_sample_factor allocations in each zone, we capture a
 * backtrace.  Every free, we examine the table and determine if the allocation was being tracked,
 * and stop tracking it if it was being tracked.
 *
 * We track the allocations in the zallocations hash table, which stores the address that was returned from
 * the zone allocator.  Each stored entry in the zallocations table points to an entry in the ztraces table, which
 * stores the backtrace associated with that allocation.  This provides uniquing for the relatively large
 * backtraces - we don't store them more than once.
 *
 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
 * a large amount of virtual space.
 */
#define ZLEAK_STATE_ENABLED             0x01    /* Zone leak monitoring should be turned on if zone_map fills up. */
#define ZLEAK_STATE_ACTIVE              0x02    /* We are actively collecting traces. */
#define ZLEAK_STATE_ACTIVATING          0x04    /* Some thread is doing setup; others should move along. */
#define ZLEAK_STATE_FAILED              0x08    /* Attempt to allocate tables failed.  We will not try again. */
uint32_t        zleak_state = 0;                /* State of collection, as above */

/*
 * The two thresholds are computed in zleak_init(): the global one is half of
 * the maximum zone map size, the per-zone one is an eighth of the global one.
 * zleak_sample_factor can be overridden with the "zfactor" boot-arg.
 */
boolean_t       panic_include_ztrace    = FALSE;        /* Enable zleak logging on panic */
vm_size_t       zleak_global_tracking_threshold;        /* Size of zone map at which to start collecting data */
vm_size_t       zleak_per_zone_tracking_threshold;      /* Size a zone will have before we will collect data on it */
unsigned int    zleak_sample_factor     = 1000;         /* Allocations per sample attempt */

/*
 * Counters for allocation statistics.
 */

/* Times two active records want to occupy the same spot */
unsigned int z_alloc_collisions = 0;
unsigned int z_trace_collisions = 0;

/* Times a new record lands on a spot previously occupied by a freed allocation */
unsigned int z_alloc_overwrites = 0;
unsigned int z_trace_overwrites = 0;

/* Times a new alloc or trace is put into the hash table */
unsigned int z_alloc_recorded   = 0;
unsigned int z_trace_recorded   = 0;

/* Times zleak_log returned false due to not being able to acquire the lock */
unsigned int z_total_conflicts  = 0;
1137
1138
#pragma mark struct zallocation
/*
 * Structure for keeping track of an allocation.
 * One of these is a bucket of the zallocations hash table.
 * An allocation bucket is in use if its element is not NULL.
 */
struct zallocation {
        uintptr_t               za_element;             /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
        vm_size_t               za_size;                        /* how much memory did this allocation take up? */
        uint32_t                za_trace_index; /* index into ztraces for backtrace associated with allocation */
        /* TODO: #if this out */
        uint32_t                za_hit_count;           /* for determining effectiveness of hash function */
};
1151
/*
 * Size must be a power of two for the zhash to be able to just mask off bits instead of mod.
 * zleak_init() lets the "zleak-allocs" and "zleak-traces" boot-args override these values.
 */
uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;

/* Maximum zone map size as passed to zleak_init() */
vm_size_t zleak_max_zonemap_size;

/* Hashmaps of allocations and their corresponding traces; allocated by zleak_activate() */
static struct zallocation*      zallocations;
static struct ztrace*           ztraces;

/*
 * Not static so that panic can see this, see kern/debug.c.
 * zleak_activate() points it at ztraces[0]; zleak_log() moves it to a trace
 * whose zt_size exceeds that of the current top trace.
 */
struct ztrace*                          top_ztrace;

/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
static lck_spin_t                       zleak_lock;
static lck_attr_t                       zleak_lock_attr;
static lck_grp_t                        zleak_lock_grp;
static lck_grp_attr_t                   zleak_lock_grp_attr;
1170
1171 /*
1172  * Initializes the zone leak monitor.  Called from zone_init()
1173  */
1174 static void
1175 zleak_init(vm_size_t max_zonemap_size)
1176 {
1177         char                    scratch_buf[16];
1178         boolean_t               zleak_enable_flag = FALSE;
1179
1180         zleak_max_zonemap_size = max_zonemap_size;
1181         zleak_global_tracking_threshold = max_zonemap_size / 2;
1182         zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
1183
1184         /* -zleakoff (flag to disable zone leak monitor) */
1185         if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
1186                 zleak_enable_flag = FALSE;
1187                 printf("zone leak detection disabled\n");
1188         } else {
1189                 zleak_enable_flag = TRUE;
1190                 printf("zone leak detection enabled\n");
1191         }
1192
1193         /* zfactor=XXXX (override how often to sample the zone allocator) */
1194         if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
1195                 printf("Zone leak factor override: %u\n", zleak_sample_factor);
1196         }
1197
1198         /* zleak-allocs=XXXX (override number of buckets in zallocations) */
1199         if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
1200                 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
1201                 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1202                 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets-1))) {
1203                         printf("Override isn't a power of two, bad things might happen!\n");
1204                 }
1205         }
1206
1207         /* zleak-traces=XXXX (override number of buckets in ztraces) */
1208         if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
1209                 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
1210                 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1211                 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets-1))) {
1212                         printf("Override isn't a power of two, bad things might happen!\n");
1213                 }
1214         }
1215
1216         /* allocate the zleak_lock */
1217         lck_grp_attr_setdefault(&zleak_lock_grp_attr);
1218         lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr);
1219         lck_attr_setdefault(&zleak_lock_attr);
1220         lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr);
1221
1222         if (zleak_enable_flag) {
1223                 zleak_state = ZLEAK_STATE_ENABLED;
1224         }
1225 }
1226
#if CONFIG_ZLEAKS

/*
 * get_zleak_state:
 *
 * Support for the kern.zleak.active sysctl: collapses zleak_state into a
 * single integer.
 *
 * Returns -1 if ZLEAK_STATE_FAILED is set, otherwise 1 if
 * ZLEAK_STATE_ACTIVE is set, otherwise 0.
 */
int
get_zleak_state(void)
{
        int simplified;

        if (zleak_state & ZLEAK_STATE_FAILED) {
                /* failure takes precedence over everything else */
                simplified = -1;
        } else if (zleak_state & ZLEAK_STATE_ACTIVE) {
                simplified = 1;
        } else {
                simplified = 0;
        }

        return simplified;
}

#endif
1244
1245
1246 kern_return_t
1247 zleak_activate(void)
1248 {
1249         kern_return_t retval;
1250         vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
1251         vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
1252         void *allocations_ptr = NULL;
1253         void *traces_ptr = NULL;
1254
1255         /* Only one thread attempts to activate at a time */
1256         if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1257                 return KERN_SUCCESS;
1258         }
1259
1260         /* Indicate that we're doing the setup */
1261         lck_spin_lock(&zleak_lock);
1262         if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1263                 lck_spin_unlock(&zleak_lock);
1264                 return KERN_SUCCESS;
1265         }
1266
1267         zleak_state |= ZLEAK_STATE_ACTIVATING;
1268         lck_spin_unlock(&zleak_lock);
1269
1270         /* Allocate and zero tables */
1271         retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
1272         if (retval != KERN_SUCCESS) {
1273                 goto fail;
1274         }
1275
1276         retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
1277         if (retval != KERN_SUCCESS) {
1278                 goto fail;
1279         }
1280
1281         bzero(allocations_ptr, z_alloc_size);
1282         bzero(traces_ptr, z_trace_size);
1283
1284         /* Everything's set.  Install tables, mark active. */
1285         zallocations = allocations_ptr;
1286         ztraces = traces_ptr;
1287
1288         /*
1289          * Initialize the top_ztrace to the first entry in ztraces,
1290          * so we don't have to check for null in zleak_log
1291          */
1292         top_ztrace = &ztraces[0];
1293
1294         /*
1295          * Note that we do need a barrier between installing
1296          * the tables and setting the active flag, because the zfree()
1297          * path accesses the table without a lock if we're active.
1298          */
1299         lck_spin_lock(&zleak_lock);
1300         zleak_state |= ZLEAK_STATE_ACTIVE;
1301         zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1302         lck_spin_unlock(&zleak_lock);
1303
1304         return 0;
1305
1306 fail:
1307         /*
1308          * If we fail to allocate memory, don't further tax
1309          * the system by trying again.
1310          */
1311         lck_spin_lock(&zleak_lock);
1312         zleak_state |= ZLEAK_STATE_FAILED;
1313         zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1314         lck_spin_unlock(&zleak_lock);
1315
1316         if (allocations_ptr != NULL) {
1317                 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
1318         }
1319
1320         if (traces_ptr != NULL) {
1321                 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
1322         }
1323
1324         return retval;
1325 }
1326
1327 /*
1328  * TODO: What about allocations that never get deallocated,
1329  * especially ones with unique backtraces? Should we wait to record
1330  * until after boot has completed?
1331  * (How many persistent zallocs are there?)
1332  */
1333
1334 /*
1335  * This function records the allocation in the allocations table,
1336  * and stores the associated backtrace in the traces table
1337  * (or just increments the refcount if the trace is already recorded)
1338  * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
1339  * the associated trace's refcount is decremented.
1340  * If the trace slot is in use, it returns.
1341  * The refcount is incremented by the amount of memory the allocation consumes.
1342  * The return value indicates whether to try again next time.
1343  */
1344 static boolean_t
1345 zleak_log(uintptr_t* bt,
1346                   uintptr_t addr,
1347                   uint32_t depth,
1348                   vm_size_t allocation_size)
1349 {
1350         /* Quit if there's someone else modifying the hash tables */
1351         if (!lck_spin_try_lock(&zleak_lock)) {
1352                 z_total_conflicts++;
1353                 return FALSE;
1354         }
1355
1356         struct zallocation* allocation  = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1357
1358         uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
1359         struct ztrace* trace = &ztraces[trace_index];
1360
1361         allocation->za_hit_count++;
1362         trace->zt_hit_count++;
1363
1364         /*
1365          * If the allocation bucket we want to be in is occupied, and if the occupier
1366          * has the same trace as us, just bail.
1367          */
1368         if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
1369                 z_alloc_collisions++;
1370
1371                 lck_spin_unlock(&zleak_lock);
1372                 return TRUE;
1373         }
1374
1375         /* STEP 1: Store the backtrace in the traces array. */
1376         /* A size of zero indicates that the trace bucket is free. */
1377
1378         if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0 ) {
1379                 /*
1380                  * Different unique trace with same hash!
1381                  * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
1382                  * and get out of the way for later chances
1383                  */
1384                 trace->zt_collisions++;
1385                 z_trace_collisions++;
1386
1387                 lck_spin_unlock(&zleak_lock);
1388                 return TRUE;
1389         } else if (trace->zt_size > 0) {
1390                 /* Same trace, already added, so increment refcount */
1391                 trace->zt_size += allocation_size;
1392         } else {
1393                 /* Found an unused trace bucket, record the trace here! */
1394                 if (trace->zt_depth != 0) /* if this slot was previously used but not currently in use */
1395                         z_trace_overwrites++;
1396
1397                 z_trace_recorded++;
1398                 trace->zt_size                  = allocation_size;
1399                 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)) );
1400
1401                 trace->zt_depth         = depth;
1402                 trace->zt_collisions    = 0;
1403         }
1404
1405         /* STEP 2: Store the allocation record in the allocations array. */
1406
1407         if (allocation->za_element != (uintptr_t) 0) {
1408                 /*
1409                  * Straight up replace any allocation record that was there.  We don't want to do the work
1410                  * to preserve the allocation entries that were there, because we only record a subset of the
1411                  * allocations anyways.
1412                  */
1413
1414                 z_alloc_collisions++;
1415
1416                 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
1417                 /* Knock off old allocation's size, not the new allocation */
1418                 associated_trace->zt_size -= allocation->za_size;
1419         } else if (allocation->za_trace_index != 0) {
1420                 /* Slot previously used but not currently in use */
1421                 z_alloc_overwrites++;
1422         }
1423
1424         allocation->za_element          = addr;
1425         allocation->za_trace_index      = trace_index;
1426         allocation->za_size             = allocation_size;
1427
1428         z_alloc_recorded++;
1429
1430         if (top_ztrace->zt_size < trace->zt_size)
1431                 top_ztrace = trace;
1432
1433         lck_spin_unlock(&zleak_lock);
1434         return TRUE;
1435 }
1436
1437 /*
1438  * Free the allocation record and release the stacktrace.
1439  * This should be as fast as possible because it will be called for every free.
1440  */
1441 static void
1442 zleak_free(uintptr_t addr,
1443                    vm_size_t allocation_size)
1444 {
1445         if (addr == (uintptr_t) 0)
1446                 return;
1447
1448         struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1449
1450         /* Double-checked locking: check to find out if we're interested, lock, check to make
1451          * sure it hasn't changed, then modify it, and release the lock.
1452          */
1453
1454         if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
1455                 /* if the allocation was the one, grab the lock, check again, then delete it */
1456                 lck_spin_lock(&zleak_lock);
1457
1458                 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
1459                         struct ztrace *trace;
1460
1461                         /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
1462                         if (allocation->za_size != allocation_size) {
1463                                 panic("Freeing as size %lu memory that was allocated with size %lu\n",
1464                                                 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
1465                         }
1466
1467                         trace = &ztraces[allocation->za_trace_index];
1468
1469                         /* size of 0 indicates trace bucket is unused */
1470                         if (trace->zt_size > 0) {
1471                                 trace->zt_size -= allocation_size;
1472                         }
1473
1474                         /* A NULL element means the allocation bucket is unused */
1475                         allocation->za_element = 0;
1476                 }
1477                 lck_spin_unlock(&zleak_lock);
1478         }
1479 }
1480
1481 #endif /* CONFIG_ZLEAKS */
1482
/*  These functions are outside of CONFIG_ZLEAKS because they are also used in
 *  mbuf.c for mbuf leak-detection.  This is why they lack the z_ prefix.
 */
1486
/*
 * Integer bit mixer: runs x through a fixed sequence of shift / add / xor
 * rounds and returns the scrambled value.  The 64-bit round sequence is used
 * when __LP64__ is defined, the 32-bit one otherwise.
 *
 * "Thomas Wang's 32/64 bit mix functions."  http://www.concentric.net/~Ttwang/tech/inthash.htm
 */
uintptr_t
hash_mix(uintptr_t x)
{
#ifdef __LP64__
	x = x + ~(x << 32);
	x = x ^  (x >> 22);
	x = x + ~(x << 13);
	x = x ^  (x >> 8);
	x = x +  (x << 3);
	x = x ^  (x >> 15);
	x = x + ~(x << 27);
	x = x ^  (x >> 31);
#else /* !__LP64__ */
	x = x + ~(x << 15);
	x = x ^  (x >> 10);
	x = x +  (x << 3);
	x = x ^  (x >> 6);
	x = x + ~(x << 11);
	x = x ^  (x >> 16);
#endif /* __LP64__ */

	return x;
}
1510
/*
 * Reduce a backtrace to a bucket index.
 *
 * The frames bt[depth-1] .. bt[0] are summed, the sum is passed through
 * hash_mix(), and the result is masked with (max_size - 1).  The closing
 * assert expects the masked value to be below max_size (max_size is
 * presumably a power of 2, as hashaddr() requires).
 */
uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{
	uintptr_t bucket_mask = max_size - 1;
	uintptr_t frame_sum = 0;
	uintptr_t bucket;
	uint32_t remaining;

	/* walk from the last frame down to the first */
	for (remaining = depth; remaining != 0; remaining--) {
		frame_sum += bt[remaining - 1];
	}

	bucket = hash_mix(frame_sum) & bucket_mask;

	assert(bucket < max_size);

	return (uint32_t) bucket;
}
1528
/*
 *  Reduce an address to a bucket index: hash_mix(pt) masked with (max_size - 1).
 *
 *  max_size must be a power of 2 (e.g. 0x10000, whose max_size - 1 is the
 *  bitmask 0x0FFFF); only then is the masked value guaranteed to be a valid
 *  index below max_size.
 *
 *  TODO: Determine how well distributed this is
 */
uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	uintptr_t bucket_mask = max_size - 1;
	uintptr_t bucket = hash_mix(pt) & bucket_mask;

	assert(bucket < max_size);

	return (uint32_t) bucket;
}
1545
1546 /* End of all leak-detection code */
1547 #pragma mark -
1548
/* Largest allocation chunk size considered when sizing a zone (bytes). */
#define ZONE_MAX_ALLOC_SIZE	(32 * 1024)
/*
 * Percentage of an alloc_size-byte chunk that is left over after packing in
 * as many whole ele_size-byte elements as fit.  Both arguments are fully
 * parenthesized so that expression arguments (e.g. "a + b") expand correctly;
 * each argument may be evaluated more than once.
 */
#define ZONE_ALLOC_FRAG_PERCENT(alloc_size, ele_size) ((((alloc_size) % (ele_size)) * 100) / (alloc_size))
1551
1552 /*
1553  *      zinit initializes a new zone.  The zone data structures themselves
1554  *      are stored in a zone, which is initially a static structure that
1555  *      is initialized by zone_init.
1556  */
1557 zone_t
1558 zinit(
1559         vm_size_t       size,           /* the size of an element */
1560         vm_size_t       max,            /* maximum memory to use */
1561         vm_size_t       alloc,          /* allocation size */
1562         const char      *name)          /* a name for the zone */
1563 {
1564         zone_t          z;
1565
1566         simple_lock(&all_zones_lock);
1567         assert(num_zones < MAX_ZONES);
1568         z = &(zone_array[num_zones]);
1569         z->index = num_zones;
1570         num_zones++;
1571         simple_unlock(&all_zones_lock);
1572
1573         /* Zone elements must fit both a next pointer and a backup pointer */
1574         vm_size_t  minimum_element_size = sizeof(vm_offset_t) * 2;
1575         if (size < minimum_element_size)
1576                 size = minimum_element_size;
1577
1578         /*
1579          *  Round element size to a multiple of sizeof(pointer)
1580          *  This also enforces that allocations will be aligned on pointer boundaries
1581          */
1582         size = ((size-1) + sizeof(vm_offset_t)) -
1583                ((size-1) % sizeof(vm_offset_t));
1584
1585         if (alloc == 0)
1586                 alloc = PAGE_SIZE;
1587
1588         alloc = round_page(alloc);
1589         max   = round_page(max);
1590
1591         vm_size_t best_alloc = PAGE_SIZE;
1592         vm_size_t alloc_size;
1593         for (alloc_size = (2 * PAGE_SIZE); alloc_size <= ZONE_MAX_ALLOC_SIZE; alloc_size += PAGE_SIZE) {
1594                 if (ZONE_ALLOC_FRAG_PERCENT(alloc_size, size) < ZONE_ALLOC_FRAG_PERCENT(best_alloc, size)) {
1595                         best_alloc = alloc_size;
1596                 }
1597         }
1598         alloc = best_alloc;
1599         if (max && (max < alloc))
1600                 max = alloc;
1601
1602         z->free_elements = NULL;
1603         queue_init(&z->pages.any_free_foreign);
1604         queue_init(&z->pages.all_free);
1605         queue_init(&z->pages.intermediate);
1606         queue_init(&z->pages.all_used);
1607         z->cur_size = 0;
1608         z->page_count = 0;
1609         z->max_size = max;
1610         z->elem_size = size;
1611         z->alloc_size = alloc;
1612         z->zone_name = name;
1613         z->count = 0;
1614         z->countfree = 0;
1615         z->count_all_free_pages = 0;
1616         z->sum_count = 0LL;
1617         z->doing_alloc_without_vm_priv = FALSE;
1618         z->doing_alloc_with_vm_priv = FALSE;
1619         z->exhaustible = FALSE;
1620         z->collectable = TRUE;
1621         z->allows_foreign = FALSE;
1622         z->expandable  = TRUE;
1623         z->waiting = FALSE;
1624         z->async_pending = FALSE;
1625         z->caller_acct = TRUE;
1626         z->noencrypt = FALSE;
1627         z->no_callout = FALSE;
1628         z->async_prio_refill = FALSE;
1629         z->gzalloc_exempt = FALSE;
1630         z->alignment_required = FALSE;
1631         z->zone_replenishing = FALSE;
1632         z->prio_refill_watermark = 0;
1633         z->zone_replenish_thread = NULL;
1634         z->zp_count = 0;
1635
1636 #if CONFIG_ZLEAKS
1637         z->zleak_capture = 0;
1638         z->zleak_on = FALSE;
1639 #endif /* CONFIG_ZLEAKS */
1640
1641         lock_zone_init(z);
1642
1643         /*
1644          * Check for and set up zone leak detection if requested via boot-args.  We recognized two
1645          * boot-args:
1646          *
1647          *      zlog=<zone_to_log>
1648          *      zrecs=<num_records_in_log>
1649          *
1650          * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
1651          * control the size of the log.  If zrecs is not specified, a default value is used.
1652          */
1653
1654         if (num_zones_logged < max_num_zones_to_log) {
1655
1656                 int             i = 1; /* zlog0 isn't allowed. */
1657                 boolean_t       zone_logging_enabled = FALSE;
1658                 char            zlog_name[MAX_ZONE_NAME] = ""; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
1659
1660                 while (i <= max_num_zones_to_log) {
1661
1662                         snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
1663
1664                         if (PE_parse_boot_argn(zlog_name, zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
1665                                 if (log_this_zone(z->zone_name, zone_name_to_log)) {
1666                                         z->zone_logging = TRUE;
1667                                         zone_logging_enabled = TRUE;
1668                                         num_zones_logged++;
1669                                         break;
1670                                 }
1671                         }
1672                         i++;
1673                 }
1674
1675                 if (zone_logging_enabled == FALSE) {
1676                         /*
1677                          * Backwards compat. with the old boot-arg used to specify single zone logging i.e. zlog
1678                          * Needs to happen after the newer zlogn checks because the prefix will match all the zlogn
1679                          * boot-args.
1680                          */
1681                         if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
1682                                 if (log_this_zone(z->zone_name, zone_name_to_log)) {
1683                                                 z->zone_logging = TRUE;
1684                                                 zone_logging_enabled = TRUE;
1685                                                 num_zones_logged++;
1686                                 }
1687                         }
1688                 }
1689
1690                 if (log_records_init == FALSE && zone_logging_enabled == TRUE) {
1691                     if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {
1692                                 /*
1693                                  * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
1694                                  * This prevents accidentally hogging too much kernel memory and making the system
1695                                  * unusable.
1696                                  */
1697
1698                                 log_records = MIN(ZRECORDS_MAX, log_records);
1699                                 log_records_init = TRUE;
1700                         } else {
1701                                 log_records = ZRECORDS_DEFAULT;
1702                                 log_records_init = TRUE;
1703                         }
1704                 }
1705
1706                 /*
1707                  * If we want to log a zone, see if we need to allocate buffer space for the log.  Some vm related zones are
1708                  * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case.  kmem_alloc_ready is set to
1709                  * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work.  If we want to log one
1710                  * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
1711                  * later on some other zone.  So note we may be allocating a buffer to log a zone other than the one being initialized
1712                  * right now.
1713                  */
1714                 if (kmem_alloc_ready) {
1715
1716                         zone_t curr_zone = NULL;
1717                         unsigned int max_zones = 0, zone_idx = 0;
1718
1719                         simple_lock(&all_zones_lock);
1720                         max_zones = num_zones;
1721                         simple_unlock(&all_zones_lock);
1722
1723                         for (zone_idx = 0; zone_idx < max_zones; zone_idx++) {
1724
1725                                 curr_zone = &(zone_array[zone_idx]);
1726
1727                                 /*
1728                                  * We work with the zone unlocked here because we could end up needing the zone lock to
1729                                  * enable logging for this zone e.g. need a VM object to allocate memory to enable logging for the
1730                                  * VM objects zone.
1731                                  *
1732                                  * We don't expect these zones to be needed at this early a time in boot and so take this chance.
1733                                  */
1734                                 if (curr_zone->zone_logging && curr_zone->zlog_btlog == NULL) {
1735
1736                                         curr_zone->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
1737
1738                                         if (curr_zone->zlog_btlog) {
1739
1740                                                 printf("zone: logging started for zone %s\n", curr_zone->zone_name);
1741                                         } else {
1742                                                 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
1743                                                 curr_zone->zone_logging = FALSE;
1744                                         }
1745                                 }
1746
1747                         }
1748                 }
1749         }
1750
1751 #if     CONFIG_GZALLOC
1752         gzalloc_zone_init(z);
1753 #endif
1754         return(z);
1755 }
1756 unsigned        zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count;
1757
1758 static void zone_replenish_thread(zone_t);
1759
1760 /* High priority VM privileged thread used to asynchronously refill a designated
1761  * zone, such as the reserved VM map entry zone.
1762  */
1763 __attribute__((noreturn))
1764 static void
1765 zone_replenish_thread(zone_t z)
1766 {
1767         vm_size_t free_size;
1768         current_thread()->options |= TH_OPT_VMPRIV;
1769
1770         for (;;) {
1771                 lock_zone(z);
1772                 z->zone_replenishing = TRUE;
1773                 assert(z->prio_refill_watermark != 0);
1774                 while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) {
1775                         assert(z->doing_alloc_without_vm_priv == FALSE);
1776                         assert(z->doing_alloc_with_vm_priv == FALSE);
1777                         assert(z->async_prio_refill == TRUE);
1778
1779                         unlock_zone(z);
1780                         int     zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
1781                         vm_offset_t space, alloc_size;
1782                         kern_return_t kr;
1783
1784                         if (vm_pool_low())
1785                                 alloc_size = round_page(z->elem_size);
1786                         else
1787                                 alloc_size = z->alloc_size;
1788
1789                         if (z->noencrypt)
1790                                 zflags |= KMA_NOENCRYPT;
1791
1792                         kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
1793
1794                         if (kr == KERN_SUCCESS) {
1795                                 zcram(z, space, alloc_size);
1796                         } else if (kr == KERN_RESOURCE_SHORTAGE) {
1797                                 VM_PAGE_WAIT();
1798                         } else if (kr == KERN_NO_SPACE) {
1799                                 kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
1800                                 if (kr == KERN_SUCCESS) {
1801                                         zcram(z, space, alloc_size);
1802                                 } else {
1803                                         assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
1804                                         thread_block(THREAD_CONTINUE_NULL);
1805                                 }
1806                         }
1807
1808                         lock_zone(z);
1809                         zone_replenish_loops++;
1810                 }
1811
1812                 z->zone_replenishing = FALSE;
1813                 /* Signal any potential throttled consumers, terminating
1814                  * their timer-bounded waits.
1815                  */
1816                 thread_wakeup(z);
1817
1818                 assert_wait(&z->zone_replenish_thread, THREAD_UNINT);
1819                 unlock_zone(z);
1820                 thread_block(THREAD_CONTINUE_NULL);
1821                 zone_replenish_wakeups++;
1822         }
1823 }
1824
1825 void
1826 zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark) {
1827         z->prio_refill_watermark = low_water_mark;
1828
1829         z->async_prio_refill = TRUE;
1830         OSMemoryBarrier();
1831         kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread);
1832
1833         if (tres != KERN_SUCCESS) {
1834                 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
1835         }
1836
1837         thread_deallocate(z->zone_replenish_thread);
1838 }
1839
1840 /* Initialize the metadata for an allocation chunk */
1841 static inline void
1842 zcram_metadata_init(vm_offset_t newmem, vm_size_t size, struct zone_page_metadata *chunk_metadata)
1843 {
1844         struct zone_page_metadata *page_metadata;
1845
1846         /* The first page is the real metadata for this allocation chunk. We mark the others as fake metadata */
1847         size -= PAGE_SIZE;
1848         newmem += PAGE_SIZE;
1849
1850         for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
1851                 page_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
1852                 assert(page_metadata != chunk_metadata);
1853                 PAGE_METADATA_SET_ZINDEX(page_metadata, MULTIPAGE_METADATA_MAGIC);
1854                 page_metadata_set_realmeta(page_metadata, chunk_metadata);
1855                 page_metadata->free_count = 0;
1856         }
1857         return;
1858 }
1859
1860
/*
 * Boolean Random Number Generator for generating booleans to randomize
 * the order of elements in newly zcram()'ed memory. The algorithm is a
 * modified version of the KISS RNG proposed in the paper:
 * http://stat.fsu.edu/techreports/M802.pdf
 * The modifications have been documented in the technical
 * paper from UCL:
 * http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf
 */
1870
1871 static void random_bool_gen_entropy(
1872                 int     *buffer,
1873                 int     count)
1874 {
1875
1876         int i, t;
1877         simple_lock(&bool_gen_lock);
1878         for (i = 0; i < count; i++) {
1879                 bool_gen_seed[1] ^= (bool_gen_seed[1] << 5);
1880                 bool_gen_seed[1] ^= (bool_gen_seed[1] >> 7);
1881                 bool_gen_seed[1] ^= (bool_gen_seed[1] << 22);
1882                 t = bool_gen_seed[2] + bool_gen_seed[3] + bool_gen_global;
1883                 bool_gen_seed[2] = bool_gen_seed[3];
1884                 bool_gen_global = t < 0;
1885                 bool_gen_seed[3] = t &2147483647;
1886                 bool_gen_seed[0] += 1411392427;
1887                 buffer[i] = (bool_gen_seed[0] + bool_gen_seed[1] + bool_gen_seed[3]);
1888         }
1889         simple_unlock(&bool_gen_lock);
1890 }
1891
1892 static boolean_t random_bool_gen(
1893                 int     *buffer,
1894                 int     index,
1895                 int     bufsize)
1896 {
1897         int valindex, bitpos;
1898         valindex = (index / (8 * sizeof(int))) % bufsize;
1899         bitpos = index % (8 * sizeof(int));
1900         return (boolean_t)(buffer[valindex] & (1 << bitpos));
1901 }
1902
1903 static void
1904 random_free_to_zone(
1905                         zone_t          zone,
1906                         vm_offset_t     newmem,
1907                         vm_offset_t     first_element_offset,
1908                         int             element_count,
1909                         int             *entropy_buffer)
1910 {
1911         vm_offset_t     last_element_offset;
1912         vm_offset_t     element_addr;
1913         vm_size_t       elem_size;
1914         int             index;
1915
1916         elem_size = zone->elem_size;
1917         last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size);
1918         for (index = 0; index < element_count; index++) {
1919                 assert(first_element_offset <= last_element_offset);
1920                 if (
1921 #if DEBUG || DEVELOPMENT
1922                 leak_scan_debug_flag ||
1923 #endif /* DEBUG || DEVELOPMENT */
1924                 random_bool_gen(entropy_buffer, index, MAX_ENTROPY_PER_ZCRAM)) {
1925                         element_addr = newmem + first_element_offset;
1926                         first_element_offset += elem_size;
1927                 } else {
1928                         element_addr = newmem + last_element_offset;
1929                         last_element_offset -= elem_size;
1930                 }
1931                 if (element_addr != (vm_offset_t)zone) {
1932                         zone->count++;  /* compensate for free_to_zone */
1933                         free_to_zone(zone, element_addr, FALSE);
1934                 }
1935                 zone->cur_size += elem_size;
1936         }
1937 }
1938
1939 /*
1940  *      Cram the given memory into the specified zone. Update the zone page count accordingly.
1941  */
1942 void
1943 zcram(
1944         zone_t          zone,
1945         vm_offset_t                     newmem,
1946         vm_size_t               size)
1947 {
1948         vm_size_t       elem_size;
1949         boolean_t   from_zm = FALSE;
1950         int element_count;
1951         int entropy_buffer[MAX_ENTROPY_PER_ZCRAM];
1952
1953         /* Basic sanity checks */
1954         assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
1955         assert(!zone->collectable || zone->allows_foreign
1956                 || (from_zone_map(newmem, size)));
1957
1958         elem_size = zone->elem_size;
1959
1960         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START, VM_KERNEL_ADDRPERM(zone), size, 0, 0, 0);
1961
1962         if (from_zone_map(newmem, size))
1963                 from_zm = TRUE;
1964
1965         if (!from_zm) {
1966                 /* We cannot support elements larger than page size for foreign memory because we
1967                  * put metadata on the page itself for each page of foreign memory. We need to do
1968                  * this in order to be able to reach the metadata when any element is freed
1969                  */
1970                 assert((zone->allows_foreign == TRUE) && (zone->elem_size <= (PAGE_SIZE - sizeof(struct zone_page_metadata))));
1971         }
1972
1973         if (zalloc_debug & ZALLOC_DEBUG_ZCRAM)
1974                 kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name,
1975                                 (unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size);
1976
1977         ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE));
1978
1979         random_bool_gen_entropy(entropy_buffer, MAX_ENTROPY_PER_ZCRAM);
1980
1981         /*
1982          * Initialize the metadata for all pages. We dont need the zone lock
1983          * here because we are not manipulating any zone related state yet.
1984          */
1985
1986         struct zone_page_metadata *chunk_metadata;
1987         size_t zone_page_metadata_size = sizeof(struct zone_page_metadata);
1988
1989         assert((newmem & PAGE_MASK) == 0);
1990         assert((size & PAGE_MASK) == 0);
1991
1992         chunk_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
1993         chunk_metadata->pages.next = NULL;
1994         chunk_metadata->pages.prev = NULL;
1995         page_metadata_set_freelist(chunk_metadata, 0);
1996         PAGE_METADATA_SET_ZINDEX(chunk_metadata, zone->index);
1997         chunk_metadata->free_count = 0;
1998         chunk_metadata->page_count = (size / PAGE_SIZE);
1999
2000         zcram_metadata_init(newmem, size, chunk_metadata);
2001
2002         lock_zone(zone);
2003         enqueue_tail(&zone->pages.all_used, &(chunk_metadata->pages));
2004
2005         if (!from_zm) {
2006                 /* We cannot support elements larger than page size for foreign memory because we
2007                  * put metadata on the page itself for each page of foreign memory. We need to do
2008                  * this in order to be able to reach the metadata when any element is freed
2009                  */
2010
2011                 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2012                         vm_offset_t first_element_offset = 0;
2013                         if (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT == 0){
2014                                 first_element_offset = zone_page_metadata_size;
2015                         } else {
2016                                 first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT));
2017                         }
2018                         element_count = (int)((PAGE_SIZE - first_element_offset) / elem_size);
2019                         random_free_to_zone(zone, newmem, first_element_offset, element_count, entropy_buffer);
2020                 }
2021         } else {
2022                 element_count = (int)(size / elem_size);
2023                 random_free_to_zone(zone, newmem, 0, element_count, entropy_buffer);
2024         }
2025         unlock_zone(zone);
2026
2027         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, VM_KERNEL_ADDRPERM(zone), 0, 0, 0, 0);
2028
2029 }
2030
2031 /*
2032  * Fill a zone with enough memory to contain at least nelem elements.
2033  * Memory is obtained with kmem_alloc_kobject from the kernel_map.
2034  * Return the number of elements actually put into the zone, which may
2035  * be more than the caller asked for since the memory allocation is
2036  * rounded up to a full page.
2037  */
2038 int
2039 zfill(
2040         zone_t  zone,
2041         int     nelem)
2042 {
2043         kern_return_t   kr;
2044         vm_size_t       size;
2045         vm_offset_t     memory;
2046         int             nalloc;
2047
2048         assert(nelem > 0);
2049         if (nelem <= 0)
2050                 return 0;
2051         size = nelem * zone->elem_size;
2052         size = round_page(size);
2053         kr = kmem_alloc_kobject(kernel_map, &memory, size, VM_KERN_MEMORY_ZONE);
2054         if (kr != KERN_SUCCESS)
2055                 return 0;
2056
2057         zone_change(zone, Z_FOREIGN, TRUE);
2058         zcram(zone, memory, size);
2059         nalloc = (int)(size / zone->elem_size);
2060         assert(nalloc >= nelem);
2061
2062         return nalloc;
2063 }
2064
2065 /*
2066  *      Initialize the "zone of zones" which uses fixed memory allocated
2067  *      earlier in memory initialization.  zone_bootstrap is called
2068  *      before zone_init.
2069  */
2070 void
2071 zone_bootstrap(void)
2072 {
2073         char temp_buf[16];
2074         unsigned int i;
2075
2076         if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug)))
2077                 zalloc_debug = 0;
2078
2079         /* Set up zone element poisoning */
2080         zp_init();
2081
2082         /* Seed the random boolean generator for elements in zone free list */
2083         for (i = 0; i < RANDOM_BOOL_GEN_SEED_COUNT; i++) {
2084                 bool_gen_seed[i] = (unsigned int)early_random();
2085         }
2086         simple_lock_init(&bool_gen_lock, 0);
2087
2088         /* should zlog log to debug zone corruption instead of leaks? */
2089         if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
2090                 corruption_debug_flag = TRUE;
2091         }
2092
2093 #if DEBUG || DEVELOPMENT
2094         /* disable element location randomization in a page */
2095         if (PE_parse_boot_argn("-zl", temp_buf, sizeof(temp_buf))) {
2096                 leak_scan_debug_flag = TRUE;
2097         }
2098 #endif
2099
2100         simple_lock_init(&all_zones_lock, 0);
2101
2102         num_zones = 0;
2103         thread_call_setup(&call_async_alloc, zalloc_async, NULL);
2104
2105         /* initializing global lock group for zones */
2106         lck_grp_attr_setdefault(&zone_locks_grp_attr);
2107         lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr);
2108
2109         lck_attr_setdefault(&zone_metadata_lock_attr);
2110         lck_mtx_init_ext(&zone_metadata_region_lck, &zone_metadata_region_lck_ext, &zone_locks_grp, &zone_metadata_lock_attr);
2111 }
2112
2113 /*
2114  * zone_init: global initialization of the zone allocator's VM state.
2115  * Runs after zone_bootstrap.  max_zonemap_size is the size requested
2116  * for the zone submap.  Panics if the submap or the zone metadata
2117  * region cannot be allocated, or if the LP64 pointer-packing check fails.
2118  */
2119 void
2120 zone_init(
2121         vm_size_t max_zonemap_size)
2122 {
2123         vm_offset_t     map_start, map_end;
2124         vm_offset_t     metadata_bytes;
2125         unsigned int    map_pages;
2126         kern_return_t   kr;
2127
2128         /* Carve the zone submap out of the kernel map. */
2129         kr = kmem_suballoc(kernel_map, &map_start, max_zonemap_size, FALSE,
2130                            VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT | VM_MAKE_TAG(VM_KERN_MEMORY_ZONE),
2131                            &zone_map);
2132         if (kr != KERN_SUCCESS) {
2133                 panic("zone_init: kmem_suballoc failed");
2134         }
2135         map_end = map_start + round_page(max_zonemap_size);
2136 #if     CONFIG_GZALLOC
2137         gzalloc_init(max_zonemap_size);
2138 #endif
2139
2140         /* Setup garbage collection information: the bounds of the zone map. */
2141         zone_map_min_address = map_start;
2142         zone_map_max_address = map_end;
2143
2144         /* Size the metadata region: one struct zone_page_metadata per map page. */
2145         map_pages = (unsigned int)atop_kernel(map_end - map_start);
2146         metadata_bytes = round_page(map_pages * sizeof(struct zone_page_metadata));
2147         kr = kernel_memory_allocate(zone_map, &zone_metadata_region_min, metadata_bytes, 0,
2148                                     KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_OSFMK);
2149         if (kr != KERN_SUCCESS) {
2150                 panic("zone_init: zone_metadata_region initialization failed!");
2151         }
2152         zone_metadata_region_max = zone_metadata_region_min + metadata_bytes;
2153
2154 #if defined(__LP64__)
2155         /* Ensure that any vm_page_t created from the vm_page zone can be
2156          * packed properly (see vm_page.h for the packing requirements). */
2157         if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_metadata_region_max))) != (vm_page_t)zone_metadata_region_max) {
2158                 panic("VM_PAGE_PACK_PTR failed on zone_metadata_region_max - %p", (void *)zone_metadata_region_max);
2159         }
2160         if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address))) != (vm_page_t)zone_map_max_address) {
2161                 panic("VM_PAGE_PACK_PTR failed on zone_map_max_address - %p", (void *)zone_map_max_address);
2162         }
2163 #endif
2164
2165         /* Lock group, attributes and mutex taken by zone_gc. */
2166         lck_grp_attr_setdefault(&zone_gc_lck_grp_attr);
2167         lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr);
2168         lck_attr_setdefault(&zone_gc_lck_attr);
2169         lck_mtx_init_ext(&zone_gc_lock, &zone_gc_lck_ext, &zone_gc_lck_grp, &zone_gc_lck_attr);
2170
2171 #if CONFIG_ZLEAKS
2172         zleak_init(max_zonemap_size);   /* Initialize the zone leak monitor */
2173 #endif /* CONFIG_ZLEAKS */
2174 }
2175
2176 extern volatile SInt32 kfree_nop_count; /* defined elsewhere; printed in zalloc_internal's "retry fail" panic */
2177
2178 #pragma mark -
2179 #pragma mark zalloc_canblock
2180
2181 /*
2182  *      zalloc_internal returns an element from the specified zone.
      *
      *      Shared implementation behind zalloc, zalloc_noblock,
      *      zalloc_nopagewait, zalloc_canblock and zget.
      *
      *      zone       - zone to allocate from; asserted to be non-NULL.
      *      canblock   - when TRUE and no free element is available, loop
      *                   expanding the zone with kernel_memory_allocate() and
      *                   zcram(), sleeping while another thread is already
      *                   expanding it.  When FALSE the expansion loop is
      *                   skipped entirely.
      *      nopagewait - when TRUE, leave the expansion loop instead of
      *                   calling VM_PAGE_WAIT() after an expansion attempt
      *                   fails with KERN_RESOURCE_SHORTAGE.
      *
      *      Returns the element, or NULL if none could be obtained.
      *
      *      Panics if the zone is at max_size and is neither exhaustible nor
      *      expandable, if expansion fails three times with an error other
      *      than KERN_RESOURCE_SHORTAGE, or if an element selected for
      *      checking fails the ZP_POISON comparison.
      *
      *      If no element was obtained and the caller could not fully block
      *      (!canblock || nopagewait), an asynchronous refill is requested
      *      through call_async_alloc unless one is already pending, the zone
      *      is no_callout or exhaustible, or vm_pool_low() is true.
2183  */
2184 static void *
2185 zalloc_internal(
2186         zone_t  zone,
2187         boolean_t canblock,
2188         boolean_t nopagewait)
2189 {
2190         vm_offset_t     addr = 0;
2191         kern_return_t   retval;
2192         uintptr_t       zbt[MAX_ZTRACE_DEPTH];  /* used in zone leak logging and zone leak detection */
2193         int             numsaved = 0;
2194         boolean_t       zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE;
2195 #if     CONFIG_GZALLOC
2196         boolean_t       did_gzalloc = FALSE;
2197 #endif
2198         thread_t thr = current_thread();
2199         boolean_t       check_poison = FALSE;
2200         boolean_t       set_doing_alloc_with_vm_priv = FALSE;
2201
2202 #if CONFIG_ZLEAKS
2203         uint32_t        zleak_tracedepth = 0;  /* log this allocation if nonzero */
2204 #endif /* CONFIG_ZLEAKS */
2205
2206         assert(zone != ZONE_NULL);
2207
2208 #if     CONFIG_GZALLOC
             /*
              * gzalloc gets the first chance at the allocation.  A nonzero addr
              * makes the code below skip try_alloc_from_zone() and the expansion
              * loop.
              * NOTE(review): did_gzalloc is assigned here but is not read
              * anywhere else in this function.
              */
2209         addr = gzalloc_alloc(zone, canblock);
2210         did_gzalloc = (addr != 0);
2211 #endif
2212
2213         /*
2214          * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
2215          */
2216         if (__improbable(DO_LOGGING(zone)))
2217                 numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH);
2218
2219 #if CONFIG_ZLEAKS
2220         /*
2221          * Zone leak detection: capture a backtrace every zleak_sample_factor
2222          * allocations in this zone.
2223          */
2224         if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
2225                 /* Avoid backtracing twice if zone logging is on */
2226                 if (numsaved == 0)
2227                         zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH);
2228                 else
2229                         zleak_tracedepth = numsaved;
2230         }
2231 #endif /* CONFIG_ZLEAKS */
2232
2233         lock_zone(zone);
2234
             /*
              * Zones with async_prio_refill and a replenish thread: wake that
              * thread whenever free space is below the refill watermark, and make
              * callers that are not VM privileged wait (1 ms timeout per pass)
              * while free space is below half of it.  zfreec is cur_size minus the
              * bytes taken by zone->count elements; it is recomputed on every pass
              * with the zone locked.
              */
2235         if (zone->async_prio_refill && zone->zone_replenish_thread) {
2236                     do {
2237                             vm_size_t zfreec = (zone->cur_size - (zone->count * zone->elem_size));
2238                             vm_size_t zrefillwm = zone->prio_refill_watermark * zone->elem_size;
2239                             zone_replenish_wakeup = (zfreec < zrefillwm);
2240                             zone_alloc_throttle = (zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0);
2241
2242                             if (zone_replenish_wakeup) {
2243                                     zone_replenish_wakeups_initiated++;
2244                                     /* Signal the potentially waiting
2245                                      * refill thread.
2246                                      */
2247                                     thread_wakeup(&zone->zone_replenish_thread);
2248                                     unlock_zone(zone);
2249                                     /* Scheduling latencies etc. may prevent
2250                                      * the refill thread from keeping up
2251                                      * with demand. Throttle consumers
2252                                      * when we fall below half the
2253                                      * watermark, unless VM privileged
2254                                      */
2255                                     if (zone_alloc_throttle) {
2256                                             zone_replenish_throttle_count++;
2257                                             assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
2258                                             thread_block(THREAD_CONTINUE_NULL);
2259                                     }
2260                                     lock_zone(zone);
2261                             }
2262                     } while (zone_alloc_throttle == TRUE);
2263         }
2264
             /* Try for an element the zone already has, unless gzalloc supplied one. */
2265         if (__probable(addr == 0))
2266                 addr = try_alloc_from_zone(zone, &check_poison);
2267
2268
2269         while ((addr == 0) && canblock) {
2270                 /*
2271                  * zone is empty, try to expand it
2272                  *
2273                  * Note that we now allow up to 2 threads (1 vm_privileged and 1 non-vm_privileged)
2274                  * to expand the zone concurrently...  this is necessary to keep
2275                  * vm_privileged threads running critical code necessary to continue compressing/swapping
2276                  * pages (i.e. making new free pages) from stalling behind non-vm_privileged threads
2277                  * waiting to acquire free pages when the vm_page_free_count is below the
2278                  * vm_page_free_reserved limit.
2279                  */
2280                 if ((zone->doing_alloc_without_vm_priv || zone->doing_alloc_with_vm_priv) &&
2281                     (((thr->options & TH_OPT_VMPRIV) == 0) || zone->doing_alloc_with_vm_priv)) {
2282                         /*
2283                          * This is a non-vm_privileged thread and a non-vm_privileged or
2284                          * a vm_privileged thread is already expanding the zone...
2285                          *    OR
2286                          * this is a vm_privileged thread and a vm_privileged thread is
2287                          * already expanding the zone...
2288                          *
2289                          * In either case wait for a thread to finish, then try again.
2290                          */
2291                         zone->waiting = TRUE;
2292                         zone_sleep(zone);
2293                 } else {
2294                         vm_offset_t space;
2295                         vm_size_t alloc_size;
2296                         int retry = 0;
2297
                             /* Adding one more element would exceed the zone's configured limit. */
2298                         if ((zone->cur_size + zone->elem_size) >
2299                             zone->max_size) {
2300                                 if (zone->exhaustible)
2301                                         break;
2302                                 if (zone->expandable) {
2303                                         /*
2304                                          * We're willing to overflow certain
2305                                          * zones, but not without complaining.
2306                                          *
2307                                          * This is best used in conjunction
2308                                          * with the collectable flag. What we
2309                                          * want is an assurance we can get the
2310                                          * memory back, assuming there's no
2311                                          * leak.
2312                                          */
2313                                         zone->max_size += (zone->max_size >> 1);
2314                                 } else {
2315                                         unlock_zone(zone);
2316
2317                                         panic_include_zprint = TRUE;
2318 #if CONFIG_ZLEAKS
2319                                         if (zleak_state & ZLEAK_STATE_ACTIVE)
2320                                                 panic_include_ztrace = TRUE;
2321 #endif /* CONFIG_ZLEAKS */
2322                                         panic("zalloc: zone \"%s\" empty.", zone->zone_name);
2323                                 }
2324                         }
2325                         /*
2326                          * It is possible that a BG thread is refilling/expanding the zone
2327                          * and gets pre-empted during that operation. That blocks all other
2328                          * threads from making progress leading to a watchdog timeout. To
2329                          * avoid that, boost the thread priority using the rwlock boost
2330                          */
2331                         set_thread_rwlock_boost();
2332
                             /*
                              * Record which kind of expander this thread is.  The local
                              * set_doing_alloc_with_vm_priv remembers the choice so the
                              * matching zone flag is the one cleared after the attempt.
                              */
2333                         if ((thr->options & TH_OPT_VMPRIV)) {
2334                                 zone->doing_alloc_with_vm_priv = TRUE;
2335                                 set_doing_alloc_with_vm_priv = TRUE;
2336                         } else {
2337                                 zone->doing_alloc_without_vm_priv = TRUE;
2338                         }
                             /* The zone lock is not held across the VM allocation below. */
2339                         unlock_zone(zone);
2340
2341                         for (;;) {
2342                                 int     zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
2343
                                     /*
                                      * When the VM pool is low, or after a failed attempt,
                                      * ask only for the pages needed to hold one element;
                                      * otherwise use the zone's configured alloc_size.
                                      */
2344                                 if (vm_pool_low() || retry >= 1)
2345                                         alloc_size =
2346                                                 round_page(zone->elem_size);
2347                                 else
2348                                         alloc_size = zone->alloc_size;
2349
2350                                 if (zone->noencrypt)
2351                                         zflags |= KMA_NOENCRYPT;
2352
2353                                 retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2354                                 if (retval == KERN_SUCCESS) {
2355 #if CONFIG_ZLEAKS
                                             /* Turn on global leak tracking once the zone map passes its threshold. */
2356                                         if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) {
2357                                                 if (zone_map->size >= zleak_global_tracking_threshold) {
2358                                                         kern_return_t kr;
2359
2360                                                         kr = zleak_activate();
2361                                                         if (kr != KERN_SUCCESS) {
2362                                                                 printf("Failed to activate live zone leak debugging (%d).\n", kr);
2363                                                         }
2364                                                 }
2365                                         }
2366
                                             /* Turn on tracking for this zone once it passes the per-zone threshold. */
2367                                         if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) {
2368                                                 if (zone->cur_size > zleak_per_zone_tracking_threshold) {
2369                                                         zone->zleak_on = TRUE;
2370                                                 }
2371                                         }
2372 #endif /* CONFIG_ZLEAKS */
                                             /* Hand the newly allocated memory to the zone. */
2373                                         zcram(zone, space, alloc_size);
2374
2375                                         break;
2376                                 } else if (retval != KERN_RESOURCE_SHORTAGE) {
                                             /*
                                              * Any other failure: try again with the smaller
                                              * request, run zone_gc() after the second failure
                                              * and panic after the third.
                                              */
2377                                         retry++;
2378
2379                                         if (retry == 2) {
2380                                                 zone_gc();
2381                                                 printf("zalloc did gc\n");
2382                                                 zone_display_zprint();
2383                                         }
2384                                         if (retry == 3) {
2385                                                 panic_include_zprint = TRUE;
2386 #if CONFIG_ZLEAKS
2387                                                 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
2388                                                         panic_include_ztrace = TRUE;
2389                                                 }
2390 #endif /* CONFIG_ZLEAKS */
2391                                                 if (retval == KERN_NO_SPACE) {
2392                                                         zone_t zone_largest = zone_find_largest();
2393                                                         panic("zalloc: zone map exhausted while allocating from zone %s, likely due to memory leak in zone %s (%lu total bytes, %d elements allocated)",
2394                                                         zone->zone_name, zone_largest->zone_name,
2395                                                         (unsigned long)zone_largest->cur_size, zone_largest->count);
2396
2397                                                 }
2398                                                 panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count);
2399                                         }
2400                                 } else {
                                             /* KERN_RESOURCE_SHORTAGE: stop retrying; dealt with once the zone is locked again. */
2401                                         break;
2402                                 }
2403                         }
2404                         lock_zone(zone);
2405
2406                         if (set_doing_alloc_with_vm_priv == TRUE)
2407                                 zone->doing_alloc_with_vm_priv = FALSE;
2408                         else
2409                                 zone->doing_alloc_without_vm_priv = FALSE;
2410
                             /* Wake threads that went to sleep above waiting for an expansion to finish. */
2411                         if (zone->waiting) {
2412                                 zone->waiting = FALSE;
2413                                 zone_wakeup(zone);
2414                         }
2415                         clear_thread_rwlock_boost();
2416
2417                         addr = try_alloc_from_zone(zone, &check_poison);
                             /*
                              * Still nothing and the VM reported a resource shortage:
                              * give up for nopagewait callers, otherwise wait for pages
                              * with the zone unlocked and go around the loop again.
                              */
2418                         if (addr == 0 &&
2419                             retval == KERN_RESOURCE_SHORTAGE) {
2420                                 if (nopagewait == TRUE)
2421                                         break;  /* out of the main while loop */
2422                                 unlock_zone(zone);
2423
2424                                 VM_PAGE_WAIT();
2425                                 lock_zone(zone);
2426                         }
2427                 }
2428                 if (addr == 0)
2429                         addr = try_alloc_from_zone(zone, &check_poison);
2430         }
2431
2432 #if CONFIG_ZLEAKS
2433         /* Zone leak detection:
2434          * If we're sampling this allocation, add it to the zleaks hash table.
2435          */
2436         if (addr && zleak_tracedepth > 0)  {
2437                 /* Sampling can fail if another sample is happening at the same time in a different zone. */
2438                 if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
2439                         /* If it failed, roll back the counter so we sample the next allocation instead. */
2440                         zone->zleak_capture = zleak_sample_factor;
2441                 }
2442         }
2443 #endif /* CONFIG_ZLEAKS */
2444
2445
             /*
              * Nothing was obtained and the caller was not allowed to block fully:
              * mark the zone async_pending, kick call_async_alloc with the zone
              * unlocked, then make one more attempt.
              */
2446         if ((addr == 0) && (!canblock || nopagewait) && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
2447                 zone->async_pending = TRUE;
2448                 unlock_zone(zone);
2449                 thread_call_enter(&call_async_alloc);
2450                 lock_zone(zone);
2451                 addr = try_alloc_from_zone(zone, &check_poison);
2452         }
2453
             /* Read elem_size while the zone is still locked; used for the poison handling below. */
2454         vm_offset_t     inner_size = zone->elem_size;
2455
2456         unlock_zone(zone);
2457
2458         if (__improbable(DO_LOGGING(zone) && addr)) {
2459                 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved);
2460         }
2461
             /*
              * check_poison is only ever written through the pointer handed to
              * try_alloc_from_zone().  When set, every word after the first one
              * and before the backup pointer must still equal ZP_POISON.
              */
2462         if (__improbable(check_poison && addr)) {
2463                 vm_offset_t *element_cursor  = ((vm_offset_t *) addr) + 1;
2464                 vm_offset_t *backup  = get_backup_ptr(inner_size, (vm_offset_t *) addr);
2465
2466                 for ( ; element_cursor < backup ; element_cursor++)
2467                         if (__improbable(*element_cursor != ZP_POISON))
2468                                 zone_element_was_modified_panic(zone,
2469                                                                 addr,
2470                                                                 *element_cursor,
2471                                                                 ZP_POISON,
2472                                                                 ((vm_offset_t)element_cursor) - addr);
2473         }
2474
2475         if (addr) {
2476                 /*
2477                  * Clear out the old next pointer and backup to avoid leaking the cookie
2478                  * and so that only values on the freelist have a valid cookie
2479                  */
2480
2481                 vm_offset_t *primary  = (vm_offset_t *) addr;
2482                 vm_offset_t *backup   = get_backup_ptr(inner_size, primary);
2483
2484                 *primary = ZP_POISON;
2485                 *backup  = ZP_POISON;
2486
2487 #if DEBUG || DEVELOPMENT
                     /* Only for zones whose element size is a multiple of sizeof(uintptr_t). */
2488                 if (__improbable(leak_scan_debug_flag && !(zone->elem_size & (sizeof(uintptr_t) - 1)))) {
2489                         int count, idx;
2490                         /* Fill element, from tail, with backtrace in reverse order */
                             /* zbt[0] is skipped, so at most numsaved - 1 frames are stored. */
2491                         if (numsaved == 0) numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH);
2492                         count = (int) (zone->elem_size / sizeof(uintptr_t));
2493                         if (count >= numsaved) count = numsaved - 1;
2494                         for (idx = 0; idx < count; idx++) ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
2495                 }
2496 #endif /* DEBUG || DEVELOPMENT */
2497         }
2498
2499         TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);
2500         return((void *)addr);
2501 }
2502
2503
2504 /* zalloc: allocate one element; may expand the zone and wait for pages. */
2505 void *zalloc(zone_t zone)
2506 {
2507         return zalloc_internal(zone, TRUE, FALSE);
2508 }
2509
2510 /* zalloc_noblock: allocate one element without the blocking expansion loop. */
2511 void *zalloc_noblock(zone_t zone)
2512 {
2513         return zalloc_internal(zone, FALSE, FALSE);
2514 }
2515
2516 /* zalloc_nopagewait: may expand the zone, but gives up on a page shortage. */
2517 void *zalloc_nopagewait(zone_t zone)
2518 {
2519         return zalloc_internal(zone, TRUE, TRUE);
2520 }
2521
2522 /* zalloc_canblock: like zalloc, with the caller choosing whether to block. */
2523 void *zalloc_canblock(zone_t zone, boolean_t canblock)
2524 {
2525         return zalloc_internal(zone, canblock, FALSE);
2526 }
2527
2528
2529 /* zalloc_async: for each zone flagged async_pending, clear the flag and do one blocking zalloc/zfree pair. */
2530 void
2531 zalloc_async(
2532         __unused thread_call_param_t p0,
2533         __unused thread_call_param_t p1)
2534 {
2535         unsigned int    zone_total;
2536         unsigned int    idx;
2537
2538         /* Snapshot the number of zones under all_zones_lock. */
2539         simple_lock(&all_zones_lock);
2540         zone_total = num_zones;
2541         simple_unlock(&all_zones_lock);
2542
2543         for (idx = 0; idx < zone_total; idx++) {
2544                 zone_t          z = &zone_array[idx];
2545                 boolean_t       was_pending;
2546                 /* Test and clear the request with the zone locked. */
2547                 lock_zone(z);
2548                 was_pending = (z->async_pending == TRUE);
2549                 if (was_pending)
2550                         z->async_pending = FALSE;
2551                 unlock_zone(z);
2552
2553                 /* The element is freed straight back; only the blocking allocation matters. */
2554                 if (was_pending)
2555                         zfree(z, zalloc_canblock(z, TRUE));
2556         }
2557 }
2558
2559 /*
2560  *      zget: fetch one element from the given zone without blocking.
2561  *      Calls zalloc_internal with canblock = FALSE and nopagewait = TRUE,
2562  *      so the zone expansion loop is never entered; NULL means nothing was free.
2563  */
2564 void *
2565 zget(zone_t zone)
2566 {
2567         return (zalloc_internal(zone, FALSE, TRUE));
2568 }
2569
2570 /* zone_check: when TRUE, zfree() runs zone_check_freelist() on every free.
2571    Keep this FALSE by default.  Large memory machines run orders of magnitude
        slower in debug mode when it is true.  Use a debugger to enable it if needed. */
2572 /* static */ boolean_t zone_check = FALSE;
2573
2574 /*
2575  * Helper for zone_check_freelist: walk every page metadata entry linked on
2576  * page_queue and every free element chained off each entry.  Panics if a
2577  * free element fails is_sane_zone_element() or has the same address as elem.
2578  */
2579 static void
2580 zone_check_freelist_on_queue(zone_t zone, queue_t page_queue, vm_offset_t elem)
2581 {
2582         struct zone_page_metadata *meta;
2583         struct zone_free_element *free_elt;
2584
2585         meta = (struct zone_page_metadata *)queue_first(page_queue);
2586         while (!queue_end(page_queue, &(meta->pages))) {
2587                 free_elt = page_metadata_get_freelist(meta);
2588                 while (free_elt != NULL) {
2589                         if (!is_sane_zone_element(zone, (vm_address_t)free_elt) ||
2590                             (vm_address_t)free_elt == elem)
2591                                 panic("zone_check_freelist");
2592                         free_elt = free_elt->next;
2593                 }
2594                 meta = (struct zone_page_metadata *)queue_next(&(meta->pages));
2595         }
2596 }
2597
2598 /*
2599  * zone_check_freelist: debug check run by zfree when zone_check is set.
2600  * Scans the free lists of the zone's any_free_foreign (only when the zone
2601  * allows foreign memory), all_free and intermediate page queues, in that
2602  * order, panicking if elem is already on one or an element fails the sanity check.
2603  */
2604 static void zone_check_freelist(zone_t zone, vm_offset_t elem)
2605 {
2606         if (zone->allows_foreign)
2607                 zone_check_freelist_on_queue(zone, &zone->pages.any_free_foreign, elem);
2608
2609         zone_check_freelist_on_queue(zone, &zone->pages.all_free, elem);
2610         zone_check_freelist_on_queue(zone, &zone->pages.intermediate, elem);
2611 }
2612 /*
      *      zfree returns an element to the specified zone.
      *
      *      zone - zone the element belongs to; asserted to be non-NULL.
      *      addr - element being freed.
      *
      *      Unless gzalloc_free() claims the element (CONFIG_GZALLOC only),
      *      the element is optionally filled with ZP_POISON and then handed
      *      to free_to_zone() with the zone locked.
      *
      *      Panics if the page metadata for addr names a different zone, if
      *      from_zone_map() fails for an element of a collectable zone that
      *      does not allow foreign memory, and (MACH_ASSERT only) on a NULL
      *      zone or element or a negative zone count after the free.
      */
2613 void
2614 zfree(
2615         zone_t  zone,
2616         void            *addr)
2617 {
2618         vm_offset_t     elem = (vm_offset_t) addr;
2619         uintptr_t       zbt[MAX_ZTRACE_DEPTH];                  /* only used if zone logging is enabled via boot-args */
2620         int             numsaved = 0;
2621         boolean_t       gzfreed = FALSE;        /* only changed by gzalloc_free() under CONFIG_GZALLOC */
2622         boolean_t       poison = FALSE;         /* passed on to free_to_zone() */
2623
2624         assert(zone != ZONE_NULL);
2625
2626         /*
2627          * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
2628          */
2629
2630         if (__improbable(DO_LOGGING(zone) && corruption_debug_flag))
2631                 numsaved = OSBacktrace((void *)zbt, MAX_ZTRACE_DEPTH);
2632
2633 #if MACH_ASSERT
2634         /* Basic sanity checks */
2635         if (zone == ZONE_NULL || elem == (vm_offset_t)0)
2636                 panic("zfree: NULL");
2637 #endif
2638
2639 #if     CONFIG_GZALLOC
2640         gzfreed = gzalloc_free(zone, addr);
2641 #endif
2642
             /* Elements not taken by gzalloc must belong to this zone according to their page metadata. */
2643         if (!gzfreed) {
2644                 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
2645                 if (zone != PAGE_METADATA_GET_ZONE(page_meta)) {
2646                         panic("Element %p from zone %s caught being freed to wrong zone %s\n", addr, PAGE_METADATA_GET_ZONE(page_meta)->zone_name, zone->zone_name);
2647                 }
2648         }
2649
2650         TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr);
2651
2652         if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign &&
2653                 !from_zone_map(elem, zone->elem_size))) {
2654                 panic("zfree: non-allocated memory in collectable zone!");
2655         }
2656
2657         if ((zp_factor != 0 || zp_tiny_zone_limit != 0) && !gzfreed) {
2658                 /*
2659                  * Poison the memory before it ends up on the freelist to catch
2660                  * use-after-free and use of uninitialized memory
2661                  *
2662                  * Always poison tiny zones' elements (limit is 0 if -no-zp is set)
2663                  * Also poison larger elements periodically
2664                  */
2665
2666                 vm_offset_t     inner_size = zone->elem_size;
2667
                     /* Sampling interval for larger elements grows with the element size. */
2668                 uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);
2669
2670                 if (inner_size <= zp_tiny_zone_limit)
2671                         poison = TRUE;
2672                 else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE)
2673                         poison = TRUE;
2674
2675                 if (__improbable(poison)) {
2676
2677                         /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
2678                         /* Poison everything but primary and backup */
2679                         vm_offset_t *element_cursor  = ((vm_offset_t *) elem) + 1;
2680                         vm_offset_t *backup   = get_backup_ptr(inner_size, (vm_offset_t *)elem);
2681
2682                         for ( ; element_cursor < backup; element_cursor++)
2683                                 *element_cursor = ZP_POISON;
2684                 }
2685         }
2686
2687         /*
2688          * See if we're doing logging on this zone.  There are two styles of logging used depending on
2689          * whether we're trying to catch a leak or corruption.  See comments above in zalloc for details.
2690          */
2691
2692         if (__improbable(DO_LOGGING(zone))) {
2693                 if (corruption_debug_flag) {
2694                         /*
2695                          * We're logging to catch a corruption.  Add a record of this zfree operation
2696                          * to log.
2697                          */
2698                         btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved);
2699                 } else {
2700                         /*
2701                          * We're logging to catch a leak. Remove any record we might have for this
2702                          * element since it's being freed.  Note that we may not find it if the buffer
2703                          * overflowed and that's OK.  Since the log is of a limited size, old records
2704                          * get overwritten if there are more zallocs than zfrees.
2705                          */
2706                         btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
2707                 }
2708         }
2709
2710         lock_zone(zone);
2711
             /* Optional debug pass: panics if elem is already on one of the zone's free lists. */
2712         if (zone_check) {
2713                 zone_check_freelist(zone, elem);
2714         }
2715
2716         if (__probable(!gzfreed))
2717                 free_to_zone(zone, elem, poison);
2718
2719 #if MACH_ASSERT
2720         if (zone->count < 0)
2721                 panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone",
2722                 zone->zone_name, addr);
2723 #endif
2724
2725
2726 #if CONFIG_ZLEAKS
2727         /*
2728          * Zone leak detection: un-track the allocation
2729          */
2730         if (zone->zleak_on) {
2731                 zleak_free(elem, zone->elem_size);
2732         }
2733 #endif /* CONFIG_ZLEAKS */
2734
2735         unlock_zone(zone);
2736 }
2737
2738
2739 /*
2740  *      zone_change: set one boolean attribute of a zone.
2741  *      This routine must be called immediately after zinit.
2742  *
2743  *      zone  - zone to modify; asserted to be non-NULL
2744  *      item  - Z_* selector naming the attribute to set
2745  *      value - new setting; asserted to be TRUE or FALSE
2746  *
2747  *      Panics on an unrecognized selector.
2748  *      When CONFIG_GZALLOC is built in, Z_GZALLOC_EXEMPT and
2749  *      Z_ALIGNMENT_REQUIRED also call gzalloc_reconfigure()
2750  *      for the zone after the flag has been stored.
2751  */
2752 void
2753 zone_change(
2754         zone_t          zone,
2755         unsigned int    item,
2756         boolean_t       value)
2757 {
2758         assert(zone != ZONE_NULL);
2759         assert(value == TRUE || value == FALSE);
2760
2761         switch (item) {
2762         /* Plain flag updates. */
2763         case Z_NOENCRYPT:       zone->noencrypt = value;        break;
2764         case Z_EXHAUST:         zone->exhaustible = value;      break;
2765         case Z_COLLECT:         zone->collectable = value;      break;
2766         case Z_EXPAND:          zone->expandable = value;       break;
2767         case Z_FOREIGN:         zone->allows_foreign = value;   break;
2768         case Z_CALLERACCT:      zone->caller_acct = value;      break;
2769         case Z_NOCALLOUT:       zone->no_callout = value;       break;
2770
2771         /* Flags that gzalloc is also told about. */
2772         case Z_GZALLOC_EXEMPT:
2773                 zone->gzalloc_exempt = value;
2774 #if     CONFIG_GZALLOC
2775                 gzalloc_reconfigure(zone);
2776 #endif
2777                 break;
2778         case Z_ALIGNMENT_REQUIRED:
2779                 zone->alignment_required = value;
2780 #if     CONFIG_GZALLOC
2781                 gzalloc_reconfigure(zone);
2782 #endif
2783                 break;
2784
2785         default:
2786                 panic("Zone_change: Wrong Item Type!");
2787                 /* no break: panic is the last statement of the switch */
2788         }
2789 }
2790
2791 /*
2792  * zone_free_count: report how many elements the zone expects to have free.
2793  * The value is a snapshot of zone->countfree read under the zone lock.
2794  * It will be wrong if elements were zfree'd into the zone without ever
2795  * having been zalloc'd/zget'd; zcram is the correct way to add memory.
2796  */
2797 integer_t
2798 zone_free_count(zone_t zone)
2799 {
2800         integer_t free_count;
2801
2802         /* Read the counter with the zone locked. */
2803         lock_zone(zone);
2804         free_count = zone->countfree;
2805         unlock_zone(zone);
2806
2807         /* A negative count is treated as an invariant violation. */
2808         assert(free_count >= 0);
2809         return free_count;
2810 }
2811
2812 /*      Zone garbage collection
2813  *
2814  *      zone_gc will walk through all the free elements in all the
2815  *      zones that are marked collectable looking for reclaimable
2816  *      pages.  zone_gc is called by consider_zone_gc when the system
2817  *      begins to run out of memory.
2818  */
2819 extern zone_t   vm_map_entry_reserved_zone;     /* defined elsewhere; zone_gc tests its zone_replenishing flag */
2820 uint64_t                zone_gc_bailed = 0;     /* incremented each time zone_gc stops early because of that flag */
2821
2822 void
2823 zone_gc(void)
2824 {
2825         unsigned int    max_zones;
2826         zone_t                  z;
2827         unsigned int    i;
2828         zone_t                  zres = vm_map_entry_reserved_zone;
2829
2830         lck_mtx_lock(&zone_gc_lock);
2831
2832         simple_lock(&all_zones_lock);
2833         max_zones = num_zones;
2834         simple_unlock(&all_zones_lock);
2835
2836         if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
2837                 kprintf("zone_gc() starting...\n");
2838
2839         for (i = 0; i < max_zones; i++) {
2840                 z = &(zone_array[i]);
2841                 vm_size_t                                       elt_size, size_freed;
2842                 int                                                     total_freed_pages = 0;
2843                 struct zone_page_metadata       *page_meta;
2844                 queue_head_t                            page_meta_head;
2845
2846                 assert(z != ZONE_NULL);
2847
2848                 if (!z->collectable)
2849                         continue;
2850
2851                 if (queue_empty(&z->pages.all_free)) {
2852                         continue;
2853                 }
2854
2855                 /*
2856                  * Since kmem_free() might use VM entries from the reserved VM entries zone, we should bail from zone_gc() if we
2857                  * are below the critical threshold for that zone. Otherwise, there could be a deadlock between the zone_gc
2858                  * thread and the zone_replenish thread for the VM entries zone on the zone_map lock.
2859                  */
2860                 if (zres->zone_replenishing) {
2861                         zone_gc_bailed++;
2862                         break;
2863                 }
2864
2865                 lock_zone(z);
2866                 elt_size = z->elem_size;
2867
2868                 if (queue_empty(&z->pages.all_free)) {
2869                         unlock_zone(z);
2870                         continue;
2871                 }
2872
2873                 /*
2874                  * Snatch all of the free elements away from the zone.
2875                  */
2876                 uint64_t old_all_free_count = z->count_all_free_pages;
2877                 queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages);
2878                 queue_init(&z->pages.all_free);
2879                 z->count_all_free_pages = 0;
2880                 unlock_zone(z);
2881
2882                 /* Iterate through all elements to find out size and count of elements we snatched */
2883                 size_freed = 0;
2884                 queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) {
2885                         assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */
2886                         size_freed += elt_size * page_meta->free_count;
2887                 }
2888
2889                 /* Update the zone size and free element count */
2890                 lock_zone(z);
2891                 z->cur_size -= size_freed;
2892                 z->countfree -= size_freed/elt_size;
2893                 unlock_zone(z);
2894
2895                 while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) {
2896                         vm_address_t        free_page_address;
2897                         if (zres->zone_replenishing)
2898                                 break;
2899                         /* Free the pages for metadata and account for them */
2900                         free_page_address = get_zone_page(page_meta);
2901                         ZONE_PAGE_COUNT_DECR(z, page_meta->page_count);
2902                         total_freed_pages += page_meta->page_count;
2903                         old_all_free_count -= page_meta->page_count;
2904                         size_freed -= (elt_size * page_meta->free_count);
2905                         kmem_free(zone_map, free_page_address, (page_meta->page_count * PAGE_SIZE));
2906                         thread_yield_to_preemption();
2907                 }
2908                 if (page_meta != NULL) {
2909                    /*
2910                         * We bailed because the VM entry reserved zone is replenishing. Put the remaining
2911                         * metadata objects back on the all_free list and bail.
2912                         */
2913                         queue_entry_t qe;
2914                         enqueue_head(&page_meta_head, &(page_meta->pages));
2915                         zone_gc_bailed++;
2916
2917                         lock_zone(z);
2918                         qe_foreach_safe(qe, &page_meta_head) {
2919                                 re_queue_tail(&z->pages.all_free, qe);
2920                         }
2921                         z->count_all_free_pages += (int)old_all_free_count;
2922                         z->cur_size += size_freed;
2923                         z->countfree += size_freed/elt_size;
2924                         unlock_zone(z);
2925                         if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
2926                                 kprintf("zone_gc() bailed due to VM entry zone replenishing (zone_gc_bailed: %lld)\n", zone_gc_bailed);
2927                         break;
2928                 }
2929
2930                 /* We freed all the pages from the all_free list for this zone */
2931                 assert(old_all_free_count == 0);
2932
2933                 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
2934                         kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages);
2935         }
2936
2937         lck_mtx_unlock(&zone_gc_lock);
2938 }
2939
2940 extern vm_offset_t kmapoff_kaddr;
2941 extern unsigned int kmapoff_pgcnt;
2942
2943 /*
2944  *      consider_zone_gc:
2945  *
2946  *      Called by the pageout daemon when the system needs more free pages.
2947  */
2948
2949 void
2950 consider_zone_gc(void)
2951 {
2952         if (kmapoff_kaddr != 0) {
2953                 /*
2954                  * One-time reclaim of kernel_map resources we allocated in
2955                  * early boot.
2956                  */
2957                 (void) vm_deallocate(kernel_map,
2958                     kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64);
2959                 kmapoff_kaddr = 0;
2960         }
2961
2962         if (zone_gc_allowed)
2963                 zone_gc();
2964 }
2965
2966 kern_return_t
2967 task_zone_info(
2968         __unused task_t                                 task,
2969         __unused mach_zone_name_array_t *namesp,
2970         __unused mach_msg_type_number_t *namesCntp,
2971         __unused task_zone_info_array_t *infop,
2972         __unused mach_msg_type_number_t *infoCntp)
2973 {
2974         return KERN_FAILURE;
2975 }
2976
2977 kern_return_t
2978 mach_zone_info(
2979         host_priv_t             host,
2980         mach_zone_name_array_t  *namesp,
2981         mach_msg_type_number_t  *namesCntp,
2982         mach_zone_info_array_t  *infop,
2983         mach_msg_type_number_t  *infoCntp)
2984 {
2985         return (mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL));
2986 }
2987
2988
2989 kern_return_t
2990 host_zone_info(
2991         host_priv_t             host,
2992         zone_name_array_t       *namesp,
2993         mach_msg_type_number_t  *namesCntp,
2994         zone_info_array_t       *infop,
2995         mach_msg_type_number_t  *infoCntp)
2996 {
2997         return (mach_memory_info(host, (mach_zone_name_array_t *)namesp, namesCntp, (mach_zone_info_array_t *)infop, infoCntp, NULL, NULL));
2998 }
2999
3000 kern_return_t
3001 mach_memory_info(
3002         host_priv_t             host,
3003         mach_zone_name_array_t  *namesp,
3004         mach_msg_type_number_t  *namesCntp,
3005         mach_zone_info_array_t  *infop,
3006         mach_msg_type_number_t  *infoCntp,
3007         mach_memory_info_array_t *memoryInfop,
3008         mach_msg_type_number_t   *memoryInfoCntp)
3009 {
3010         mach_zone_name_t        *names;
3011         vm_offset_t             names_addr;
3012         vm_size_t               names_size;
3013
3014         mach_zone_info_t        *info;
3015         vm_offset_t             info_addr;
3016         vm_size_t               info_size;
3017
3018         mach_memory_info_t      *memory_info;
3019         vm_offset_t             memory_info_addr;
3020         vm_size_t               memory_info_size;
3021         vm_size_t               memory_info_vmsize;
3022         unsigned int            num_sites;
3023
3024         unsigned int            max_zones, i;
3025         zone_t                  z;
3026         mach_zone_name_t        *zn;
3027         mach_zone_info_t        *zi;
3028         kern_return_t           kr;
3029
3030         vm_size_t               used;
3031         vm_map_copy_t           copy;
3032         uint64_t                zones_collectable_bytes = 0;
3033
3034         if (host == HOST_NULL)
3035                 return KERN_INVALID_HOST;
3036 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
3037         if (!PE_i_can_has_debugger(NULL))
3038                 return KERN_INVALID_HOST;
3039 #endif
3040
3041         /*
3042          *      We assume that zones aren't freed once allocated.
3043          *      We won't pick up any zones that are allocated later.
3044          */
3045
3046         simple_lock(&all_zones_lock);
3047         max_zones = (unsigned int)(num_zones);
3048         simple_unlock(&all_zones_lock);
3049
3050         names_size = round_page(max_zones * sizeof *names);
3051         kr = kmem_alloc_pageable(ipc_kernel_map,
3052                                  &names_addr, names_size, VM_KERN_MEMORY_IPC);
3053         if (kr != KERN_SUCCESS)
3054                 return kr;
3055         names = (mach_zone_name_t *) names_addr;
3056
3057         info_size = round_page(max_zones * sizeof *info);
3058         kr = kmem_alloc_pageable(ipc_kernel_map,
3059                                  &info_addr, info_size, VM_KERN_MEMORY_IPC);
3060         if (kr != KERN_SUCCESS) {
3061                 kmem_free(ipc_kernel_map,
3062                           names_addr, names_size);
3063                 return kr;
3064         }
3065         info = (mach_zone_info_t *) info_addr;
3066
3067         zn = &names[0];
3068         zi = &info[0];
3069
3070         for (i = 0; i < max_zones; i++) {
3071                 struct zone zcopy;
3072                 z = &(zone_array[i]);
3073                 assert(z != ZONE_NULL);
3074
3075                 lock_zone(z);
3076                 zcopy = *z;
3077                 unlock_zone(z);
3078
3079                 /* assuming here the name data is static */
3080                 (void) strncpy(zn->mzn_name, zcopy.zone_name,
3081                                sizeof zn->mzn_name);
3082                 zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
3083
3084                 zi->mzi_count = (uint64_t)zcopy.count;
3085                 zi->mzi_cur_size = ptoa_64(zcopy.page_count);
3086                 zi->mzi_max_size = (uint64_t)zcopy.max_size;
3087                 zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
3088                 zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
3089                 zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
3090                 zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
3091                 zi->mzi_collectable = (uint64_t)zcopy.collectable;
3092                 zones_collectable_bytes += ((uint64_t)zcopy.count_all_free_pages * PAGE_SIZE);
3093                 zn++;
3094                 zi++;
3095         }
3096
3097         used = max_zones * sizeof *names;
3098         if (used != names_size)
3099                 bzero((char *) (names_addr + used), names_size - used);
3100
3101         kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
3102                            (vm_map_size_t)used, TRUE, &copy);
3103         assert(kr == KERN_SUCCESS);
3104
3105         *namesp = (mach_zone_name_t *) copy;
3106         *namesCntp = max_zones;
3107
3108         used = max_zones * sizeof *info;
3109
3110         if (used != info_size)
3111                 bzero((char *) (info_addr + used), info_size - used);
3112
3113         kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
3114                            (vm_map_size_t)used, TRUE, &copy);
3115         assert(kr == KERN_SUCCESS);
3116
3117         *infop = (mach_zone_info_t *) copy;
3118         *infoCntp = max_zones;
3119
3120         num_sites = 0;
3121         memory_info_addr = 0;
3122
3123         if (memoryInfop && memoryInfoCntp)
3124         {
3125                 num_sites = VM_KERN_MEMORY_COUNT + VM_KERN_COUNTER_COUNT;
3126                 memory_info_size = num_sites * sizeof(*info);
3127                 memory_info_vmsize = round_page(memory_info_size);
3128                 kr = kmem_alloc_pageable(ipc_kernel_map,
3129                                          &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
3130                 if (kr != KERN_SUCCESS) {
3131                         kmem_free(ipc_kernel_map,
3132                                   names_addr, names_size);
3133                         kmem_free(ipc_kernel_map,
3134                                   info_addr, info_size);
3135                         return kr;
3136                 }
3137
3138                 kr = vm_map_wire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
3139                                      VM_PROT_READ|VM_PROT_WRITE|VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_IPC), FALSE);
3140                 assert(kr == KERN_SUCCESS);
3141
3142                 memory_info = (mach_memory_info_t *) memory_info_addr;
3143                 vm_page_diagnose(memory_info, num_sites, zones_collectable_bytes);
3144
3145                 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
3146                 assert(kr == KERN_SUCCESS);
3147
3148                 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
3149                                    (vm_map_size_t)memory_info_size, TRUE, &copy);
3150                 assert(kr == KERN_SUCCESS);
3151
3152                 *memoryInfop = (mach_memory_info_t *) copy;
3153                 *memoryInfoCntp = num_sites;
3154         }
3155
3156         return KERN_SUCCESS;
3157 }
3158
3159 kern_return_t
3160 mach_zone_force_gc(
3161         host_t host)
3162 {
3163
3164         if (host == HOST_NULL)
3165                 return KERN_INVALID_HOST;
3166
3167         consider_zone_gc();
3168
3169         return (KERN_SUCCESS);
3170 }
3171
3172 extern unsigned int stack_total;
3173 extern unsigned long long stack_allocs;
3174
3175 #if defined(__i386__) || defined (__x86_64__)
3176 extern unsigned int inuse_ptepages_count;
3177 extern long long alloc_ptepages_count;
3178 #endif
3179
3180 void zone_display_zprint()
3181 {
3182         unsigned int    i;
3183         zone_t          the_zone;
3184
3185         for (i = 0; i < num_zones; i++) {
3186                 the_zone = &(zone_array[i]);
3187                 if(the_zone->cur_size > (1024*1024)) {
3188                         printf("%.20s:\t%lu\n",the_zone->zone_name,(uintptr_t)the_zone->cur_size);
3189                 }
3190         }
3191         printf("Kernel Stacks:\t%lu\n",(uintptr_t)(kernel_stack_size * stack_total));
3192
3193 #if defined(__i386__) || defined (__x86_64__)
3194         printf("PageTables:\t%lu\n",(uintptr_t)(PAGE_SIZE * inuse_ptepages_count));
3195 #endif
3196
3197         printf("Kalloc.Large:\t%lu\n",(uintptr_t)kalloc_large_total);
3198 }
3199
3200 zone_t
3201 zone_find_largest(void)
3202 {
3203         unsigned int    i;
3204         unsigned int    max_zones;
3205         zone_t          the_zone;
3206         zone_t          zone_largest;
3207
3208         simple_lock(&all_zones_lock);
3209         max_zones = num_zones;
3210         simple_unlock(&all_zones_lock);
3211
3212         zone_largest = &(zone_array[0]);
3213         for (i = 0; i < max_zones; i++) {
3214                 the_zone = &(zone_array[i]);
3215                 if (the_zone->cur_size > zone_largest->cur_size) {
3216                         zone_largest = the_zone;
3217                 }
3218         }
3219         return zone_largest;
3220 }
3221
3222 #if     ZONE_DEBUG
3223
3224 /* should we care about locks here ? */
3225
/*
 * zone_in_use(z): true when the zone has a non-zero count, a non-null
 * free_elements, or a non-empty all_free / intermediate page queue; the
 * any_free_foreign queue is only considered when the zone allows foreign
 * memory.  The argument is parenthesized at every use so that
 * expressions such as &zone or ptr + 1 expand correctly; note that it is
 * still evaluated more than once.
 */
#define zone_in_use(z)  ( (z)->count || (z)->free_elements \
                                                  || !queue_empty(&(z)->pages.all_free) \
                                                  || !queue_empty(&(z)->pages.intermediate) \
                                                  || ((z)->allows_foreign && !queue_empty(&(z)->pages.any_free_foreign)))
3230
3231
3232 #endif  /* ZONE_DEBUG */
3233
3234
3235 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
3236
3237 #if DEBUG || DEVELOPMENT
3238
3239 static uintptr_t *
3240 zone_copy_all_allocations_inqueue(zone_t z, queue_head_t * queue, uintptr_t * elems)
3241 {
3242     struct zone_page_metadata *page_meta;
3243     vm_offset_t free, elements;
3244     vm_offset_t idx, numElements, freeCount, bytesAvail, metaSize;
3245
3246     queue_iterate(queue, page_meta, struct zone_page_metadata *, pages)
3247     {
3248         elements = get_zone_page(page_meta);
3249         bytesAvail = ptoa(page_meta->page_count);
3250         freeCount = 0;
3251         if (z->allows_foreign && !from_zone_map(elements, z->elem_size))
3252         {
3253             metaSize    = (sizeof(struct zone_page_metadata) + ZONE_ELEMENT_ALIGNMENT - 1) & ~(ZONE_ELEMENT_ALIGNMENT - 1);
3254             bytesAvail -= metaSize;
3255             elements   += metaSize;
3256         }
3257         numElements = bytesAvail / z->elem_size;
3258         // construct array of all possible elements
3259         for (idx = 0; idx < numElements; idx++)
3260         {
3261             elems[idx] = INSTANCE_PUT(elements + idx * z->elem_size);
3262         }
3263         // remove from the array all free elements
3264         free = (vm_offset_t)page_metadata_get_freelist(page_meta);
3265         while (free)
3266         {
3267             // find idx of free element
3268             for (idx = 0; (idx < numElements) && (elems[idx] != INSTANCE_PUT(free)); idx++)  {}
3269             assert(idx < numElements);
3270             // remove it
3271             bcopy(&elems[idx + 1], &elems[idx], (numElements - (idx + 1)) * sizeof(elems[0]));
3272             numElements--;
3273             freeCount++;
3274             // next free element
3275             vm_offset_t *primary = (vm_offset_t *) free;
3276             free = *primary ^ zp_nopoison_cookie;
3277         }
3278         elems += numElements;
3279     }
3280
3281     return (elems);
3282 }
3283
3284 kern_return_t
3285 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
3286 {
3287         uintptr_t         zbt[MAX_ZTRACE_DEPTH];
3288     zone_t        zone;
3289     uintptr_t *   array;
3290     uintptr_t *   next;
3291     uintptr_t     element, bt;
3292     uint32_t      idx, count, found;
3293     uint32_t      btidx, btcount, nobtcount, btfound;
3294     uint32_t      elemSize;
3295     uint64_t      maxElems;
3296     kern_return_t kr;
3297
3298     for (idx = 0; idx < num_zones; idx++)
3299     {
3300         if (!strncmp(zoneName, zone_array[idx].zone_name, nameLen)) break;
3301     }
3302     if (idx >= num_zones) return (KERN_INVALID_NAME);
3303     zone = &zone_array[idx];
3304
3305     elemSize = (uint32_t) zone->elem_size;
3306     maxElems = ptoa(zone->page_count) / elemSize;
3307
3308     if ((zone->alloc_size % elemSize)
3309       && !leak_scan_debug_flag) return (KERN_INVALID_CAPABILITY);
3310
3311     kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
3312                             maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG);
3313     if (KERN_SUCCESS != kr) return (kr);
3314
3315     lock_zone(zone);
3316
3317     next = array;
3318     next = zone_copy_all_allocations_inqueue(zone, &zone->pages.any_free_foreign, next);
3319     next = zone_copy_all_allocations_inqueue(zone, &zone->pages.intermediate,     next);
3320     next = zone_copy_all_allocations_inqueue(zone, &zone->pages.all_used,         next);
3321     count = (uint32_t)(next - array);
3322
3323     unlock_zone(zone);
3324
3325     zone_leaks_scan(array, count, (uint32_t)zone->elem_size, &found);
3326     assert(found <= count);
3327
3328     for (idx = 0; idx < count; idx++)
3329     {
3330         element = array[idx];
3331         if (kInstanceFlagReferenced & element) continue;
3332         element = INSTANCE_PUT(element) & ~kInstanceFlags;
3333     }
3334
3335     if (zone->zlog_btlog && !corruption_debug_flag)
3336     {
3337         // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
3338         btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
3339     }
3340
3341     for (nobtcount = idx = 0; idx < count; idx++)
3342     {
3343         element = array[idx];
3344         if (!element)                          continue;
3345         if (kInstanceFlagReferenced & element) continue;
3346         element = INSTANCE_PUT(element) & ~kInstanceFlags;
3347
3348         // see if we can find any backtrace left in the element
3349         btcount = (typeof(btcount)) (zone->elem_size / sizeof(uintptr_t));
3350         if (btcount >= MAX_ZTRACE_DEPTH) btcount = MAX_ZTRACE_DEPTH - 1;
3351         for (btfound = btidx = 0; btidx < btcount; btidx++)
3352         {
3353             bt = ((uintptr_t *)element)[btcount - 1 - btidx];
3354             if (!VM_KERNEL_IS_SLID(bt)) break;
3355             zbt[btfound++] = bt;
3356         }
3357         if (btfound) (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
3358         else         nobtcount++;
3359     }
3360     if (nobtcount)
3361     {
3362         // fake backtrace when we found nothing
3363         zbt[0] = (uintptr_t) &zalloc;
3364         (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
3365     }
3366
3367     kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
3368
3369     return (KERN_SUCCESS);
3370 }
3371
3372 void
3373 kern_wired_diagnose(void)
3374 {
3375     unsigned int       count = VM_KERN_MEMORY_COUNT + VM_KERN_COUNTER_COUNT;
3376     mach_memory_info_t info[count];
3377     unsigned int       idx;
3378     uint64_t           total_zone, total_wired, top_wired, osfmk_wired;
3379
3380     if (KERN_SUCCESS != vm_page_diagnose(info, count, 0)) return;
3381
3382     total_zone = total_wired = top_wired = osfmk_wired = 0;
3383     for (idx = 0; idx < num_zones; idx++)
3384     {
3385         total_zone += ptoa_64(zone_array[idx].page_count);
3386     }
3387     total_wired = total_zone;
3388
3389     for (idx = 0; idx < count; idx++)
3390     {
3391         if (VM_KERN_COUNT_WIRED  == info[idx].site)   top_wired   = info[idx].size;
3392         if (VM_KERN_MEMORY_OSFMK == info[idx].site)   osfmk_wired = info[idx].size;
3393         if (VM_KERN_SITE_HIDE    &  info[idx].flags)  continue;
3394         if (!(VM_KERN_SITE_WIRED &  info[idx].flags)) continue;
3395         total_wired += info[idx].size;
3396     }
3397
3398     printf("top 0x%qx, total 0x%qx, zone 0x%qx, osfmk 0x%qx\n",
3399            top_wired, total_wired, total_zone, osfmk_wired);
3400 }
3401
3402 boolean_t
3403 kdp_is_in_zone(void *addr, const char *zone_name)
3404 {
3405         zone_t z;
3406         return (zone_element_size(addr, &z) && !strcmp(z->zone_name, zone_name));
3407 }
3408
3409 #endif /* DEBUG || DEVELOPMENT */