1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_page.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Resident memory management module.
63 */
64
65 #include <debug.h>
66 #include <libkern/OSAtomic.h>
67
68 #include <mach/clock_types.h>
69 #include <mach/vm_prot.h>
70 #include <mach/vm_statistics.h>
71 #include <mach/sdt.h>
72 #include <kern/counters.h>
73 #include <kern/sched_prim.h>
74 #include <kern/task.h>
75 #include <kern/thread.h>
76 #include <kern/kalloc.h>
77 #include <kern/zalloc.h>
78 #include <kern/xpr.h>
79 #include <vm/pmap.h>
80 #include <vm/vm_init.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_pageout.h>
84 #include <vm/vm_kern.h> /* kernel_memory_allocate() */
85 #include <kern/misc_protos.h>
86 #include <zone_debug.h>
87 #include <vm/cpm.h>
88 #include <pexpert/pexpert.h>
89
90 #include <vm/vm_protos.h>
91 #include <vm/memory_object.h>
92 #include <vm/vm_purgeable_internal.h>
93 #include <vm/vm_compressor.h>
94
95 #include <IOKit/IOHibernatePrivate.h>
96
97 #include <sys/kdebug.h>
98
99 boolean_t hibernate_cleaning_in_progress = FALSE;
100 boolean_t vm_page_free_verify = TRUE;
101
102 uint32_t vm_lopage_free_count = 0;
103 uint32_t vm_lopage_free_limit = 0;
104 uint32_t vm_lopage_lowater = 0;
105 boolean_t vm_lopage_refill = FALSE;
106 boolean_t vm_lopage_needed = FALSE;
107
108 lck_mtx_ext_t vm_page_queue_lock_ext;
109 lck_mtx_ext_t vm_page_queue_free_lock_ext;
110 lck_mtx_ext_t vm_purgeable_queue_lock_ext;
111
112 int speculative_age_index = 0;
113 int speculative_steal_index = 0;
114 struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1];
115
116
117 __private_extern__ void vm_page_init_lck_grp(void);
118
119 static void vm_page_free_prepare(vm_page_t page);
120 static vm_page_t vm_page_grab_fictitious_common(ppnum_t phys_addr);
121
122
123
124
125 /*
126 * Associated with page of user-allocatable memory is a
127 * page structure.
128 */
129
130 /*
131 * These variables record the values returned by vm_page_bootstrap,
132 * for debugging purposes. The implementation of pmap_steal_memory
133 * and pmap_startup here also uses them internally.
134 */
135
136 vm_offset_t virtual_space_start;
137 vm_offset_t virtual_space_end;
138 uint32_t vm_page_pages;
139
140 /*
141 * The vm_page_lookup() routine, which provides for fast
142 * (virtual memory object, offset) to page lookup, employs
143 * the following hash table. The vm_page_{insert,remove}
144 * routines install and remove associations in the table.
145 * [This table is often called the virtual-to-physical,
146 * or VP, table.]
147 */
148 typedef struct {
149 vm_page_t pages;
150 #if MACH_PAGE_HASH_STATS
151 int cur_count; /* current count */
152 int hi_count; /* high water mark */
153 #endif /* MACH_PAGE_HASH_STATS */
154 } vm_page_bucket_t;
155
156
157 #define BUCKETS_PER_LOCK 16
158
159 vm_page_bucket_t *vm_page_buckets; /* Array of buckets */
160 unsigned int vm_page_bucket_count = 0; /* How big is array? */
161 unsigned int vm_page_hash_mask; /* Mask for hash function */
162 unsigned int vm_page_hash_shift; /* Shift for hash function */
163 uint32_t vm_page_bucket_hash; /* Basic bucket hash */
164 unsigned int vm_page_bucket_lock_count = 0; /* How big is array of locks? */
165
166 lck_spin_t *vm_page_bucket_locks;
167
168 #if VM_PAGE_BUCKETS_CHECK
169 boolean_t vm_page_buckets_check_ready = FALSE;
170 #if VM_PAGE_FAKE_BUCKETS
171 vm_page_bucket_t *vm_page_fake_buckets; /* decoy buckets */
172 vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
173 #endif /* VM_PAGE_FAKE_BUCKETS */
174 #endif /* VM_PAGE_BUCKETS_CHECK */
175
176 #if MACH_PAGE_HASH_STATS
177 /* This routine is only for debug. It is intended to be called by
178 * hand by a developer using a kernel debugger. This routine prints
179 * out vm_page_hash table statistics to the kernel debug console.
180 */
181 void
182 hash_debug(void)
183 {
184 int i;
185 int numbuckets = 0;
186 int highsum = 0;
187 int maxdepth = 0;
188
189 for (i = 0; i < vm_page_bucket_count; i++) {
190 if (vm_page_buckets[i].hi_count) {
191 numbuckets++;
192 highsum += vm_page_buckets[i].hi_count;
193 if (vm_page_buckets[i].hi_count > maxdepth)
194 maxdepth = vm_page_buckets[i].hi_count;
195 }
196 }
197 printf("Total number of buckets: %d\n", vm_page_bucket_count);
198 printf("Number used buckets: %d = %d%%\n",
199 numbuckets, 100*numbuckets/vm_page_bucket_count);
200 printf("Number unused buckets: %d = %d%%\n",
201 vm_page_bucket_count - numbuckets,
202 100*(vm_page_bucket_count-numbuckets)/vm_page_bucket_count);
203 printf("Sum of bucket max depth: %d\n", highsum);
204 printf("Average bucket depth: %d.%2d\n",
205 highsum/vm_page_bucket_count,
206 highsum%vm_page_bucket_count);
207 printf("Maximum bucket depth: %d\n", maxdepth);
208 }
209 #endif /* MACH_PAGE_HASH_STATS */
210
211 /*
212 * The virtual page size is currently implemented as a runtime
213 * variable, but is constant once initialized using vm_set_page_size.
214 * This initialization must be done in the machine-dependent
215 * bootstrap sequence, before calling other machine-independent
216 * initializations.
217 *
218 * All references to the virtual page size outside this
219 * module must use the PAGE_SIZE, PAGE_MASK and PAGE_SHIFT
220 * constants.
221 */
222 vm_size_t page_size = PAGE_SIZE;
223 vm_size_t page_mask = PAGE_MASK;
224 int page_shift = PAGE_SHIFT;
225
226 /*
227 * Resident page structures are initialized from
228 * a template (see vm_page_alloc).
229 *
230 * When adding a new field to the virtual memory
231 * object structure, be sure to add initialization
232 * (see vm_page_bootstrap).
233 */
234 struct vm_page vm_page_template;
235
236 vm_page_t vm_pages = VM_PAGE_NULL;
237 unsigned int vm_pages_count = 0;
238 ppnum_t vm_page_lowest = 0;
239
240 /*
241 * Resident pages that represent real memory
242 * are allocated from a set of free lists,
243 * one per color.
244 */
245 unsigned int vm_colors;
246 unsigned int vm_color_mask; /* mask is == (vm_colors-1) */
247 unsigned int vm_cache_geometry_colors = 0; /* set by hw dependent code during startup */
248 queue_head_t vm_page_queue_free[MAX_COLORS];
249 unsigned int vm_page_free_wanted;
250 unsigned int vm_page_free_wanted_privileged;
251 unsigned int vm_page_free_count;
252 unsigned int vm_page_fictitious_count;
253
254 unsigned int vm_page_free_count_minimum; /* debugging */
255
256 /*
257 * Occasionally, the virtual memory system uses
258 * resident page structures that do not refer to
259 * real pages, for example to leave a page with
260 * important state information in the VP table.
261 *
262 * These page structures are allocated the way
263 * most other kernel structures are.
264 */
265 zone_t vm_page_zone;
266 vm_locks_array_t vm_page_locks;
267 decl_lck_mtx_data(,vm_page_alloc_lock)
268 lck_mtx_ext_t vm_page_alloc_lock_ext;
269
270 unsigned int io_throttle_zero_fill;
271
272 unsigned int vm_page_local_q_count = 0;
273 unsigned int vm_page_local_q_soft_limit = 250;
274 unsigned int vm_page_local_q_hard_limit = 500;
275 struct vplq *vm_page_local_q = NULL;
276
277 /* N.B. Guard and fictitious pages must not
278 * be assigned a zero phys_page value.
279 */
280 /*
281 * Fictitious pages don't have a physical address,
282 * but we must initialize phys_page to something.
283 * For debugging, this should be a strange value
284 * that the pmap module can recognize in assertions.
285 */
286 ppnum_t vm_page_fictitious_addr = (ppnum_t) -1;
287
288 /*
289 * Guard pages are not accessible so they don't
290 * need a physical address, but we need to enter
291 * one in the pmap.
292 * Let's make it recognizable and make sure that
293 * we don't use a real physical page with that
294 * physical address.
295 */
296 ppnum_t vm_page_guard_addr = (ppnum_t) -2;
297
298 /*
299 * Resident page structures are also chained on
300 * queues that are used by the page replacement
301 * system (pageout daemon). These queues are
302 * defined here, but are shared by the pageout
303 * module. The inactive queue is broken into
304 * file backed and anonymous for convenience as the
305 * pageout daemon often assignes a higher
306 * importance to anonymous pages (less likely to pick)
307 */
308 queue_head_t vm_page_queue_active;
309 queue_head_t vm_page_queue_inactive;
310 queue_head_t vm_page_queue_anonymous; /* inactive memory queue for anonymous pages */
311 queue_head_t vm_page_queue_throttled;
312
313 unsigned int vm_page_active_count;
314 unsigned int vm_page_inactive_count;
315 unsigned int vm_page_anonymous_count;
316 unsigned int vm_page_throttled_count;
317 unsigned int vm_page_speculative_count;
318 unsigned int vm_page_wire_count;
319 unsigned int vm_page_wire_count_initial;
320 unsigned int vm_page_gobble_count = 0;
321 unsigned int vm_page_wire_count_warning = 0;
322 unsigned int vm_page_gobble_count_warning = 0;
323
324 unsigned int vm_page_purgeable_count = 0; /* # of pages purgeable now */
325 unsigned int vm_page_purgeable_wired_count = 0; /* # of purgeable pages that are wired now */
326 uint64_t vm_page_purged_count = 0; /* total count of purged pages */
327
328 unsigned int vm_page_external_count = 0;
329 unsigned int vm_page_internal_count = 0;
330 unsigned int vm_page_pageable_external_count = 0;
331 unsigned int vm_page_pageable_internal_count = 0;
332
333 #if DEVELOPMENT || DEBUG
334 unsigned int vm_page_speculative_recreated = 0;
335 unsigned int vm_page_speculative_created = 0;
336 unsigned int vm_page_speculative_used = 0;
337 #endif
338
339 queue_head_t vm_page_queue_cleaned;
340
341 unsigned int vm_page_cleaned_count = 0;
342 unsigned int vm_pageout_enqueued_cleaned = 0;
343
344 uint64_t max_valid_dma_address = 0xffffffffffffffffULL;
345 ppnum_t max_valid_low_ppnum = 0xffffffff;
346
347
348 /*
349 * Several page replacement parameters are also
350 * shared with this module, so that page allocation
351 * (done here in vm_page_alloc) can trigger the
352 * pageout daemon.
353 */
354 unsigned int vm_page_free_target = 0;
355 unsigned int vm_page_free_min = 0;
356 unsigned int vm_page_throttle_limit = 0;
357 uint32_t vm_page_creation_throttle = 0;
358 unsigned int vm_page_inactive_target = 0;
359 unsigned int vm_page_anonymous_min = 0;
360 unsigned int vm_page_inactive_min = 0;
361 unsigned int vm_page_free_reserved = 0;
362 unsigned int vm_page_throttle_count = 0;
363
364
365 /*
366 * The VM system has a couple of heuristics for deciding
367 * that pages are "uninteresting" and should be placed
368 * on the inactive queue as likely candidates for replacement.
369 * These variables let the heuristics be controlled at run-time
370 * to make experimentation easier.
371 */
372
373 boolean_t vm_page_deactivate_hint = TRUE;
374
375 struct vm_page_stats_reusable vm_page_stats_reusable;
376
377 /*
378 * vm_set_page_size:
379 *
380 * Sets the page size, perhaps based upon the memory
381 * size. Must be called before any use of page-size
382 * dependent functions.
383 *
384 * Sets page_shift and page_mask from page_size.
385 */
386 void
387 vm_set_page_size(void)
388 {
389 page_mask = page_size - 1;
390
391 if ((page_mask & page_size) != 0)
392 panic("vm_set_page_size: page size not a power of two");
393
394 for (page_shift = 0; ; page_shift++)
395 if ((1U << page_shift) == page_size)
396 break;
397 }
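/*
 * Worked example (illustrative only, not part of the original source):
 * with page_size == 4096, page_mask becomes 0xFFF and the loop above
 * stops at page_shift == 12, since (1U << 12) == 4096.  A size that is
 * not a power of two, e.g. 4095, would leave bits set in
 * (page_mask & page_size) and hit the panic instead.
 */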
398
399
400 /* Called once during startup, once the cache geometry is known.
401 */
402 static void
403 vm_page_set_colors( void )
404 {
405 unsigned int n, override;
406
407 if ( PE_parse_boot_argn("colors", &override, sizeof (override)) ) /* colors specified as a boot-arg? */
408 n = override;
409 else if ( vm_cache_geometry_colors ) /* do we know what the cache geometry is? */
410 n = vm_cache_geometry_colors;
411 else n = DEFAULT_COLORS; /* use default if all else fails */
412
413 if ( n == 0 )
414 n = 1;
415 if ( n > MAX_COLORS )
416 n = MAX_COLORS;
417
418 /* the count must be a power of 2 */
419 if ( ( n & (n - 1)) != 0 )
420 panic("vm_page_set_colors");
421
422 vm_colors = n;
423 vm_color_mask = n - 1;
424 }
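/*
 * Illustrative note (an assumption about how these values are used
 * elsewhere in this module, not shown in this section): if the cache
 * geometry reports 8 colors, vm_colors == 8 and vm_color_mask == 7,
 * so a page's free-queue color can be picked with a cheap
 * (phys_page & vm_color_mask); the power-of-two check above is what
 * makes that masking valid.
 */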
425
426
427 lck_grp_t vm_page_lck_grp_free;
428 lck_grp_t vm_page_lck_grp_queue;
429 lck_grp_t vm_page_lck_grp_local;
430 lck_grp_t vm_page_lck_grp_purge;
431 lck_grp_t vm_page_lck_grp_alloc;
432 lck_grp_t vm_page_lck_grp_bucket;
433 lck_grp_attr_t vm_page_lck_grp_attr;
434 lck_attr_t vm_page_lck_attr;
435
436
437 __private_extern__ void
438 vm_page_init_lck_grp(void)
439 {
440 /*
441 * initialize the vm_page lock world
442 */
443 lck_grp_attr_setdefault(&vm_page_lck_grp_attr);
444 lck_grp_init(&vm_page_lck_grp_free, "vm_page_free", &vm_page_lck_grp_attr);
445 lck_grp_init(&vm_page_lck_grp_queue, "vm_page_queue", &vm_page_lck_grp_attr);
446 lck_grp_init(&vm_page_lck_grp_local, "vm_page_queue_local", &vm_page_lck_grp_attr);
447 lck_grp_init(&vm_page_lck_grp_purge, "vm_page_purge", &vm_page_lck_grp_attr);
448 lck_grp_init(&vm_page_lck_grp_alloc, "vm_page_alloc", &vm_page_lck_grp_attr);
449 lck_grp_init(&vm_page_lck_grp_bucket, "vm_page_bucket", &vm_page_lck_grp_attr);
450 lck_attr_setdefault(&vm_page_lck_attr);
451 lck_mtx_init_ext(&vm_page_alloc_lock, &vm_page_alloc_lock_ext, &vm_page_lck_grp_alloc, &vm_page_lck_attr);
452
453 vm_compressor_init_locks();
454 }
455
456 void
457 vm_page_init_local_q()
458 {
459 unsigned int num_cpus;
460 unsigned int i;
461 struct vplq *t_local_q;
462
463 num_cpus = ml_get_max_cpus();
464
465 /*
466 * no point in this for a uni-processor system
467 */
468 if (num_cpus >= 2) {
469 t_local_q = (struct vplq *)kalloc(num_cpus * sizeof(struct vplq));
470
471 for (i = 0; i < num_cpus; i++) {
472 struct vpl *lq;
473
474 lq = &t_local_q[i].vpl_un.vpl;
475 VPL_LOCK_INIT(lq, &vm_page_lck_grp_local, &vm_page_lck_attr);
476 queue_init(&lq->vpl_queue);
477 lq->vpl_count = 0;
478 lq->vpl_internal_count = 0;
479 lq->vpl_external_count = 0;
480 }
481 vm_page_local_q_count = num_cpus;
482
483 vm_page_local_q = (struct vplq *)t_local_q;
484 }
485 }
486
487
488 /*
489 * vm_page_bootstrap:
490 *
491 * Initializes the resident memory module.
492 *
493 * Allocates memory for the page cells, and
494 * for the object/offset-to-page hash table headers.
495 * Each page cell is initialized and placed on the free list.
496 * Returns the range of available kernel virtual memory.
497 */
498
499 void
500 vm_page_bootstrap(
501 vm_offset_t *startp,
502 vm_offset_t *endp)
503 {
504 register vm_page_t m;
505 unsigned int i;
506 unsigned int log1;
507 unsigned int log2;
508 unsigned int size;
509
510 /*
511 * Initialize the vm_page template.
512 */
513
514 m = &vm_page_template;
515 bzero(m, sizeof (*m));
516
517 m->pageq.next = NULL;
518 m->pageq.prev = NULL;
519 m->listq.next = NULL;
520 m->listq.prev = NULL;
521 m->next = VM_PAGE_NULL;
522
523 m->object = VM_OBJECT_NULL; /* reset later */
524 m->offset = (vm_object_offset_t) -1; /* reset later */
525
526 m->wire_count = 0;
527 m->local = FALSE;
528 m->inactive = FALSE;
529 m->active = FALSE;
530 m->pageout_queue = FALSE;
531 m->speculative = FALSE;
532 m->laundry = FALSE;
533 m->free = FALSE;
534 m->reference = FALSE;
535 m->gobbled = FALSE;
536 m->private = FALSE;
537 m->throttled = FALSE;
538 m->__unused_pageq_bits = 0;
539
540 m->phys_page = 0; /* reset later */
541
542 m->busy = TRUE;
543 m->wanted = FALSE;
544 m->tabled = FALSE;
545 m->hashed = FALSE;
546 m->fictitious = FALSE;
547 m->pmapped = FALSE;
548 m->wpmapped = FALSE;
549 m->pageout = FALSE;
550 m->absent = FALSE;
551 m->error = FALSE;
552 m->dirty = FALSE;
553 m->cleaning = FALSE;
554 m->precious = FALSE;
555 m->clustered = FALSE;
556 m->overwriting = FALSE;
557 m->restart = FALSE;
558 m->unusual = FALSE;
559 m->encrypted = FALSE;
560 m->encrypted_cleaning = FALSE;
561 m->cs_validated = FALSE;
562 m->cs_tainted = FALSE;
563 m->no_cache = FALSE;
564 m->reusable = FALSE;
565 m->slid = FALSE;
566 m->was_dirty = FALSE;
567 m->xpmapped = FALSE;
568 m->compressor = FALSE;
569 m->written_by_kernel = FALSE;
570 m->__unused_object_bits = 0;
571
572 /*
573 * Initialize the page queues.
574 */
575 vm_page_init_lck_grp();
576
577 lck_mtx_init_ext(&vm_page_queue_free_lock, &vm_page_queue_free_lock_ext, &vm_page_lck_grp_free, &vm_page_lck_attr);
578 lck_mtx_init_ext(&vm_page_queue_lock, &vm_page_queue_lock_ext, &vm_page_lck_grp_queue, &vm_page_lck_attr);
579 lck_mtx_init_ext(&vm_purgeable_queue_lock, &vm_purgeable_queue_lock_ext, &vm_page_lck_grp_purge, &vm_page_lck_attr);
580
581 for (i = 0; i < PURGEABLE_Q_TYPE_MAX; i++) {
582 int group;
583
584 purgeable_queues[i].token_q_head = 0;
585 purgeable_queues[i].token_q_tail = 0;
586 for (group = 0; group < NUM_VOLATILE_GROUPS; group++)
587 queue_init(&purgeable_queues[i].objq[group]);
588
589 purgeable_queues[i].type = i;
590 purgeable_queues[i].new_pages = 0;
591 #if MACH_ASSERT
592 purgeable_queues[i].debug_count_tokens = 0;
593 purgeable_queues[i].debug_count_objects = 0;
594 #endif
595 };
596
597 for (i = 0; i < MAX_COLORS; i++ )
598 queue_init(&vm_page_queue_free[i]);
599
600 queue_init(&vm_lopage_queue_free);
601 queue_init(&vm_page_queue_active);
602 queue_init(&vm_page_queue_inactive);
603 queue_init(&vm_page_queue_cleaned);
604 queue_init(&vm_page_queue_throttled);
605 queue_init(&vm_page_queue_anonymous);
606
607 for ( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) {
608 queue_init(&vm_page_queue_speculative[i].age_q);
609
610 vm_page_queue_speculative[i].age_ts.tv_sec = 0;
611 vm_page_queue_speculative[i].age_ts.tv_nsec = 0;
612 }
613 vm_page_free_wanted = 0;
614 vm_page_free_wanted_privileged = 0;
615
616 vm_page_set_colors();
617
618
619 /*
620 * Steal memory for the map and zone subsystems.
621 */
622 zone_steal_memory();
623 vm_map_steal_memory();
624
625 /*
626 * Allocate (and initialize) the virtual-to-physical
627 * table hash buckets.
628 *
629 * The number of buckets should be a power of two to
630 * get a good hash function. The following computation
631 * chooses the first power of two that is greater
632 * than the number of physical pages in the system.
633 */
634
635 if (vm_page_bucket_count == 0) {
636 unsigned int npages = pmap_free_pages();
637
638 vm_page_bucket_count = 1;
639 while (vm_page_bucket_count < npages)
640 vm_page_bucket_count <<= 1;
641 }
642 vm_page_bucket_lock_count = (vm_page_bucket_count + BUCKETS_PER_LOCK - 1) / BUCKETS_PER_LOCK;
643
644 vm_page_hash_mask = vm_page_bucket_count - 1;
645
646 /*
647 * Calculate object shift value for hashing algorithm:
648 * O = log2(sizeof(struct vm_object))
649 * B = log2(vm_page_bucket_count)
650 * hash shifts the object left by
651 * B/2 - O
652 */
653 size = vm_page_bucket_count;
654 for (log1 = 0; size > 1; log1++)
655 size /= 2;
656 size = sizeof(struct vm_object);
657 for (log2 = 0; size > 1; log2++)
658 size /= 2;
659 vm_page_hash_shift = log1/2 - log2 + 1;
660
661 vm_page_bucket_hash = 1 << ((log1 + 1) >> 1); /* Get (ceiling of sqrt of table size) */
662 vm_page_bucket_hash |= 1 << ((log1 + 1) >> 2); /* Get (ceiling of quadroot of table size) */
663 vm_page_bucket_hash |= 1; /* Set bit and add 1 - always must be 1 to ensure a unique series */
664
665 if (vm_page_hash_mask & vm_page_bucket_count)
666 printf("vm_page_bootstrap: WARNING -- strange page hash\n");
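/*
 * Worked example (illustrative, assuming about a million -- at most
 * 2^20 -- physical pages): vm_page_bucket_count rounds up to 2^20, so
 * vm_page_hash_mask == 0xFFFFF and, with BUCKETS_PER_LOCK == 16,
 * vm_page_bucket_lock_count == 65536.  With log1 == 20 the two shifts
 * above set bits 10 and 5 of vm_page_bucket_hash, and the final "|= 1"
 * keeps the multiplier odd, as the comment above requires for a
 * unique series.
 */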
667
668 #if VM_PAGE_BUCKETS_CHECK
669 #if VM_PAGE_FAKE_BUCKETS
670 /*
671 * Allocate a decoy set of page buckets, to detect
672 * any stomping there.
673 */
674 vm_page_fake_buckets = (vm_page_bucket_t *)
675 pmap_steal_memory(vm_page_bucket_count *
676 sizeof(vm_page_bucket_t));
677 vm_page_fake_buckets_start = (vm_map_offset_t) vm_page_fake_buckets;
678 vm_page_fake_buckets_end =
679 vm_map_round_page((vm_page_fake_buckets_start +
680 (vm_page_bucket_count *
681 sizeof (vm_page_bucket_t))),
682 PAGE_MASK);
683 char *cp;
684 for (cp = (char *)vm_page_fake_buckets_start;
685 cp < (char *)vm_page_fake_buckets_end;
686 cp++) {
687 *cp = 0x5a;
688 }
689 #endif /* VM_PAGE_FAKE_BUCKETS */
690 #endif /* VM_PAGE_BUCKETS_CHECK */
691
692 vm_page_buckets = (vm_page_bucket_t *)
693 pmap_steal_memory(vm_page_bucket_count *
694 sizeof(vm_page_bucket_t));
695
696 vm_page_bucket_locks = (lck_spin_t *)
697 pmap_steal_memory(vm_page_bucket_lock_count *
698 sizeof(lck_spin_t));
699
700 for (i = 0; i < vm_page_bucket_count; i++) {
701 register vm_page_bucket_t *bucket = &vm_page_buckets[i];
702
703 bucket->pages = VM_PAGE_NULL;
704 #if MACH_PAGE_HASH_STATS
705 bucket->cur_count = 0;
706 bucket->hi_count = 0;
707 #endif /* MACH_PAGE_HASH_STATS */
708 }
709
710 for (i = 0; i < vm_page_bucket_lock_count; i++)
711 lck_spin_init(&vm_page_bucket_locks[i], &vm_page_lck_grp_bucket, &vm_page_lck_attr);
712
713 #if VM_PAGE_BUCKETS_CHECK
714 vm_page_buckets_check_ready = TRUE;
715 #endif /* VM_PAGE_BUCKETS_CHECK */
716
717 /*
718 * Machine-dependent code allocates the resident page table.
719 * It uses vm_page_init to initialize the page frames.
720 * The code also returns to us the virtual space available
721 * to the kernel. We don't trust the pmap module
722 * to get the alignment right.
723 */
724
725 pmap_startup(&virtual_space_start, &virtual_space_end);
726 virtual_space_start = round_page(virtual_space_start);
727 virtual_space_end = trunc_page(virtual_space_end);
728
729 *startp = virtual_space_start;
730 *endp = virtual_space_end;
731
732 /*
733 * Compute the initial "wire" count.
734 * Up until now, the pages which have been set aside are not under
735 * the VM system's control, so although they aren't explicitly
736 * wired, they nonetheless can't be moved. At this moment,
737 * all VM managed pages are "free", courtesy of pmap_startup.
738 */
739 assert((unsigned int) atop_64(max_mem) == atop_64(max_mem));
740 vm_page_wire_count = ((unsigned int) atop_64(max_mem)) - vm_page_free_count - vm_lopage_free_count; /* initial value */
741 vm_page_wire_count_initial = vm_page_wire_count;
742 vm_page_free_count_minimum = vm_page_free_count;
743
744 printf("vm_page_bootstrap: %d free pages and %d wired pages\n",
745 vm_page_free_count, vm_page_wire_count);
746
747 simple_lock_init(&vm_paging_lock, 0);
748 }
749
750 #ifndef MACHINE_PAGES
751 /*
752 * We implement pmap_steal_memory and pmap_startup with the help
753 * of two simpler functions, pmap_virtual_space and pmap_next_page.
754 */
755
756 void *
757 pmap_steal_memory(
758 vm_size_t size)
759 {
760 vm_offset_t addr, vaddr;
761 ppnum_t phys_page;
762
763 /*
764 * We round the size up to a multiple of the pointer size.
765 */
766
767 size = (size + sizeof (void *) - 1) &~ (sizeof (void *) - 1);
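	/*
	 * e.g. on LP64, sizeof (void *) == 8, so a 12-byte request is
	 * rounded up to 16 here; a size that is already pointer-aligned
	 * is left unchanged.
	 */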
768
769 /*
770 * If this is the first call to pmap_steal_memory,
771 * we have to initialize ourself.
772 */
773
774 if (virtual_space_start == virtual_space_end) {
775 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
776
777 /*
778 * The initial values must be aligned properly, and
779 * we don't trust the pmap module to do it right.
780 */
781
782 virtual_space_start = round_page(virtual_space_start);
783 virtual_space_end = trunc_page(virtual_space_end);
784 }
785
786 /*
787 * Allocate virtual memory for this request.
788 */
789
790 addr = virtual_space_start;
791 virtual_space_start += size;
792
793 //kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size); /* (TEST/DEBUG) */
794
795 /*
796 * Allocate and map physical pages to back new virtual pages.
797 */
798
799 for (vaddr = round_page(addr);
800 vaddr < addr + size;
801 vaddr += PAGE_SIZE) {
802
803 if (!pmap_next_page_hi(&phys_page))
804 panic("pmap_steal_memory");
805
806 /*
807 * XXX Logically, these mappings should be wired,
808 * but some pmap modules barf if they are.
809 */
810 #if defined(__LP64__)
811 pmap_pre_expand(kernel_pmap, vaddr);
812 #endif
813
814 pmap_enter(kernel_pmap, vaddr, phys_page,
815 VM_PROT_READ|VM_PROT_WRITE, VM_PROT_NONE,
816 VM_WIMG_USE_DEFAULT, FALSE);
817 /*
818 * Account for newly stolen memory
819 */
820 vm_page_wire_count++;
821
822 }
823
824 return (void *) addr;
825 }
826
827 void
828 pmap_startup(
829 vm_offset_t *startp,
830 vm_offset_t *endp)
831 {
832 unsigned int i, npages, pages_initialized, fill, fillval;
833 ppnum_t phys_page;
834 addr64_t tmpaddr;
835
836 /*
837 * We calculate how many page frames we will have
838 * and then allocate the page structures in one chunk.
839 */
840
841 tmpaddr = (addr64_t)pmap_free_pages() * (addr64_t)PAGE_SIZE; /* Get the amount of memory left */
842 tmpaddr = tmpaddr + (addr64_t)(round_page(virtual_space_start) - virtual_space_start); /* Account for any slop */
843 npages = (unsigned int)(tmpaddr / (addr64_t)(PAGE_SIZE + sizeof(*vm_pages))); /* How many pages we can cover, leaving room for their vm_page_ts */
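	/*
	 * Rationale for the divisor above (descriptive comment added for
	 * clarity): each page we end up managing costs PAGE_SIZE bytes of
	 * physical memory plus one struct vm_page to describe it, so
	 * dividing by (PAGE_SIZE + sizeof(*vm_pages)) leaves room for the
	 * vm_pages array stolen just below.
	 */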
844
845 vm_pages = (vm_page_t) pmap_steal_memory(npages * sizeof *vm_pages);
846
847 /*
848 * Initialize the page frames.
849 */
850 for (i = 0, pages_initialized = 0; i < npages; i++) {
851 if (!pmap_next_page(&phys_page))
852 break;
853 if (pages_initialized == 0 || phys_page < vm_page_lowest)
854 vm_page_lowest = phys_page;
855
856 vm_page_init(&vm_pages[i], phys_page, FALSE);
857 vm_page_pages++;
858 pages_initialized++;
859 }
860 vm_pages_count = pages_initialized;
861
862 /*
863 * Check if we want to initialize pages to a known value
864 */
865 fill = 0; /* Assume no fill */
866 if (PE_parse_boot_argn("fill", &fillval, sizeof (fillval))) fill = 1; /* Set fill */
867 #if DEBUG
868 /* This slows down booting the DEBUG kernel, particularly on
869 * large memory systems, but is worthwhile in deterministically
870 * trapping uninitialized memory usage.
871 */
872 if (fill == 0) {
873 fill = 1;
874 fillval = 0xDEB8F177;
875 }
876 #endif
877 if (fill)
878 kprintf("Filling vm_pages with pattern: 0x%x\n", fillval);
879 // -debug code remove
880 if (2 == vm_himemory_mode) {
881 // free low -> high so high is preferred
882 for (i = 1; i <= pages_initialized; i++) {
883 if(fill) fillPage(vm_pages[i - 1].phys_page, fillval); /* Fill the page with a known value if requested at boot */
884 vm_page_release(&vm_pages[i - 1]);
885 }
886 }
887 else
888 // debug code remove-
889
890 /*
891 * Release pages in reverse order so that physical pages
892 * initially get allocated in ascending addresses. This keeps
893 * the devices (which must address physical memory) happy if
894 * they require several consecutive pages.
895 */
896 for (i = pages_initialized; i > 0; i--) {
897 if(fill) fillPage(vm_pages[i - 1].phys_page, fillval); /* Fill the page with a known value if requested at boot */
898 vm_page_release(&vm_pages[i - 1]);
899 }
900
901 #if 0
902 {
903 vm_page_t xx, xxo, xxl;
904 int i, j, k, l;
905
906 j = 0; /* (BRINGUP) */
907 xxl = 0;
908
909 for( i = 0; i < vm_colors; i++ ) {
910 queue_iterate(&vm_page_queue_free[i],
911 xx,
912 vm_page_t,
913 pageq) { /* BRINGUP */
914 j++; /* (BRINGUP) */
915 if(j > vm_page_free_count) { /* (BRINGUP) */
916 panic("pmap_startup: too many pages, xx = %08X, xxl = %08X\n", xx, xxl);
917 }
918
919 l = vm_page_free_count - j; /* (BRINGUP) */
920 k = 0; /* (BRINGUP) */
921
922 if(((j - 1) & 0xFFFF) == 0) kprintf("checking number %d of %d\n", j, vm_page_free_count);
923
924 for(xxo = xx->pageq.next; xxo != &vm_page_queue_free[i]; xxo = xxo->pageq.next) { /* (BRINGUP) */
925 k++;
926 if(k > l) panic("pmap_startup: too many in secondary check %d %d\n", k, l);
927 if((xx->phys_page & 0xFFFFFFFF) == (xxo->phys_page & 0xFFFFFFFF)) { /* (BRINGUP) */
928 panic("pmap_startup: duplicate physaddr, xx = %08X, xxo = %08X\n", xx, xxo);
929 }
930 }
931
932 xxl = xx;
933 }
934 }
935
936 if(j != vm_page_free_count) { /* (BRINGUP) */
937 panic("pmap_startup: vm_page_free_count does not match, calc = %d, vm_page_free_count = %08X\n", j, vm_page_free_count);
938 }
939 }
940 #endif
941
942
943 /*
944 * We have to re-align virtual_space_start,
945 * because pmap_steal_memory has been using it.
946 */
947
948 virtual_space_start = round_page(virtual_space_start);
949
950 *startp = virtual_space_start;
951 *endp = virtual_space_end;
952 }
953 #endif /* MACHINE_PAGES */
954
955 /*
956 * Routine: vm_page_module_init
957 * Purpose:
958 * Second initialization pass, to be done after
959 * the basic VM system is ready.
960 */
961 void
962 vm_page_module_init(void)
963 {
964 vm_page_zone = zinit((vm_size_t) sizeof(struct vm_page),
965 0, PAGE_SIZE, "vm pages");
966
967 #if ZONE_DEBUG
968 zone_debug_disable(vm_page_zone);
969 #endif /* ZONE_DEBUG */
970
971 zone_change(vm_page_zone, Z_CALLERACCT, FALSE);
972 zone_change(vm_page_zone, Z_EXPAND, FALSE);
973 zone_change(vm_page_zone, Z_EXHAUST, TRUE);
974 zone_change(vm_page_zone, Z_FOREIGN, TRUE);
975 zone_change(vm_page_zone, Z_GZALLOC_EXEMPT, TRUE);
976 /*
977 * Adjust zone statistics to account for the real pages allocated
978 * in vm_page_create(). [Q: is this really what we want?]
979 */
980 vm_page_zone->count += vm_page_pages;
981 vm_page_zone->sum_count += vm_page_pages;
982 vm_page_zone->cur_size += vm_page_pages * vm_page_zone->elem_size;
983 }
984
985 /*
986 * Routine: vm_page_create
987 * Purpose:
988 * After the VM system is up, machine-dependent code
989 * may stumble across more physical memory. For example,
990 * memory that it was reserving for a frame buffer.
991 * vm_page_create turns this memory into available pages.
992 */
993
994 void
995 vm_page_create(
996 ppnum_t start,
997 ppnum_t end)
998 {
999 ppnum_t phys_page;
1000 vm_page_t m;
1001
1002 for (phys_page = start;
1003 phys_page < end;
1004 phys_page++) {
1005 while ((m = (vm_page_t) vm_page_grab_fictitious_common(phys_page))
1006 == VM_PAGE_NULL)
1007 vm_page_more_fictitious();
1008
1009 m->fictitious = FALSE;
1010 pmap_clear_noencrypt(phys_page);
1011
1012 vm_page_pages++;
1013 vm_page_release(m);
1014 }
1015 }
1016
1017 /*
1018 * vm_page_hash:
1019 *
1020 * Distributes the object/offset key pair among hash buckets.
1021 *
1022 * NOTE: The bucket count must be a power of 2
1023 */
1024 #define vm_page_hash(object, offset) (\
1025 ( (natural_t)((uintptr_t)object * vm_page_bucket_hash) + ((uint32_t)atop_64(offset) ^ vm_page_bucket_hash))\
1026 & vm_page_hash_mask)
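/*
 * Usage sketch (illustrative; it mirrors the pattern used by
 * vm_page_insert_internal() and vm_page_lookup() below):
 *
 *	hash_id     = vm_page_hash(object, offset);
 *	bucket      = &vm_page_buckets[hash_id];
 *	bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
 *
 * vm_page_bucket_hash is kept odd (see vm_page_bootstrap) so that
 * multiplication by it remains a one-to-one mapping of the object
 * pointer before the result is masked down to a bucket index.
 */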
1027
1028
1029 /*
1030 * vm_page_insert: [ internal use only ]
1031 *
1032 * Inserts the given mem entry into the object/object-page
1033 * table and object list.
1034 *
1035 * The object must be locked.
1036 */
1037 void
1038 vm_page_insert(
1039 vm_page_t mem,
1040 vm_object_t object,
1041 vm_object_offset_t offset)
1042 {
1043 vm_page_insert_internal(mem, object, offset, FALSE, TRUE, FALSE);
1044 }
1045
1046 void
1047 vm_page_insert_internal(
1048 vm_page_t mem,
1049 vm_object_t object,
1050 vm_object_offset_t offset,
1051 boolean_t queues_lock_held,
1052 boolean_t insert_in_hash,
1053 boolean_t batch_pmap_op)
1054 {
1055 vm_page_bucket_t *bucket;
1056 lck_spin_t *bucket_lock;
1057 int hash_id;
1058
1059 XPR(XPR_VM_PAGE,
1060 "vm_page_insert, object 0x%X offset 0x%X page 0x%X\n",
1061 object, offset, mem, 0,0);
1062 #if 0
1063 /*
1064 * we may not hold the page queue lock
1065 * so this check isn't safe to make
1066 */
1067 VM_PAGE_CHECK(mem);
1068 #endif
1069
1070 assert(page_aligned(offset));
1071
1072 if (object == vm_submap_object) {
1073 /* the vm_submap_object is only a placeholder for submaps */
1074 panic("vm_page_insert(vm_submap_object,0x%llx)\n", offset);
1075 }
1076
1077 vm_object_lock_assert_exclusive(object);
1078 #if DEBUG
1079 lck_mtx_assert(&vm_page_queue_lock,
1080 queues_lock_held ? LCK_MTX_ASSERT_OWNED
1081 : LCK_MTX_ASSERT_NOTOWNED);
1082 #endif /* DEBUG */
1083
1084 if (insert_in_hash == TRUE) {
1085 #if DEBUG || VM_PAGE_CHECK_BUCKETS
1086 if (mem->tabled || mem->object != VM_OBJECT_NULL)
1087 panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) "
1088 "already in (obj=%p,off=0x%llx)",
1089 mem, object, offset, mem->object, mem->offset);
1090 #endif
1091 assert(!object->internal || offset < object->vo_size);
1092
1093 /* only insert "pageout" pages into "pageout" objects,
1094 * and normal pages into normal objects */
1095 assert(object->pageout == mem->pageout);
1096
1097 assert(vm_page_lookup(object, offset) == VM_PAGE_NULL);
1098
1099 /*
1100 * Record the object/offset pair in this page
1101 */
1102
1103 mem->object = object;
1104 mem->offset = offset;
1105
1106 /*
1107 * Insert it into the object/offset hash table
1108 */
1109 hash_id = vm_page_hash(object, offset);
1110 bucket = &vm_page_buckets[hash_id];
1111 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
1112
1113 lck_spin_lock(bucket_lock);
1114
1115 mem->next = bucket->pages;
1116 bucket->pages = mem;
1117 #if MACH_PAGE_HASH_STATS
1118 if (++bucket->cur_count > bucket->hi_count)
1119 bucket->hi_count = bucket->cur_count;
1120 #endif /* MACH_PAGE_HASH_STATS */
1121 mem->hashed = TRUE;
1122 lck_spin_unlock(bucket_lock);
1123 }
1124
1125 {
1126 unsigned int cache_attr;
1127
1128 cache_attr = object->wimg_bits & VM_WIMG_MASK;
1129
1130 if (cache_attr != VM_WIMG_USE_DEFAULT) {
1131 PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op);
1132 }
1133 }
1134 /*
1135 * Now link into the object's list of backed pages.
1136 */
1137 VM_PAGE_INSERT(mem, object);
1138 mem->tabled = TRUE;
1139
1140 /*
1141 * Show that the object has one more resident page.
1142 */
1143
1144 object->resident_page_count++;
1145 if (VM_PAGE_WIRED(mem)) {
1146 object->wired_page_count++;
1147 }
1148 assert(object->resident_page_count >= object->wired_page_count);
1149
1150 if (object->internal) {
1151 OSAddAtomic(1, &vm_page_internal_count);
1152 } else {
1153 OSAddAtomic(1, &vm_page_external_count);
1154 }
1155
1156 /*
1157 * It wouldn't make sense to insert a "reusable" page in
1158 * an object (the page would have been marked "reusable" only
1159 * at the time of a madvise(MADV_FREE_REUSABLE) if it was already
1160 * in the object at that time).
1161 * But a page could be inserted in an "all_reusable" object, if
1162 * something faults it in (a vm_read() from another task or a
1163 * "use-after-free" issue in user space, for example). It can
1164 * also happen if we're relocating a page from that object to
1165 * a different physical page during a physically-contiguous
1166 * allocation.
1167 */
1168 assert(!mem->reusable);
1169 if (mem->object->all_reusable) {
1170 OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count);
1171 }
1172
1173 if (object->purgable == VM_PURGABLE_VOLATILE) {
1174 if (VM_PAGE_WIRED(mem)) {
1175 OSAddAtomic(1, &vm_page_purgeable_wired_count);
1176 } else {
1177 OSAddAtomic(1, &vm_page_purgeable_count);
1178 }
1179 } else if (object->purgable == VM_PURGABLE_EMPTY &&
1180 mem->throttled) {
1181 /*
1182 * This page belongs to a purged VM object but hasn't
1183 * been purged (because it was "busy").
1184 * It's in the "throttled" queue and hence not
1185 * visible to vm_pageout_scan(). Move it to a pageable
1186 * queue, so that it can eventually be reclaimed, instead
1187 * of lingering in the "empty" object.
1188 */
1189 if (queues_lock_held == FALSE)
1190 vm_page_lockspin_queues();
1191 vm_page_deactivate(mem);
1192 if (queues_lock_held == FALSE)
1193 vm_page_unlock_queues();
1194 }
1195 }
1196
1197 /*
1198 * vm_page_replace:
1199 *
1200 * Exactly like vm_page_insert, except that we first
1201 * remove any existing page at the given offset in object.
1202 *
1203 * The object must be locked.
1204 */
1205 void
1206 vm_page_replace(
1207 register vm_page_t mem,
1208 register vm_object_t object,
1209 register vm_object_offset_t offset)
1210 {
1211 vm_page_bucket_t *bucket;
1212 vm_page_t found_m = VM_PAGE_NULL;
1213 lck_spin_t *bucket_lock;
1214 int hash_id;
1215
1216 #if 0
1217 /*
1218 * we don't hold the page queue lock
1219 * so this check isn't safe to make
1220 */
1221 VM_PAGE_CHECK(mem);
1222 #endif
1223 vm_object_lock_assert_exclusive(object);
1224 #if DEBUG || VM_PAGE_CHECK_BUCKETS
1225 if (mem->tabled || mem->object != VM_OBJECT_NULL)
1226 panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) "
1227 "already in (obj=%p,off=0x%llx)",
1228 mem, object, offset, mem->object, mem->offset);
1229 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
1230 #endif
1231 /*
1232 * Record the object/offset pair in this page
1233 */
1234
1235 mem->object = object;
1236 mem->offset = offset;
1237
1238 /*
1239 * Insert it into the object/offset hash table,
1240 * replacing any page that might have been there.
1241 */
1242
1243 hash_id = vm_page_hash(object, offset);
1244 bucket = &vm_page_buckets[hash_id];
1245 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
1246
1247 lck_spin_lock(bucket_lock);
1248
1249 if (bucket->pages) {
1250 vm_page_t *mp = &bucket->pages;
1251 vm_page_t m = *mp;
1252
1253 do {
1254 if (m->object == object && m->offset == offset) {
1255 /*
1256 * Remove old page from hash list
1257 */
1258 *mp = m->next;
1259 m->hashed = FALSE;
1260
1261 found_m = m;
1262 break;
1263 }
1264 mp = &m->next;
1265 } while ((m = *mp));
1266
1267 mem->next = bucket->pages;
1268 } else {
1269 mem->next = VM_PAGE_NULL;
1270 }
1271 /*
1272 * insert new page at head of hash list
1273 */
1274 bucket->pages = mem;
1275 mem->hashed = TRUE;
1276
1277 lck_spin_unlock(bucket_lock);
1278
1279 if (found_m) {
1280 /*
1281 * there was already a page at the specified
1282 * offset for this object... remove it from
1283 * the object and free it back to the free list
1284 */
1285 vm_page_free_unlocked(found_m, FALSE);
1286 }
1287 vm_page_insert_internal(mem, object, offset, FALSE, FALSE, FALSE);
1288 }
1289
1290 /*
1291 * vm_page_remove: [ internal use only ]
1292 *
1293 * Removes the given mem entry from the object/offset-page
1294 * table and the object page list.
1295 *
1296 * The object must be locked.
1297 */
1298
1299 void
1300 vm_page_remove(
1301 vm_page_t mem,
1302 boolean_t remove_from_hash)
1303 {
1304 vm_page_bucket_t *bucket;
1305 vm_page_t this;
1306 lck_spin_t *bucket_lock;
1307 int hash_id;
1308
1309 XPR(XPR_VM_PAGE,
1310 "vm_page_remove, object 0x%X offset 0x%X page 0x%X\n",
1311 mem->object, mem->offset,
1312 mem, 0,0);
1313
1314 vm_object_lock_assert_exclusive(mem->object);
1315 assert(mem->tabled);
1316 assert(!mem->cleaning);
1317 assert(!mem->laundry);
1318 #if 0
1319 /*
1320 * we don't hold the page queue lock
1321 * so this check isn't safe to make
1322 */
1323 VM_PAGE_CHECK(mem);
1324 #endif
1325 if (remove_from_hash == TRUE) {
1326 /*
1327 * Remove from the object/offset hash table
1328 */
1329 hash_id = vm_page_hash(mem->object, mem->offset);
1330 bucket = &vm_page_buckets[hash_id];
1331 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
1332
1333 lck_spin_lock(bucket_lock);
1334
1335 if ((this = bucket->pages) == mem) {
1336 /* optimize for common case */
1337
1338 bucket->pages = mem->next;
1339 } else {
1340 vm_page_t *prev;
1341
1342 for (prev = &this->next;
1343 (this = *prev) != mem;
1344 prev = &this->next)
1345 continue;
1346 *prev = this->next;
1347 }
1348 #if MACH_PAGE_HASH_STATS
1349 bucket->cur_count--;
1350 #endif /* MACH_PAGE_HASH_STATS */
1351 mem->hashed = FALSE;
1352 lck_spin_unlock(bucket_lock);
1353 }
1354 /*
1355 * Now remove from the object's list of backed pages.
1356 */
1357
1358 VM_PAGE_REMOVE(mem);
1359
1360 /*
1361 * And show that the object has one fewer resident
1362 * page.
1363 */
1364
1365 assert(mem->object->resident_page_count > 0);
1366 mem->object->resident_page_count--;
1367
1368 if (mem->object->internal) {
1369 assert(vm_page_internal_count);
1370 OSAddAtomic(-1, &vm_page_internal_count);
1371 } else {
1372 assert(vm_page_external_count);
1373 OSAddAtomic(-1, &vm_page_external_count);
1374 }
1375 if (!mem->object->internal && (mem->object->objq.next || mem->object->objq.prev)) {
1376 if (mem->object->resident_page_count == 0)
1377 vm_object_cache_remove(mem->object);
1378 }
1379
1380 if (VM_PAGE_WIRED(mem)) {
1381 assert(mem->object->wired_page_count > 0);
1382 mem->object->wired_page_count--;
1383 }
1384 assert(mem->object->resident_page_count >=
1385 mem->object->wired_page_count);
1386 if (mem->reusable) {
1387 assert(mem->object->reusable_page_count > 0);
1388 mem->object->reusable_page_count--;
1389 assert(mem->object->reusable_page_count <=
1390 mem->object->resident_page_count);
1391 mem->reusable = FALSE;
1392 OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
1393 vm_page_stats_reusable.reused_remove++;
1394 } else if (mem->object->all_reusable) {
1395 OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
1396 vm_page_stats_reusable.reused_remove++;
1397 }
1398
1399 if (mem->object->purgable == VM_PURGABLE_VOLATILE) {
1400 if (VM_PAGE_WIRED(mem)) {
1401 assert(vm_page_purgeable_wired_count > 0);
1402 OSAddAtomic(-1, &vm_page_purgeable_wired_count);
1403 } else {
1404 assert(vm_page_purgeable_count > 0);
1405 OSAddAtomic(-1, &vm_page_purgeable_count);
1406 }
1407 }
1408 if (mem->object->set_cache_attr == TRUE)
1409 pmap_set_cache_attributes(mem->phys_page, 0);
1410
1411 mem->tabled = FALSE;
1412 mem->object = VM_OBJECT_NULL;
1413 mem->offset = (vm_object_offset_t) -1;
1414 }
1415
1416
1417 /*
1418 * vm_page_lookup:
1419 *
1420 * Returns the page associated with the object/offset
1421 * pair specified; if none is found, VM_PAGE_NULL is returned.
1422 *
1423 * The object must be locked. No side effects.
1424 */
1425
1426 unsigned long vm_page_lookup_hint = 0;
1427 unsigned long vm_page_lookup_hint_next = 0;
1428 unsigned long vm_page_lookup_hint_prev = 0;
1429 unsigned long vm_page_lookup_hint_miss = 0;
1430 unsigned long vm_page_lookup_bucket_NULL = 0;
1431 unsigned long vm_page_lookup_miss = 0;
1432
1433
1434 vm_page_t
1435 vm_page_lookup(
1436 vm_object_t object,
1437 vm_object_offset_t offset)
1438 {
1439 vm_page_t mem;
1440 vm_page_bucket_t *bucket;
1441 queue_entry_t qe;
1442 lck_spin_t *bucket_lock;
1443 int hash_id;
1444
1445 vm_object_lock_assert_held(object);
1446 mem = object->memq_hint;
1447
1448 if (mem != VM_PAGE_NULL) {
1449 assert(mem->object == object);
1450
1451 if (mem->offset == offset) {
1452 vm_page_lookup_hint++;
1453 return mem;
1454 }
1455 qe = queue_next(&mem->listq);
1456
1457 if (! queue_end(&object->memq, qe)) {
1458 vm_page_t next_page;
1459
1460 next_page = (vm_page_t) qe;
1461 assert(next_page->object == object);
1462
1463 if (next_page->offset == offset) {
1464 vm_page_lookup_hint_next++;
1465 object->memq_hint = next_page; /* new hint */
1466 return next_page;
1467 }
1468 }
1469 qe = queue_prev(&mem->listq);
1470
1471 if (! queue_end(&object->memq, qe)) {
1472 vm_page_t prev_page;
1473
1474 prev_page = (vm_page_t) qe;
1475 assert(prev_page->object == object);
1476
1477 if (prev_page->offset == offset) {
1478 vm_page_lookup_hint_prev++;
1479 object->memq_hint = prev_page; /* new hint */
1480 return prev_page;
1481 }
1482 }
1483 }
1484 /*
1485 * Search the hash table for this object/offset pair
1486 */
1487 hash_id = vm_page_hash(object, offset);
1488 bucket = &vm_page_buckets[hash_id];
1489
1490 /*
1491 * since we hold the object lock, we are guaranteed that no
1492 * new pages can be inserted into this object... this in turn
1493 * guarantees that the page we're looking for can't exist
1494 * if the bucket it hashes to is currently NULL even when looked
1495 * at outside the scope of the hash bucket lock... this is a
1496 * really cheap optimization to avoid taking the lock
1497 */
1498 if (bucket->pages == VM_PAGE_NULL) {
1499 vm_page_lookup_bucket_NULL++;
1500
1501 return (VM_PAGE_NULL);
1502 }
1503 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
1504
1505 lck_spin_lock(bucket_lock);
1506
1507 for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem->next) {
1508 #if 0
1509 /*
1510 * we don't hold the page queue lock
1511 * so this check isn't safe to make
1512 */
1513 VM_PAGE_CHECK(mem);
1514 #endif
1515 if ((mem->object == object) && (mem->offset == offset))
1516 break;
1517 }
1518 lck_spin_unlock(bucket_lock);
1519
1520 if (mem != VM_PAGE_NULL) {
1521 if (object->memq_hint != VM_PAGE_NULL) {
1522 vm_page_lookup_hint_miss++;
1523 }
1524 assert(mem->object == object);
1525 object->memq_hint = mem;
1526 } else
1527 vm_page_lookup_miss++;
1528
1529 return(mem);
1530 }
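/*
 * Minimal usage sketch (illustrative only; "object", "offset" and "m"
 * are placeholders).  The object lock must be held across the call,
 * as asserted above:
 *
 *	vm_object_lock(object);
 *	m = vm_page_lookup(object, offset);
 *	if (m != VM_PAGE_NULL) {
 *		... use m while the object lock is still held ...
 *	}
 *	vm_object_unlock(object);
 */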
1531
1532
1533 /*
1534 * vm_page_rename:
1535 *
1536 * Move the given memory entry from its
1537 * current object to the specified target object/offset.
1538 *
1539 * The object must be locked.
1540 */
1541 void
1542 vm_page_rename(
1543 register vm_page_t mem,
1544 register vm_object_t new_object,
1545 vm_object_offset_t new_offset,
1546 boolean_t encrypted_ok)
1547 {
1548 boolean_t internal_to_external, external_to_internal;
1549
1550 assert(mem->object != new_object);
1551
1552 /*
1553 * ENCRYPTED SWAP:
1554 * The encryption key is based on the page's memory object
1555 * (aka "pager") and paging offset. Moving the page to
1556 * another VM object changes its "pager" and "paging_offset"
1557 * so it has to be decrypted first, or we would lose the key.
1558 *
1559 * One exception is VM object collapsing, where we transfer pages
1560 * from one backing object to its parent object. This operation also
1561 * transfers the paging information, so the <pager,paging_offset> info
1562 * should remain consistent. The caller (vm_object_do_collapse())
1563 * sets "encrypted_ok" in this case.
1564 */
1565 if (!encrypted_ok && mem->encrypted) {
1566 panic("vm_page_rename: page %p is encrypted\n", mem);
1567 }
1568
1569 XPR(XPR_VM_PAGE,
1570 "vm_page_rename, new object 0x%X, offset 0x%X page 0x%X\n",
1571 new_object, new_offset,
1572 mem, 0,0);
1573
1574 /*
1575 * Changes to mem->object require the page lock because
1576 * the pageout daemon uses that lock to get the object.
1577 */
1578 vm_page_lockspin_queues();
1579
1580 internal_to_external = FALSE;
1581 external_to_internal = FALSE;
1582
1583 if (mem->local) {
1584 /*
1585 * it's much easier to get the vm_page_pageable_xxx accounting correct
1586 * if we first move the page to the active queue... it's going to end
1587 * up there anyway, and we don't call vm_page_rename frequently enough
1588 * for this to matter.
1589 */
1590 VM_PAGE_QUEUES_REMOVE(mem);
1591 vm_page_activate(mem);
1592 }
1593 if (mem->active || mem->inactive || mem->speculative) {
1594 if (mem->object->internal && !new_object->internal) {
1595 internal_to_external = TRUE;
1596 }
1597 if (!mem->object->internal && new_object->internal) {
1598 external_to_internal = TRUE;
1599 }
1600 }
1601
1602 vm_page_remove(mem, TRUE);
1603 vm_page_insert_internal(mem, new_object, new_offset, TRUE, TRUE, FALSE);
1604
1605 if (internal_to_external) {
1606 vm_page_pageable_internal_count--;
1607 vm_page_pageable_external_count++;
1608 } else if (external_to_internal) {
1609 vm_page_pageable_external_count--;
1610 vm_page_pageable_internal_count++;
1611 }
1612
1613 vm_page_unlock_queues();
1614 }
1615
1616 /*
1617 * vm_page_init:
1618 *
1619 * Initialize the fields in a new page.
1620 * This takes a structure with random values and initializes it
1621 * so that it can be given to vm_page_release or vm_page_insert.
1622 */
1623 void
1624 vm_page_init(
1625 vm_page_t mem,
1626 ppnum_t phys_page,
1627 boolean_t lopage)
1628 {
1629 assert(phys_page);
1630
1631 #if DEBUG
1632 if ((phys_page != vm_page_fictitious_addr) && (phys_page != vm_page_guard_addr)) {
1633 if (!(pmap_valid_page(phys_page))) {
1634 panic("vm_page_init: non-DRAM phys_page 0x%x\n", phys_page);
1635 }
1636 }
1637 #endif
1638 *mem = vm_page_template;
1639 mem->phys_page = phys_page;
1640 #if 0
1641 /*
1642 * we're leaving this turned off for now... currently pages
1643 * come off the free list and are either immediately dirtied/referenced
1644 * due to zero-fill or COW faults, or are used to read or write files...
1645 * in the file I/O case, the UPL mechanism takes care of clearing
1646 * the state of the HW ref/mod bits in a somewhat fragile way.
1647 * Since we may change the way this works in the future (to toughen it up),
1648 * I'm leaving this as a reminder of where these bits could get cleared
1649 */
1650
1651 /*
1652 * make sure both the h/w referenced and modified bits are
1653 * clear at this point... we are especially dependent on
1654 * not finding a 'stale' h/w modified in a number of spots
1655 * once this page goes back into use
1656 */
1657 pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
1658 #endif
1659 mem->lopage = lopage;
1660 }
1661
1662 /*
1663 * vm_page_grab_fictitious:
1664 *
1665 * Remove a fictitious page from the free list.
1666 * Returns VM_PAGE_NULL if there are no free pages.
1667 */
1668 int c_vm_page_grab_fictitious = 0;
1669 int c_vm_page_grab_fictitious_failed = 0;
1670 int c_vm_page_release_fictitious = 0;
1671 int c_vm_page_more_fictitious = 0;
1672
1673 vm_page_t
1674 vm_page_grab_fictitious_common(
1675 ppnum_t phys_addr)
1676 {
1677 vm_page_t m;
1678
1679 if ((m = (vm_page_t)zget(vm_page_zone))) {
1680
1681 vm_page_init(m, phys_addr, FALSE);
1682 m->fictitious = TRUE;
1683
1684 c_vm_page_grab_fictitious++;
1685 } else
1686 c_vm_page_grab_fictitious_failed++;
1687
1688 return m;
1689 }
1690
1691 vm_page_t
1692 vm_page_grab_fictitious(void)
1693 {
1694 return vm_page_grab_fictitious_common(vm_page_fictitious_addr);
1695 }
1696
1697 vm_page_t
1698 vm_page_grab_guard(void)
1699 {
1700 return vm_page_grab_fictitious_common(vm_page_guard_addr);
1701 }
1702
1703
1704 /*
1705 * vm_page_release_fictitious:
1706 *
1707 * Release a fictitious page to the zone pool
1708 */
1709 void
1710 vm_page_release_fictitious(
1711 vm_page_t m)
1712 {
1713 assert(!m->free);
1714 assert(m->fictitious);
1715 assert(m->phys_page == vm_page_fictitious_addr ||
1716 m->phys_page == vm_page_guard_addr);
1717
1718 c_vm_page_release_fictitious++;
1719
1720 zfree(vm_page_zone, m);
1721 }
1722
1723 /*
1724 * vm_page_more_fictitious:
1725 *
1726 * Add more fictitious pages to the zone.
1727 * Allowed to block. This routine is way intimate
1728 * with the zones code, for several reasons:
1729 * 1. we need to carve some page structures out of physical
1730 * memory before zones work, so they _cannot_ come from
1731 * the zone_map.
1732 * 2. the zone needs to be collectable in order to prevent
1733 * growth without bound. These structures are used by
1734 * the device pager (by the hundreds and thousands), as
1735 * private pages for pageout, and as blocking pages for
1736 * pagein. Temporary bursts in demand should not result in
1737 * permanent allocation of a resource.
1738 * 3. To smooth allocation humps, we allocate single pages
1739 * with kernel_memory_allocate(), and cram them into the
1740 * zone.
1741 */
1742
1743 void vm_page_more_fictitious(void)
1744 {
1745 vm_offset_t addr;
1746 kern_return_t retval;
1747
1748 c_vm_page_more_fictitious++;
1749
1750 /*
1751 * Allocate a single page from the zone_map. Do not wait if no physical
1752 * pages are immediately available, and do not zero the space. We need
1753 * our own blocking lock here to prevent having multiple,
1754 * simultaneous requests from piling up on the zone_map lock. Exactly
1755 * one (of our) threads should be potentially waiting on the map lock.
1756 * If winner is not vm-privileged, then the page allocation will fail,
1757 * and it will temporarily block here in the vm_page_wait().
1758 */
1759 lck_mtx_lock(&vm_page_alloc_lock);
1760 /*
1761 * If another thread allocated space, just bail out now.
1762 */
1763 if (zone_free_count(vm_page_zone) > 5) {
1764 /*
1765 * The number "5" is a small number that is larger than the
1766 * number of fictitious pages that any single caller will
1767 * attempt to allocate. Otherwise, a thread will attempt to
1768 * acquire a fictitious page (vm_page_grab_fictitious), fail,
1769 * release all of the resources and locks already acquired,
1770 * and then call this routine. This routine finds the pages
1771 * that the caller released, so fails to allocate new space.
1772 * The process repeats infinitely. The largest known number
1773 * of fictitious pages required in this manner is 2. 5 is
1774 * simply a somewhat larger number.
1775 */
1776 lck_mtx_unlock(&vm_page_alloc_lock);
1777 return;
1778 }
1779
1780 retval = kernel_memory_allocate(zone_map,
1781 &addr, PAGE_SIZE, VM_PROT_ALL,
1782 KMA_KOBJECT|KMA_NOPAGEWAIT);
1783 if (retval != KERN_SUCCESS) {
1784 /*
1785 * No page was available. Drop the
1786 * lock to give another thread a chance at it, and
1787 * wait for the pageout daemon to make progress.
1788 */
1789 lck_mtx_unlock(&vm_page_alloc_lock);
1790 vm_page_wait(THREAD_UNINT);
1791 return;
1792 }
1793
1794 /* Increment zone page count. We account for all memory managed by the zone in z->page_count */
1795 OSAddAtomic64(1, &(vm_page_zone->page_count));
1796
1797 zcram(vm_page_zone, addr, PAGE_SIZE);
1798
1799 lck_mtx_unlock(&vm_page_alloc_lock);
1800 }
1801
1802
1803 /*
1804 * vm_pool_low():
1805 *
1806 * Return true if it is not likely that a non-vm_privileged thread
1807 * can get memory without blocking. Advisory only, since the
1808 * situation may change under us.
1809 */
1810 int
1811 vm_pool_low(void)
1812 {
1813 /* No locking, at worst we will fib. */
1814 return( vm_page_free_count <= vm_page_free_reserved );
1815 }
1816
1817
1818
1819 /*
1820 * this is an interface to support bring-up of drivers
1821 * on platforms with physical memory > 4G...
1822 */
1823 int vm_himemory_mode = 0;
1824
1825
1826 /*
1827 * this interface exists to support hardware controllers
1828 * incapable of generating DMAs with more than 32 bits
1829 * of address on platforms with physical memory > 4G...
1830 */
1831 unsigned int vm_lopages_allocated_q = 0;
1832 unsigned int vm_lopages_allocated_cpm_success = 0;
1833 unsigned int vm_lopages_allocated_cpm_failed = 0;
1834 queue_head_t vm_lopage_queue_free;
1835
1836 vm_page_t
1837 vm_page_grablo(void)
1838 {
1839 vm_page_t mem;
1840
1841 if (vm_lopage_needed == FALSE)
1842 return (vm_page_grab());
1843
1844 lck_mtx_lock_spin(&vm_page_queue_free_lock);
1845
1846 if ( !queue_empty(&vm_lopage_queue_free)) {
1847 queue_remove_first(&vm_lopage_queue_free,
1848 mem,
1849 vm_page_t,
1850 pageq);
1851 assert(vm_lopage_free_count);
1852
1853 vm_lopage_free_count--;
1854 vm_lopages_allocated_q++;
1855
1856 if (vm_lopage_free_count < vm_lopage_lowater)
1857 vm_lopage_refill = TRUE;
1858
1859 lck_mtx_unlock(&vm_page_queue_free_lock);
1860 } else {
1861 lck_mtx_unlock(&vm_page_queue_free_lock);
1862
1863 if (cpm_allocate(PAGE_SIZE, &mem, atop(0xffffffff), 0, FALSE, KMA_LOMEM) != KERN_SUCCESS) {
1864
1865 lck_mtx_lock_spin(&vm_page_queue_free_lock);
1866 vm_lopages_allocated_cpm_failed++;
1867 lck_mtx_unlock(&vm_page_queue_free_lock);
1868
1869 return (VM_PAGE_NULL);
1870 }
1871 mem->busy = TRUE;
1872
1873 vm_page_lockspin_queues();
1874
1875 mem->gobbled = FALSE;
1876 vm_page_gobble_count--;
1877 vm_page_wire_count--;
1878
1879 vm_lopages_allocated_cpm_success++;
1880 vm_page_unlock_queues();
1881 }
1882 assert(mem->busy);
1883 assert(!mem->free);
1884 assert(!mem->pmapped);
1885 assert(!mem->wpmapped);
1886 assert(!pmap_is_noencrypt(mem->phys_page));
1887
1888 mem->pageq.next = NULL;
1889 mem->pageq.prev = NULL;
1890
1891 return (mem);
1892 }
1893
1894
1895 /*
1896 * vm_page_grab:
1897 *
1898 * first try to grab a page from the per-cpu free list...
1899 * this must be done while pre-emption is disabled... if
1900 * a page is available, we're done...
1901 * if no page is available, grab the vm_page_queue_free_lock
1902 * and see if current number of free pages would allow us
1903 * to grab at least 1... if not, return VM_PAGE_NULL as before...
1904 * if there are pages available, disable preemption and
1905 * recheck the state of the per-cpu free list... we could
1906 * have been preempted and moved to a different cpu, or
1907 * some other thread could have re-filled it... if still
1908 * empty, figure out how many pages we can steal from the
1909 * global free queue and move to the per-cpu queue...
1910 * return one of these pages when done... only wake up the
1911 * pageout_scan thread if we moved pages from the global
1912 * list... no need for the wakeup if we've satisfied the
1913 * request from the per-cpu queue.
1914 */
1915
1916 #define COLOR_GROUPS_TO_STEAL 4
1917
1918
1919 vm_page_t
1920 vm_page_grab( void )
1921 {
1922 vm_page_t mem;
1923
1924
1925 disable_preemption();
1926
1927 if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) {
1928 return_page_from_cpu_list:
1929 PROCESSOR_DATA(current_processor(), page_grab_count) += 1;
1930 PROCESSOR_DATA(current_processor(), free_pages) = mem->pageq.next;
1931 mem->pageq.next = NULL;
1932
1933 enable_preemption();
1934
1935 assert(mem->listq.next == NULL && mem->listq.prev == NULL);
1936 assert(mem->tabled == FALSE);
1937 assert(mem->object == VM_OBJECT_NULL);
1938 assert(!mem->laundry);
1939 assert(!mem->free);
1940 assert(pmap_verify_free(mem->phys_page));
1941 assert(mem->busy);
1942 assert(!mem->encrypted);
1943 assert(!mem->pmapped);
1944 assert(!mem->wpmapped);
1945 assert(!mem->active);
1946 assert(!mem->inactive);
1947 assert(!mem->throttled);
1948 assert(!mem->speculative);
1949 assert(!pmap_is_noencrypt(mem->phys_page));
1950
1951 return mem;
1952 }
1953 enable_preemption();
1954
1955
1956 /*
1957 * Optionally produce warnings if the wire or gobble
1958 * counts exceed some threshold.
1959 */
1960 if (vm_page_wire_count_warning > 0
1961 && vm_page_wire_count >= vm_page_wire_count_warning) {
1962 printf("mk: vm_page_grab(): high wired page count of %d\n",
1963 vm_page_wire_count);
1964 assert(vm_page_wire_count < vm_page_wire_count_warning);
1965 }
1966 if (vm_page_gobble_count_warning > 0
1967 && vm_page_gobble_count >= vm_page_gobble_count_warning) {
1968 printf("mk: vm_page_grab(): high gobbled page count of %d\n",
1969 vm_page_gobble_count);
1970 assert(vm_page_gobble_count < vm_page_gobble_count_warning);
1971 }
1972
1973 lck_mtx_lock_spin(&vm_page_queue_free_lock);
1974
1975 /*
1976 * Only let privileged threads (involved in pageout)
1977 * dip into the reserved pool.
1978 */
1979 if ((vm_page_free_count < vm_page_free_reserved) &&
1980 !(current_thread()->options & TH_OPT_VMPRIV)) {
1981 lck_mtx_unlock(&vm_page_queue_free_lock);
1982 mem = VM_PAGE_NULL;
1983 }
1984 else {
1985 vm_page_t head;
1986 vm_page_t tail;
1987 unsigned int pages_to_steal;
1988 unsigned int color;
1989
1990 while ( vm_page_free_count == 0 ) {
1991
1992 lck_mtx_unlock(&vm_page_queue_free_lock);
1993 /*
1994 * must be a privileged thread to be
1995 * in this state since a non-privileged
1996 * thread would have bailed if we were
1997 * under the vm_page_free_reserved mark
1998 */
1999 VM_PAGE_WAIT();
2000 lck_mtx_lock_spin(&vm_page_queue_free_lock);
2001 }
2002
2003 disable_preemption();
2004
2005 if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) {
2006 lck_mtx_unlock(&vm_page_queue_free_lock);
2007
2008 /*
2009 * we got preempted and moved to another processor
2010 * or we got preempted and someone else ran and filled the cache
2011 */
2012 goto return_page_from_cpu_list;
2013 }
2014 if (vm_page_free_count <= vm_page_free_reserved)
2015 pages_to_steal = 1;
2016 else {
2017 pages_to_steal = COLOR_GROUPS_TO_STEAL * vm_colors;
2018
2019 if (pages_to_steal > (vm_page_free_count - vm_page_free_reserved))
2020 pages_to_steal = (vm_page_free_count - vm_page_free_reserved);
2021 }
2022 color = PROCESSOR_DATA(current_processor(), start_color);
2023 head = tail = NULL;
2024
2025 while (pages_to_steal--) {
2026 if (--vm_page_free_count < vm_page_free_count_minimum)
2027 vm_page_free_count_minimum = vm_page_free_count;
2028
2029 while (queue_empty(&vm_page_queue_free[color]))
2030 color = (color + 1) & vm_color_mask;
2031
2032 queue_remove_first(&vm_page_queue_free[color],
2033 mem,
2034 vm_page_t,
2035 pageq);
2036 mem->pageq.next = NULL;
2037 mem->pageq.prev = NULL;
2038
2039 assert(!mem->active);
2040 assert(!mem->inactive);
2041 assert(!mem->throttled);
2042 assert(!mem->speculative);
2043
2044 color = (color + 1) & vm_color_mask;
2045
2046 if (head == NULL)
2047 head = mem;
2048 else
2049 tail->pageq.next = (queue_t)mem;
2050 tail = mem;
2051
2052 mem->pageq.prev = NULL;
2053 assert(mem->listq.next == NULL && mem->listq.prev == NULL);
2054 assert(mem->tabled == FALSE);
2055 assert(mem->object == VM_OBJECT_NULL);
2056 assert(!mem->laundry);
2057 assert(mem->free);
2058 mem->free = FALSE;
2059
2060 assert(pmap_verify_free(mem->phys_page));
2061 assert(mem->busy);
2062 assert(!mem->free);
2063 assert(!mem->encrypted);
2064 assert(!mem->pmapped);
2065 assert(!mem->wpmapped);
2066 assert(!pmap_is_noencrypt(mem->phys_page));
2067 }
2068 PROCESSOR_DATA(current_processor(), free_pages) = head->pageq.next;
2069 PROCESSOR_DATA(current_processor(), start_color) = color;
2070
2071 /*
2072 * satisfy this request
2073 */
2074 PROCESSOR_DATA(current_processor(), page_grab_count) += 1;
2075 mem = head;
2076 mem->pageq.next = NULL;
2077
2078 lck_mtx_unlock(&vm_page_queue_free_lock);
2079
2080 enable_preemption();
2081 }
2082 /*
2083 * Decide if we should poke the pageout daemon.
2084 * We do this if the free count is less than the low
2085 * water mark, or if the free count is less than the high
2086 * water mark (but above the low water mark) and the inactive
2087 * count is less than its target.
2088 *
2089 * We don't have the counts locked ... if they change a little,
2090 * it doesn't really matter.
2091 */
2092 if ((vm_page_free_count < vm_page_free_min) ||
2093 ((vm_page_free_count < vm_page_free_target) &&
2094 ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min)))
2095 thread_wakeup((event_t) &vm_page_free_wanted);
2096
2097 VM_CHECK_MEMORYSTATUS;
2098
2099 // dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 4); /* (TEST/DEBUG) */
2100
2101 return mem;
2102 }
2103
2104 /*
2105 * vm_page_release:
2106 *
2107 * Return a page to the free list.
2108 */
2109
2110 void
2111 vm_page_release(
2112 register vm_page_t mem)
2113 {
2114 unsigned int color;
2115 int need_wakeup = 0;
2116 int need_priv_wakeup = 0;
2117
2118
2119 assert(!mem->private && !mem->fictitious);
2120 if (vm_page_free_verify) {
2121 assert(pmap_verify_free(mem->phys_page));
2122 }
2123 // dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 5); /* (TEST/DEBUG) */
2124
2125 pmap_clear_noencrypt(mem->phys_page);
2126
2127 lck_mtx_lock_spin(&vm_page_queue_free_lock);
2128 #if DEBUG
2129 if (mem->free)
2130 panic("vm_page_release");
2131 #endif
2132
2133 assert(mem->busy);
2134 assert(!mem->laundry);
2135 assert(mem->object == VM_OBJECT_NULL);
2136 assert(mem->pageq.next == NULL &&
2137 mem->pageq.prev == NULL);
2138 assert(mem->listq.next == NULL &&
2139 mem->listq.prev == NULL);
2140
2141 if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) &&
2142 vm_lopage_free_count < vm_lopage_free_limit &&
2143 mem->phys_page < max_valid_low_ppnum) {
2144 /*
2145 * this exists to support hardware controllers
2146 * incapable of generating DMAs with more than 32 bits
2147 * of address on platforms with physical memory > 4G...
2148 */
2149 queue_enter_first(&vm_lopage_queue_free,
2150 mem,
2151 vm_page_t,
2152 pageq);
2153 vm_lopage_free_count++;
2154
2155 if (vm_lopage_free_count >= vm_lopage_free_limit)
2156 vm_lopage_refill = FALSE;
2157
2158 mem->lopage = TRUE;
2159 } else {
2160 mem->lopage = FALSE;
2161 mem->free = TRUE;
2162
2163 color = mem->phys_page & vm_color_mask;
2164 queue_enter_first(&vm_page_queue_free[color],
2165 mem,
2166 vm_page_t,
2167 pageq);
2168 vm_page_free_count++;
2169 /*
2170 * Check if we should wake up someone waiting for page.
2171 * But don't bother waking them unless they can allocate.
2172 *
2173 * We wake up only one thread, to prevent starvation.
2174 * Because the scheduling system handles wait queues FIFO,
2175 * if we wake up all waiting threads, one greedy thread
2176 * can starve multiple well-behaved threads. When the threads
2177 * all wake up, the greedy thread runs first, grabs the page,
2178 * and waits for another page. It will be the first to run
2179 * when the next page is freed.
2180 *
2181 * However, there is a slight danger here.
2182 * The thread we wake might not use the free page.
2183 * Then the other threads could wait indefinitely
2184 * while the page goes unused. To forestall this,
2185 * the pageout daemon will keep making free pages
2186 * as long as vm_page_free_wanted is non-zero.
2187 */
2188
2189 assert(vm_page_free_count > 0);
2190 if (vm_page_free_wanted_privileged > 0) {
2191 vm_page_free_wanted_privileged--;
2192 need_priv_wakeup = 1;
2193 } else if (vm_page_free_wanted > 0 &&
2194 vm_page_free_count > vm_page_free_reserved) {
2195 vm_page_free_wanted--;
2196 need_wakeup = 1;
2197 }
2198 }
2199 lck_mtx_unlock(&vm_page_queue_free_lock);
2200
2201 if (need_priv_wakeup)
2202 thread_wakeup_one((event_t) &vm_page_free_wanted_privileged);
2203 else if (need_wakeup)
2204 thread_wakeup_one((event_t) &vm_page_free_count);
2205
2206 VM_CHECK_MEMORYSTATUS;
2207 }
2208
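/*
 * Illustrative sketch, not part of the original file: the free queues
 * above are indexed by page "color", which is simply the low bits of
 * the physical page number. With vm_colors == 8 (so vm_color_mask == 7),
 * physical page 0x12345 would land on free queue 0x12345 & 7 == 5.
 * The color count used here is an assumed example value.
 */
static unsigned int
example_free_queue_color(vm_page_t m)
{
	/* same computation vm_page_release() uses to pick a queue */
	return (m->phys_page & vm_color_mask);
}
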
2209 /*
2210 * vm_page_wait:
2211 *
2212 * Wait for a page to become available.
2213 * If there are plenty of free pages, then we don't sleep.
2214 *
2215 * Returns:
2216 * TRUE: There may be another page, try again
2217 * FALSE: We were interrupted out of our wait, don't try again
2218 */
2219
2220 boolean_t
2221 vm_page_wait(
2222 int interruptible )
2223 {
2224 /*
2225 * We can't use vm_page_free_reserved to make this
2226 * determination. Consider: some thread might
2227 * need to allocate two pages. The first allocation
2228 * succeeds, the second fails. After the first page is freed,
2229 * a call to vm_page_wait must really block.
2230 */
2231 kern_return_t wait_result;
2232 int need_wakeup = 0;
2233 int is_privileged = current_thread()->options & TH_OPT_VMPRIV;
2234
2235 lck_mtx_lock_spin(&vm_page_queue_free_lock);
2236
2237 if (is_privileged && vm_page_free_count) {
2238 lck_mtx_unlock(&vm_page_queue_free_lock);
2239 return TRUE;
2240 }
2241 if (vm_page_free_count < vm_page_free_target) {
2242
2243 if (is_privileged) {
2244 if (vm_page_free_wanted_privileged++ == 0)
2245 need_wakeup = 1;
2246 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible);
2247 } else {
2248 if (vm_page_free_wanted++ == 0)
2249 need_wakeup = 1;
2250 wait_result = assert_wait((event_t)&vm_page_free_count, interruptible);
2251 }
2252 lck_mtx_unlock(&vm_page_queue_free_lock);
2253 counter(c_vm_page_wait_block++);
2254
2255 if (need_wakeup)
2256 thread_wakeup((event_t)&vm_page_free_wanted);
2257
2258 if (wait_result == THREAD_WAITING) {
2259 VM_DEBUG_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
2260 vm_page_free_wanted_privileged, vm_page_free_wanted, 0, 0);
2261 wait_result = thread_block(THREAD_CONTINUE_NULL);
2262 VM_DEBUG_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
2263 }
2264
2265 return(wait_result == THREAD_AWAKENED);
2266 } else {
2267 lck_mtx_unlock(&vm_page_queue_free_lock);
2268 return TRUE;
2269 }
2270 }
2271
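/*
 * Illustrative sketch, not part of the original file: the grab/wait
 * retry loop that callers which can sleep typically wrap around
 * vm_page_grab() (the same pattern appears in the fallback path of
 * vm_page_part_zero_fill() below).
 */
static vm_page_t
example_grab_page_blocking(void)
{
	vm_page_t m;

	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		/* free pool exhausted; sleep until pages are released */
		if (vm_page_wait(THREAD_UNINT) == FALSE)
			break;	/* interrupted out of the wait; give up */
	}
	return m;
}
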
2272 /*
2273 * vm_page_alloc:
2274 *
2275 * Allocate and return a memory cell associated
2276 * with this VM object/offset pair.
2277 *
2278 * Object must be locked.
2279 */
2280
2281 vm_page_t
2282 vm_page_alloc(
2283 vm_object_t object,
2284 vm_object_offset_t offset)
2285 {
2286 register vm_page_t mem;
2287
2288 vm_object_lock_assert_exclusive(object);
2289 mem = vm_page_grab();
2290 if (mem == VM_PAGE_NULL)
2291 return VM_PAGE_NULL;
2292
2293 vm_page_insert(mem, object, offset);
2294
2295 return(mem);
2296 }
2297
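/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller of vm_page_alloc(). The object must be locked exclusively
 * across the call, and a VM_PAGE_NULL return is handled by dropping
 * the lock, waiting for free pages, and retrying.
 */
static vm_page_t
example_alloc_page_for_object(
	vm_object_t		object,
	vm_object_offset_t	offset)
{
	vm_page_t m;

	for (;;) {
		vm_object_lock(object);
		m = vm_page_alloc(object, offset);
		vm_object_unlock(object);

		if (m != VM_PAGE_NULL)
			return m;
		/* never sleep with the object lock held */
		vm_page_wait(THREAD_UNINT);
	}
}
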
2298 vm_page_t
2299 vm_page_alloclo(
2300 vm_object_t object,
2301 vm_object_offset_t offset)
2302 {
2303 register vm_page_t mem;
2304
2305 vm_object_lock_assert_exclusive(object);
2306 mem = vm_page_grablo();
2307 if (mem == VM_PAGE_NULL)
2308 return VM_PAGE_NULL;
2309
2310 vm_page_insert(mem, object, offset);
2311
2312 return(mem);
2313 }
2314
2315
2316 /*
2317 * vm_page_alloc_guard:
2318 *
2319 * Allocate a fictitious page which will be used
2320 * as a guard page. The page will be inserted into
2321 * the object and returned to the caller.
2322 */
2323
2324 vm_page_t
2325 vm_page_alloc_guard(
2326 vm_object_t object,
2327 vm_object_offset_t offset)
2328 {
2329 register vm_page_t mem;
2330
2331 vm_object_lock_assert_exclusive(object);
2332 mem = vm_page_grab_guard();
2333 if (mem == VM_PAGE_NULL)
2334 return VM_PAGE_NULL;
2335
2336 vm_page_insert(mem, object, offset);
2337
2338 return(mem);
2339 }
2340
2341
2342 counter(unsigned int c_laundry_pages_freed = 0;)
2343
2344 /*
2345 * vm_page_free_prepare:
2346 *
2347 * Removes page from any queue it may be on
2348 * and disassociates it from its VM object.
2349 *
2350 * Object and page queues must be locked prior to entry.
2351 */
2352 static void
2353 vm_page_free_prepare(
2354 vm_page_t mem)
2355 {
2356 vm_page_free_prepare_queues(mem);
2357 vm_page_free_prepare_object(mem, TRUE);
2358 }
2359
2360
2361 void
2362 vm_page_free_prepare_queues(
2363 vm_page_t mem)
2364 {
2365 VM_PAGE_CHECK(mem);
2366 assert(!mem->free);
2367 assert(!mem->cleaning);
2368 #if DEBUG
2369 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2370 if (mem->free)
2371 panic("vm_page_free: freeing page on free list\n");
2372 #endif
2373 if (mem->object) {
2374 vm_object_lock_assert_exclusive(mem->object);
2375 }
2376 if (mem->laundry) {
2377 /*
2378 * We may have to free a page while it's being laundered
2379 * if we lost its pager (due to a forced unmount, for example).
2380 * We need to call vm_pageout_steal_laundry() before removing
2381 * the page from its VM object, so that we can remove it
2382 * from its pageout queue and adjust the laundry accounting
2383 */
2384 vm_pageout_steal_laundry(mem, TRUE);
2385 counter(++c_laundry_pages_freed);
2386 }
2387
2388 VM_PAGE_QUEUES_REMOVE(mem); /* clears local/active/inactive/throttled/speculative */
2389
2390 if (VM_PAGE_WIRED(mem)) {
2391 if (mem->object) {
2392 assert(mem->object->wired_page_count > 0);
2393 mem->object->wired_page_count--;
2394 assert(mem->object->resident_page_count >=
2395 mem->object->wired_page_count);
2396
2397 if (mem->object->purgable == VM_PURGABLE_VOLATILE) {
2398 OSAddAtomic(+1, &vm_page_purgeable_count);
2399 assert(vm_page_purgeable_wired_count > 0);
2400 OSAddAtomic(-1, &vm_page_purgeable_wired_count);
2401 }
2402 }
2403 if (!mem->private && !mem->fictitious)
2404 vm_page_wire_count--;
2405 mem->wire_count = 0;
2406 assert(!mem->gobbled);
2407 } else if (mem->gobbled) {
2408 if (!mem->private && !mem->fictitious)
2409 vm_page_wire_count--;
2410 vm_page_gobble_count--;
2411 }
2412 }
2413
2414
2415 void
2416 vm_page_free_prepare_object(
2417 vm_page_t mem,
2418 boolean_t remove_from_hash)
2419 {
2420 if (mem->tabled)
2421 vm_page_remove(mem, remove_from_hash); /* clears tabled, object, offset */
2422
2423 PAGE_WAKEUP(mem); /* clears wanted */
2424
2425 if (mem->private) {
2426 mem->private = FALSE;
2427 mem->fictitious = TRUE;
2428 mem->phys_page = vm_page_fictitious_addr;
2429 }
2430 if ( !mem->fictitious) {
2431 vm_page_init(mem, mem->phys_page, mem->lopage);
2432 }
2433 }
2434
2435
2436 /*
2437 * vm_page_free:
2438 *
2439 * Returns the given page to the free list,
2440 * disassociating it from any VM object.
2441 *
2442 * Object and page queues must be locked prior to entry.
2443 */
2444 void
2445 vm_page_free(
2446 vm_page_t mem)
2447 {
2448 vm_page_free_prepare(mem);
2449
2450 if (mem->fictitious) {
2451 vm_page_release_fictitious(mem);
2452 } else {
2453 vm_page_release(mem);
2454 }
2455 }
2456
2457
2458 void
2459 vm_page_free_unlocked(
2460 vm_page_t mem,
2461 boolean_t remove_from_hash)
2462 {
2463 vm_page_lockspin_queues();
2464 vm_page_free_prepare_queues(mem);
2465 vm_page_unlock_queues();
2466
2467 vm_page_free_prepare_object(mem, remove_from_hash);
2468
2469 if (mem->fictitious) {
2470 vm_page_release_fictitious(mem);
2471 } else {
2472 vm_page_release(mem);
2473 }
2474 }
2475
2476
2477 /*
2478 * Free a list of pages. The list can be up to several hundred pages,
2479 * as blocked up by vm_pageout_scan().
2480 * The big win is not having to take the free list lock once
2481 * per page.
2482 */
2483 void
2484 vm_page_free_list(
2485 vm_page_t freeq,
2486 boolean_t prepare_object)
2487 {
2488 vm_page_t mem;
2489 vm_page_t nxt;
2490 vm_page_t local_freeq;
2491 int pg_count;
2492
2493 while (freeq) {
2494
2495 pg_count = 0;
2496 local_freeq = VM_PAGE_NULL;
2497 mem = freeq;
2498
2499 /*
2500 * break up the processing into smaller chunks so
2501 * that we can 'pipeline' the pages onto the
2502 * free list w/o introducing too much
2503 * contention on the global free queue lock
2504 */
2505 while (mem && pg_count < 64) {
2506
2507 assert(!mem->inactive);
2508 assert(!mem->active);
2509 assert(!mem->throttled);
2510 assert(!mem->free);
2511 assert(!mem->speculative);
2512 assert(!VM_PAGE_WIRED(mem));
2513 assert(mem->pageq.prev == NULL);
2514
2515 nxt = (vm_page_t)(mem->pageq.next);
2516
2517 if (vm_page_free_verify && !mem->fictitious && !mem->private) {
2518 assert(pmap_verify_free(mem->phys_page));
2519 }
2520 if (prepare_object == TRUE)
2521 vm_page_free_prepare_object(mem, TRUE);
2522
2523 if (!mem->fictitious) {
2524 assert(mem->busy);
2525
2526 if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) &&
2527 vm_lopage_free_count < vm_lopage_free_limit &&
2528 mem->phys_page < max_valid_low_ppnum) {
2529 mem->pageq.next = NULL;
2530 vm_page_release(mem);
2531 } else {
2532 /*
2533 * IMPORTANT: we can't set the page "free" here
2534 * because that would make the page eligible for
2535 * a physically-contiguous allocation (see
2536 * vm_page_find_contiguous()) right away (we don't
2537 * hold the vm_page_queue_free lock). That would
2538 * cause trouble because the page is not actually
2539 * in the free queue yet...
2540 */
2541 mem->pageq.next = (queue_entry_t)local_freeq;
2542 local_freeq = mem;
2543 pg_count++;
2544
2545 pmap_clear_noencrypt(mem->phys_page);
2546 }
2547 } else {
2548 assert(mem->phys_page == vm_page_fictitious_addr ||
2549 mem->phys_page == vm_page_guard_addr);
2550 vm_page_release_fictitious(mem);
2551 }
2552 mem = nxt;
2553 }
2554 freeq = mem;
2555
2556 if ( (mem = local_freeq) ) {
2557 unsigned int avail_free_count;
2558 unsigned int need_wakeup = 0;
2559 unsigned int need_priv_wakeup = 0;
2560
2561 lck_mtx_lock_spin(&vm_page_queue_free_lock);
2562
2563 while (mem) {
2564 int color;
2565
2566 nxt = (vm_page_t)(mem->pageq.next);
2567
2568 assert(!mem->free);
2569 assert(mem->busy);
2570 mem->free = TRUE;
2571
2572 color = mem->phys_page & vm_color_mask;
2573 queue_enter_first(&vm_page_queue_free[color],
2574 mem,
2575 vm_page_t,
2576 pageq);
2577 mem = nxt;
2578 }
2579 vm_page_free_count += pg_count;
2580 avail_free_count = vm_page_free_count;
2581
2582 if (vm_page_free_wanted_privileged > 0 && avail_free_count > 0) {
2583
2584 if (avail_free_count < vm_page_free_wanted_privileged) {
2585 need_priv_wakeup = avail_free_count;
2586 vm_page_free_wanted_privileged -= avail_free_count;
2587 avail_free_count = 0;
2588 } else {
2589 need_priv_wakeup = vm_page_free_wanted_privileged;
2590 vm_page_free_wanted_privileged = 0;
2591 avail_free_count -= vm_page_free_wanted_privileged;
2592 }
2593 }
2594 if (vm_page_free_wanted > 0 && avail_free_count > vm_page_free_reserved) {
2595 unsigned int available_pages;
2596
2597 available_pages = avail_free_count - vm_page_free_reserved;
2598
2599 if (available_pages >= vm_page_free_wanted) {
2600 need_wakeup = vm_page_free_wanted;
2601 vm_page_free_wanted = 0;
2602 } else {
2603 need_wakeup = available_pages;
2604 vm_page_free_wanted -= available_pages;
2605 }
2606 }
2607 lck_mtx_unlock(&vm_page_queue_free_lock);
2608
2609 if (need_priv_wakeup != 0) {
2610 /*
2611 * There shouldn't be that many VM-privileged threads,
2612 * so let's wake them all up, even if we don't quite
2613 * have enough pages to satisfy them all.
2614 */
2615 thread_wakeup((event_t)&vm_page_free_wanted_privileged);
2616 }
2617 if (need_wakeup != 0 && vm_page_free_wanted == 0) {
2618 /*
2619 * We don't expect to have any more waiters
2620 * after this, so let's wake them all up at
2621 * once.
2622 */
2623 thread_wakeup((event_t) &vm_page_free_count);
2624 } else for (; need_wakeup != 0; need_wakeup--) {
2625 /*
2626 * Wake up one waiter per page we just released.
2627 */
2628 thread_wakeup_one((event_t) &vm_page_free_count);
2629 }
2630
2631 VM_CHECK_MEMORYSTATUS;
2632 }
2633 }
2634 }
2635
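/*
 * Illustrative sketch, not part of the original file: how a caller such
 * as vm_pageout_scan() batches pages for vm_page_free_list(). The pages
 * are assumed to have already been taken off the paging queues and
 * unwired (e.g. via vm_page_free_prepare_queues()); they are chained
 * through pageq.next so the free-list lock is taken once per batch
 * rather than once per page.
 */
static void
example_free_page_batch(
	vm_page_t	*pages,
	int		count)
{
	vm_page_t	freeq = VM_PAGE_NULL;
	int		i;

	for (i = 0; i < count; i++) {
		pages[i]->pageq.next = (queue_entry_t) freeq;
		freeq = pages[i];
	}
	/* TRUE: also disassociate each page from its object and hash */
	vm_page_free_list(freeq, TRUE);
}
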
2636
2637 /*
2638 * vm_page_wire:
2639 *
2640 * Mark this page as wired down by yet
2641 * another map, removing it from paging queues
2642 * as necessary.
2643 *
2644 * The page's object and the page queues must be locked.
2645 */
2646 void
2647 vm_page_wire(
2648 register vm_page_t mem)
2649 {
2650
2651 // dbgLog(current_thread(), mem->offset, mem->object, 1); /* (TEST/DEBUG) */
2652
2653 VM_PAGE_CHECK(mem);
2654 if (mem->object) {
2655 vm_object_lock_assert_exclusive(mem->object);
2656 } else {
2657 /*
2658 * In theory, the page should be in an object before it
2659 * gets wired, since we need to hold the object lock
2660 * to update some fields in the page structure.
2661 * However, some code (i386 pmap, for example) might want
2662 * to wire a page before it gets inserted into an object.
2663 * That's somewhat OK, as long as nobody else can get to
2664 * that page and update it at the same time.
2665 */
2666 }
2667 #if DEBUG
2668 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2669 #endif
2670 if ( !VM_PAGE_WIRED(mem)) {
2671
2672 if (mem->pageout_queue) {
2673 mem->pageout = FALSE;
2674 vm_pageout_throttle_up(mem);
2675 }
2676 VM_PAGE_QUEUES_REMOVE(mem);
2677
2678 if (mem->object) {
2679 mem->object->wired_page_count++;
2680 assert(mem->object->resident_page_count >=
2681 mem->object->wired_page_count);
2682 if (mem->object->purgable == VM_PURGABLE_VOLATILE) {
2683 assert(vm_page_purgeable_count > 0);
2684 OSAddAtomic(-1, &vm_page_purgeable_count);
2685 OSAddAtomic(1, &vm_page_purgeable_wired_count);
2686 }
2687 if (mem->object->all_reusable) {
2688 /*
2689 * Wired pages are not counted as "re-usable"
2690 * in "all_reusable" VM objects, so nothing
2691 * to do here.
2692 */
2693 } else if (mem->reusable) {
2694 /*
2695 * This page is not "re-usable" when it's
2696 * wired, so adjust its state and the
2697 * accounting.
2698 */
2699 vm_object_reuse_pages(mem->object,
2700 mem->offset,
2701 mem->offset+PAGE_SIZE_64,
2702 FALSE);
2703 }
2704 }
2705 assert(!mem->reusable);
2706
2707 if (!mem->private && !mem->fictitious && !mem->gobbled)
2708 vm_page_wire_count++;
2709 if (mem->gobbled)
2710 vm_page_gobble_count--;
2711 mem->gobbled = FALSE;
2712
2713 VM_CHECK_MEMORYSTATUS;
2714
2715 /*
2716 * ENCRYPTED SWAP:
2717 * The page could be encrypted, but
2718 * We don't have to decrypt it here
2719 * because we don't guarantee that the
2720 * data is actually valid at this point.
2721 * The page will get decrypted in
2722 * vm_fault_wire() if needed.
2723 */
2724 }
2725 assert(!mem->gobbled);
2726 mem->wire_count++;
2727 VM_PAGE_CHECK(mem);
2728 }
2729
2730 /*
2731 * vm_page_gobble:
2732 *
2733 * Mark this page as consumed by the vm/ipc/xmm subsystems.
2734 *
2735 * Called only for freshly vm_page_grab()ed pages - w/ nothing locked.
2736 */
2737 void
2738 vm_page_gobble(
2739 register vm_page_t mem)
2740 {
2741 vm_page_lockspin_queues();
2742 VM_PAGE_CHECK(mem);
2743
2744 assert(!mem->gobbled);
2745 assert( !VM_PAGE_WIRED(mem));
2746
2747 if (!mem->gobbled && !VM_PAGE_WIRED(mem)) {
2748 if (!mem->private && !mem->fictitious)
2749 vm_page_wire_count++;
2750 }
2751 vm_page_gobble_count++;
2752 mem->gobbled = TRUE;
2753 vm_page_unlock_queues();
2754 }
2755
2756 /*
2757 * vm_page_unwire:
2758 *
2759 * Release one wiring of this page, potentially
2760 * enabling it to be paged again.
2761 *
2762 * The page's object and the page queues must be locked.
2763 */
2764 void
2765 vm_page_unwire(
2766 vm_page_t mem,
2767 boolean_t queueit)
2768 {
2769
2770 // dbgLog(current_thread(), mem->offset, mem->object, 0); /* (TEST/DEBUG) */
2771
2772 VM_PAGE_CHECK(mem);
2773 assert(VM_PAGE_WIRED(mem));
2774 assert(mem->object != VM_OBJECT_NULL);
2775 #if DEBUG
2776 vm_object_lock_assert_exclusive(mem->object);
2777 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2778 #endif
2779 if (--mem->wire_count == 0) {
2780 assert(!mem->private && !mem->fictitious);
2781 vm_page_wire_count--;
2782 assert(mem->object->wired_page_count > 0);
2783 mem->object->wired_page_count--;
2784 assert(mem->object->resident_page_count >=
2785 mem->object->wired_page_count);
2786 if (mem->object->purgable == VM_PURGABLE_VOLATILE) {
2787 OSAddAtomic(+1, &vm_page_purgeable_count);
2788 assert(vm_page_purgeable_wired_count > 0);
2789 OSAddAtomic(-1, &vm_page_purgeable_wired_count);
2790 }
2791 assert(!mem->laundry);
2792 assert(mem->object != kernel_object);
2793 assert(mem->pageq.next == NULL && mem->pageq.prev == NULL);
2794
2795 if (queueit == TRUE) {
2796 if (mem->object->purgable == VM_PURGABLE_EMPTY) {
2797 vm_page_deactivate(mem);
2798 } else {
2799 vm_page_activate(mem);
2800 }
2801 }
2802
2803 VM_CHECK_MEMORYSTATUS;
2804
2805 }
2806 VM_PAGE_CHECK(mem);
2807 }
2808
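/*
 * Illustrative sketch, not part of the original file: wire/unwire calls
 * are paired, and both require the page's object lock (exclusive) plus
 * the page queue lock, as a hypothetical caller might do around a short
 * section that needs the page to stay resident.
 */
static void
example_wire_briefly(vm_page_t m)
{
	vm_object_lock(m->object);

	vm_page_lockspin_queues();
	vm_page_wire(m);
	vm_page_unlock_queues();

	/* ... operate on the wired page ... */

	vm_page_lockspin_queues();
	vm_page_unwire(m, TRUE);	/* TRUE: requeue onto active/inactive */
	vm_page_unlock_queues();

	vm_object_unlock(m->object);
}
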
2809 /*
2810 * vm_page_deactivate:
2811 *
2812 * Returns the given page to the inactive list,
2813 * indicating that no physical maps have access
2814 * to this page. [Used by the physical mapping system.]
2815 *
2816 * The page queues must be locked.
2817 */
2818 void
2819 vm_page_deactivate(
2820 vm_page_t m)
2821 {
2822 vm_page_deactivate_internal(m, TRUE);
2823 }
2824
2825
2826 void
2827 vm_page_deactivate_internal(
2828 vm_page_t m,
2829 boolean_t clear_hw_reference)
2830 {
2831
2832 VM_PAGE_CHECK(m);
2833 assert(m->object != kernel_object);
2834 assert(m->phys_page != vm_page_guard_addr);
2835
2836 // dbgLog(m->phys_page, vm_page_free_count, vm_page_wire_count, 6); /* (TEST/DEBUG) */
2837 #if DEBUG
2838 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2839 #endif
2840 /*
2841 * This page is no longer very interesting. If it was
2842 * interesting (active or inactive/referenced), then we
2843 * clear the reference bit and (re)enter it in the
2844 * inactive queue. Note wired pages should not have
2845 * their reference bit cleared.
2846 */
2847 assert ( !(m->absent && !m->unusual));
2848
2849 if (m->gobbled) { /* can this happen? */
2850 assert( !VM_PAGE_WIRED(m));
2851
2852 if (!m->private && !m->fictitious)
2853 vm_page_wire_count--;
2854 vm_page_gobble_count--;
2855 m->gobbled = FALSE;
2856 }
2857 /*
2858 * if this page is currently on the pageout queue, we can't do the
2859 * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case)
2860 * and we can't remove it manually since we would need the object lock
2861 * (which is not required here) to decrement the activity_in_progress
2862 * reference which is held on the object while the page is in the pageout queue...
2863 * just let the normal laundry processing proceed
2864 */
2865 if (m->pageout_queue || m->private || m->fictitious || m->compressor || (VM_PAGE_WIRED(m)))
2866 return;
2867
2868 if (!m->absent && clear_hw_reference == TRUE)
2869 pmap_clear_reference(m->phys_page);
2870
2871 m->reference = FALSE;
2872 m->no_cache = FALSE;
2873
2874 if (!m->inactive) {
2875 VM_PAGE_QUEUES_REMOVE(m);
2876
2877 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
2878 m->dirty && m->object->internal &&
2879 (m->object->purgable == VM_PURGABLE_DENY ||
2880 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
2881 m->object->purgable == VM_PURGABLE_VOLATILE)) {
2882 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
2883 m->throttled = TRUE;
2884 vm_page_throttled_count++;
2885 } else {
2886 if (m->object->named && m->object->ref_count == 1) {
2887 vm_page_speculate(m, FALSE);
2888 #if DEVELOPMENT || DEBUG
2889 vm_page_speculative_recreated++;
2890 #endif
2891 } else {
2892 VM_PAGE_ENQUEUE_INACTIVE(m, FALSE);
2893 }
2894 }
2895 }
2896 }
2897
2898 /*
2899 * vm_page_enqueue_cleaned
2900 *
2901 * Put the page on the cleaned queue, mark it cleaned, etc.
2902 * Being on the cleaned queue (and having m->clean_queue set)
2903 * does ** NOT ** guarantee that the page is clean!
2904 *
2905 * Call with the queues lock held.
2906 */
2907
2908 void vm_page_enqueue_cleaned(vm_page_t m)
2909 {
2910 assert(m->phys_page != vm_page_guard_addr);
2911 #if DEBUG
2912 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2913 #endif
2914 assert( !(m->absent && !m->unusual));
2915
2916 if (m->gobbled) {
2917 assert( !VM_PAGE_WIRED(m));
2918 if (!m->private && !m->fictitious)
2919 vm_page_wire_count--;
2920 vm_page_gobble_count--;
2921 m->gobbled = FALSE;
2922 }
2923 /*
2924 * if this page is currently on the pageout queue, we can't do the
2925 * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case)
2926 * and we can't remove it manually since we would need the object lock
2927 * (which is not required here) to decrement the activity_in_progress
2928 * reference which is held on the object while the page is in the pageout queue...
2929 * just let the normal laundry processing proceed
2930 */
2931 if (m->clean_queue || m->pageout_queue || m->private || m->fictitious)
2932 return;
2933
2934 VM_PAGE_QUEUES_REMOVE(m);
2935
2936 queue_enter(&vm_page_queue_cleaned, m, vm_page_t, pageq);
2937 m->clean_queue = TRUE;
2938 vm_page_cleaned_count++;
2939
2940 m->inactive = TRUE;
2941 vm_page_inactive_count++;
2942 if (m->object->internal) {
2943 vm_page_pageable_internal_count++;
2944 } else {
2945 vm_page_pageable_external_count++;
2946 }
2947
2948 vm_pageout_enqueued_cleaned++;
2949 }
2950
2951 /*
2952 * vm_page_activate:
2953 *
2954 * Put the specified page on the active list (if appropriate).
2955 *
2956 * The page queues must be locked.
2957 */
2958
2959 #if CONFIG_JETSAM
2960 #if LATENCY_JETSAM
2961 extern struct vm_page jetsam_latency_page[NUM_OF_JETSAM_LATENCY_TOKENS];
2962 #endif /* LATENCY_JETSAM */
2963 #endif /* CONFIG_JETSAM */
2964
2965 void
2966 vm_page_activate(
2967 register vm_page_t m)
2968 {
2969 VM_PAGE_CHECK(m);
2970 #ifdef FIXME_4778297
2971 assert(m->object != kernel_object);
2972 #endif
2973 assert(m->phys_page != vm_page_guard_addr);
2974 #if DEBUG
2975 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2976 #endif
2977 assert( !(m->absent && !m->unusual));
2978
2979 if (m->gobbled) {
2980 assert( !VM_PAGE_WIRED(m));
2981 if (!m->private && !m->fictitious)
2982 vm_page_wire_count--;
2983 vm_page_gobble_count--;
2984 m->gobbled = FALSE;
2985 }
2986 /*
2987 * if this page is currently on the pageout queue, we can't do the
2988 * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case)
2989 * and we can't remove it manually since we would need the object lock
2990 * (which is not required here) to decrement the activity_in_progress
2991 * reference which is held on the object while the page is in the pageout queue...
2992 * just let the normal laundry processing proceed
2993 */
2994 if (m->pageout_queue || m->private || m->fictitious || m->compressor)
2995 return;
2996
2997 #if DEBUG
2998 if (m->active)
2999 panic("vm_page_activate: already active");
3000 #endif
3001
3002 if (m->speculative) {
3003 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
3004 DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL);
3005 }
3006
3007 VM_PAGE_QUEUES_REMOVE(m);
3008
3009 if ( !VM_PAGE_WIRED(m)) {
3010
3011 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
3012 m->dirty && m->object->internal &&
3013 (m->object->purgable == VM_PURGABLE_DENY ||
3014 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
3015 m->object->purgable == VM_PURGABLE_VOLATILE)) {
3016 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
3017 m->throttled = TRUE;
3018 vm_page_throttled_count++;
3019 } else {
3020 queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
3021 m->active = TRUE;
3022 vm_page_active_count++;
3023 if (m->object->internal) {
3024 vm_page_pageable_internal_count++;
3025 } else {
3026 vm_page_pageable_external_count++;
3027 }
3028 #if LATENCY_JETSAM
3029 if (jlp_init) {
3030 uint64_t now = mach_absolute_time();
3031 uint64_t delta = now - jlp_time;
3032 clock_sec_t jl_secs = 0;
3033 clock_usec_t jl_usecs = 0;
3034 vm_page_t jlp;
3035
3036 absolutetime_to_microtime(delta, &jl_secs, &jl_usecs);
3037
3038 jl_usecs += jl_secs * USEC_PER_SEC;
3039 if (jl_usecs >= JETSAM_LATENCY_TOKEN_AGE) {
3040
3041 jlp = &jetsam_latency_page[jlp_current];
3042 if (jlp->active) {
3043 queue_remove(&vm_page_queue_active, jlp, vm_page_t, pageq);
3044 }
3045 queue_enter(&vm_page_queue_active, jlp, vm_page_t, pageq);
3046
3047 jlp->active = TRUE;
3048
3049 jlp->offset = now;
3050 jlp_time = jlp->offset;
3051
3052 if(++jlp_current == NUM_OF_JETSAM_LATENCY_TOKENS) {
3053 jlp_current = 0;
3054 }
3055
3056 }
3057 }
3058 #endif /* LATENCY_JETSAM */
3059 }
3060 m->reference = TRUE;
3061 m->no_cache = FALSE;
3062 }
3063 VM_PAGE_CHECK(m);
3064 }
3065
3066
3067 /*
3068 * vm_page_speculate:
3069 *
3070 * Put the specified page on the speculative list (if appropriate).
3071 *
3072 * The page queues must be locked.
3073 */
3074 void
3075 vm_page_speculate(
3076 vm_page_t m,
3077 boolean_t new)
3078 {
3079 struct vm_speculative_age_q *aq;
3080
3081 VM_PAGE_CHECK(m);
3082 assert(m->object != kernel_object);
3083 assert(m->phys_page != vm_page_guard_addr);
3084 #if DEBUG
3085 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
3086 #endif
3087 assert( !(m->absent && !m->unusual));
3088
3089 /*
3090 * if this page is currently on the pageout queue, we can't do the
3091 * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case)
3092 * and we can't remove it manually since we would need the object lock
3093 * (which is not required here) to decrement the activity_in_progress
3094 * reference which is held on the object while the page is in the pageout queue...
3095 * just let the normal laundry processing proceed
3096 */
3097 if (m->pageout_queue || m->private || m->fictitious || m->compressor)
3098 return;
3099
3100 VM_PAGE_QUEUES_REMOVE(m);
3101
3102 if ( !VM_PAGE_WIRED(m)) {
3103 mach_timespec_t ts;
3104 clock_sec_t sec;
3105 clock_nsec_t nsec;
3106
3107 clock_get_system_nanotime(&sec, &nsec);
3108 ts.tv_sec = (unsigned int) sec;
3109 ts.tv_nsec = nsec;
3110
3111 if (vm_page_speculative_count == 0) {
3112
3113 speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
3114 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
3115
3116 aq = &vm_page_queue_speculative[speculative_age_index];
3117
3118 /*
3119 * set the timer to begin a new group
3120 */
3121 aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000;
3122 aq->age_ts.tv_nsec = (vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
3123
3124 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
3125 } else {
3126 aq = &vm_page_queue_speculative[speculative_age_index];
3127
3128 if (CMP_MACH_TIMESPEC(&ts, &aq->age_ts) >= 0) {
3129
3130 speculative_age_index++;
3131
3132 if (speculative_age_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
3133 speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
3134 if (speculative_age_index == speculative_steal_index) {
3135 speculative_steal_index = speculative_age_index + 1;
3136
3137 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
3138 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
3139 }
3140 aq = &vm_page_queue_speculative[speculative_age_index];
3141
3142 if (!queue_empty(&aq->age_q))
3143 vm_page_speculate_ageit(aq);
3144
3145 aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000;
3146 aq->age_ts.tv_nsec = (vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
3147
3148 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
3149 }
3150 }
3151 enqueue_tail(&aq->age_q, &m->pageq);
3152 m->speculative = TRUE;
3153 vm_page_speculative_count++;
3154 if (m->object->internal) {
3155 vm_page_pageable_internal_count++;
3156 } else {
3157 vm_page_pageable_external_count++;
3158 }
3159
3160 if (new == TRUE) {
3161 vm_object_lock_assert_exclusive(m->object);
3162
3163 m->object->pages_created++;
3164 #if DEVELOPMENT || DEBUG
3165 vm_page_speculative_created++;
3166 #endif
3167 }
3168 }
3169 VM_PAGE_CHECK(m);
3170 }
3171
3172
3173 /*
3174 * move pages from the specified aging bin to
3175 * the speculative bin that pageout_scan claims from
3176 *
3177 * The page queues must be locked.
3178 */
3179 void
3180 vm_page_speculate_ageit(struct vm_speculative_age_q *aq)
3181 {
3182 struct vm_speculative_age_q *sq;
3183 vm_page_t t;
3184
3185 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3186
3187 if (queue_empty(&sq->age_q)) {
3188 sq->age_q.next = aq->age_q.next;
3189 sq->age_q.prev = aq->age_q.prev;
3190
3191 t = (vm_page_t)sq->age_q.next;
3192 t->pageq.prev = &sq->age_q;
3193
3194 t = (vm_page_t)sq->age_q.prev;
3195 t->pageq.next = &sq->age_q;
3196 } else {
3197 t = (vm_page_t)sq->age_q.prev;
3198 t->pageq.next = aq->age_q.next;
3199
3200 t = (vm_page_t)aq->age_q.next;
3201 t->pageq.prev = sq->age_q.prev;
3202
3203 t = (vm_page_t)aq->age_q.prev;
3204 t->pageq.next = &sq->age_q;
3205
3206 sq->age_q.prev = aq->age_q.prev;
3207 }
3208 queue_init(&aq->age_q);
3209 }
3210
3211
3212 void
3213 vm_page_lru(
3214 vm_page_t m)
3215 {
3216 VM_PAGE_CHECK(m);
3217 assert(m->object != kernel_object);
3218 assert(m->phys_page != vm_page_guard_addr);
3219
3220 #if DEBUG
3221 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
3222 #endif
3223 /*
3224 * if this page is currently on the pageout queue, we can't do the
3225 * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case)
3226 * and we can't remove it manually since we would need the object lock
3227 * (which is not required here) to decrement the activity_in_progress
3228 * reference which is held on the object while the page is in the pageout queue...
3229 * just let the normal laundry processing proceed
3230 */
3231 if (m->pageout_queue || m->private || m->compressor || (VM_PAGE_WIRED(m)))
3232 return;
3233
3234 m->no_cache = FALSE;
3235
3236 VM_PAGE_QUEUES_REMOVE(m);
3237
3238 VM_PAGE_ENQUEUE_INACTIVE(m, FALSE);
3239 }
3240
3241
3242 void
3243 vm_page_reactivate_all_throttled(void)
3244 {
3245 vm_page_t first_throttled, last_throttled;
3246 vm_page_t first_active;
3247 vm_page_t m;
3248 int extra_active_count;
3249 int extra_internal_count, extra_external_count;
3250
3251 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default))
3252 return;
3253
3254 extra_active_count = 0;
3255 extra_internal_count = 0;
3256 extra_external_count = 0;
3257 vm_page_lock_queues();
3258 if (! queue_empty(&vm_page_queue_throttled)) {
3259 /*
3260 * Switch "throttled" pages to "active".
3261 */
3262 queue_iterate(&vm_page_queue_throttled, m, vm_page_t, pageq) {
3263 VM_PAGE_CHECK(m);
3264 assert(m->throttled);
3265 assert(!m->active);
3266 assert(!m->inactive);
3267 assert(!m->speculative);
3268 assert(!VM_PAGE_WIRED(m));
3269
3270 extra_active_count++;
3271 if (m->object->internal) {
3272 extra_internal_count++;
3273 } else {
3274 extra_external_count++;
3275 }
3276
3277 m->throttled = FALSE;
3278 m->active = TRUE;
3279 VM_PAGE_CHECK(m);
3280 }
3281
3282 /*
3283 * Transfer the entire throttled queue to the regular LRU page queues.
3284 * We insert it at the head of the active queue, so that these pages
3285 * get re-evaluated by the LRU algorithm first, since they've been
3286 * completely out of it until now.
3287 */
3288 first_throttled = (vm_page_t) queue_first(&vm_page_queue_throttled);
3289 last_throttled = (vm_page_t) queue_last(&vm_page_queue_throttled);
3290 first_active = (vm_page_t) queue_first(&vm_page_queue_active);
3291 if (queue_empty(&vm_page_queue_active)) {
3292 queue_last(&vm_page_queue_active) = (queue_entry_t) last_throttled;
3293 } else {
3294 queue_prev(&first_active->pageq) = (queue_entry_t) last_throttled;
3295 }
3296 queue_first(&vm_page_queue_active) = (queue_entry_t) first_throttled;
3297 queue_prev(&first_throttled->pageq) = (queue_entry_t) &vm_page_queue_active;
3298 queue_next(&last_throttled->pageq) = (queue_entry_t) first_active;
3299
3300 #if DEBUG
3301 printf("reactivated %d throttled pages\n", vm_page_throttled_count);
3302 #endif
3303 queue_init(&vm_page_queue_throttled);
3304 /*
3305 * Adjust the global page counts.
3306 */
3307 vm_page_active_count += extra_active_count;
3308 vm_page_pageable_internal_count += extra_internal_count;
3309 vm_page_pageable_external_count += extra_external_count;
3310 vm_page_throttled_count = 0;
3311 }
3312 assert(vm_page_throttled_count == 0);
3313 assert(queue_empty(&vm_page_queue_throttled));
3314 vm_page_unlock_queues();
3315 }
3316
3317
3318 /*
3319 * move pages from the indicated local queue to the global active queue
3320 * it's ok to fail if we're below the hard limit and force == FALSE
3321 * the nolocks == TRUE case is to allow this function to be run on
3322 * the hibernate path
3323 */
3324
3325 void
3326 vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
3327 {
3328 struct vpl *lq;
3329 vm_page_t first_local, last_local;
3330 vm_page_t first_active;
3331 vm_page_t m;
3332 uint32_t count = 0;
3333
3334 if (vm_page_local_q == NULL)
3335 return;
3336
3337 lq = &vm_page_local_q[lid].vpl_un.vpl;
3338
3339 if (nolocks == FALSE) {
3340 if (lq->vpl_count < vm_page_local_q_hard_limit && force == FALSE) {
3341 if ( !vm_page_trylockspin_queues())
3342 return;
3343 } else
3344 vm_page_lockspin_queues();
3345
3346 VPL_LOCK(&lq->vpl_lock);
3347 }
3348 if (lq->vpl_count) {
3349 /*
3350 * Switch "local" pages to "active".
3351 */
3352 assert(!queue_empty(&lq->vpl_queue));
3353
3354 queue_iterate(&lq->vpl_queue, m, vm_page_t, pageq) {
3355 VM_PAGE_CHECK(m);
3356 assert(m->local);
3357 assert(!m->active);
3358 assert(!m->inactive);
3359 assert(!m->speculative);
3360 assert(!VM_PAGE_WIRED(m));
3361 assert(!m->throttled);
3362 assert(!m->fictitious);
3363
3364 if (m->local_id != lid)
3365 panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m);
3366
3367 m->local_id = 0;
3368 m->local = FALSE;
3369 m->active = TRUE;
3370 VM_PAGE_CHECK(m);
3371
3372 count++;
3373 }
3374 if (count != lq->vpl_count)
3375 panic("vm_page_reactivate_local: count = %d, vm_page_local_count = %d\n", count, lq->vpl_count);
3376
3377 /*
3378 * Transfer the entire local queue to the regular LRU page queues.
3379 */
3380 first_local = (vm_page_t) queue_first(&lq->vpl_queue);
3381 last_local = (vm_page_t) queue_last(&lq->vpl_queue);
3382 first_active = (vm_page_t) queue_first(&vm_page_queue_active);
3383
3384 if (queue_empty(&vm_page_queue_active)) {
3385 queue_last(&vm_page_queue_active) = (queue_entry_t) last_local;
3386 } else {
3387 queue_prev(&first_active->pageq) = (queue_entry_t) last_local;
3388 }
3389 queue_first(&vm_page_queue_active) = (queue_entry_t) first_local;
3390 queue_prev(&first_local->pageq) = (queue_entry_t) &vm_page_queue_active;
3391 queue_next(&last_local->pageq) = (queue_entry_t) first_active;
3392
3393 queue_init(&lq->vpl_queue);
3394 /*
3395 * Adjust the global page counts.
3396 */
3397 vm_page_active_count += lq->vpl_count;
3398 vm_page_pageable_internal_count += lq->vpl_internal_count;
3399 vm_page_pageable_external_count += lq->vpl_external_count;
3400 lq->vpl_count = 0;
3401 lq->vpl_internal_count = 0;
3402 lq->vpl_external_count = 0;
3403 }
3404 assert(queue_empty(&lq->vpl_queue));
3405
3406 if (nolocks == FALSE) {
3407 VPL_UNLOCK(&lq->vpl_lock);
3408 vm_page_unlock_queues();
3409 }
3410 }
3411
3412 /*
3413 * vm_page_part_zero_fill:
3414 *
3415 * Zero-fill a part of the page.
3416 */
3417 #define PMAP_ZERO_PART_PAGE_IMPLEMENTED
3418 void
3419 vm_page_part_zero_fill(
3420 vm_page_t m,
3421 vm_offset_t m_pa,
3422 vm_size_t len)
3423 {
3424
3425 #if 0
3426 /*
3427 * we don't hold the page queue lock
3428 * so this check isn't safe to make
3429 */
3430 VM_PAGE_CHECK(m);
3431 #endif
3432
3433 #ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED
3434 pmap_zero_part_page(m->phys_page, m_pa, len);
3435 #else
3436 vm_page_t tmp;
3437 while (1) {
3438 tmp = vm_page_grab();
3439 if (tmp == VM_PAGE_NULL) {
3440 vm_page_wait(THREAD_UNINT);
3441 continue;
3442 }
3443 break;
3444 }
3445 vm_page_zero_fill(tmp);
3446 if(m_pa != 0) {
3447 vm_page_part_copy(m, 0, tmp, 0, m_pa);
3448 }
3449 if((m_pa + len) < PAGE_SIZE) {
3450 vm_page_part_copy(m, m_pa + len, tmp,
3451 m_pa + len, PAGE_SIZE - (m_pa + len));
3452 }
3453 vm_page_copy(tmp,m);
3454 VM_PAGE_FREE(tmp);
3455 #endif
3456
3457 }
3458
3459 /*
3460 * vm_page_zero_fill:
3461 *
3462 * Zero-fill the specified page.
3463 */
3464 void
3465 vm_page_zero_fill(
3466 vm_page_t m)
3467 {
3468 XPR(XPR_VM_PAGE,
3469 "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n",
3470 m->object, m->offset, m, 0,0);
3471 #if 0
3472 /*
3473 * we don't hold the page queue lock
3474 * so this check isn't safe to make
3475 */
3476 VM_PAGE_CHECK(m);
3477 #endif
3478
3479 // dbgTrace(0xAEAEAEAE, m->phys_page, 0); /* (BRINGUP) */
3480 pmap_zero_page(m->phys_page);
3481 }
3482
3483 /*
3484 * vm_page_part_copy:
3485 *
3486 * copy part of one page to another
3487 */
3488
3489 void
3490 vm_page_part_copy(
3491 vm_page_t src_m,
3492 vm_offset_t src_pa,
3493 vm_page_t dst_m,
3494 vm_offset_t dst_pa,
3495 vm_size_t len)
3496 {
3497 #if 0
3498 /*
3499 * we don't hold the page queue lock
3500 * so this check isn't safe to make
3501 */
3502 VM_PAGE_CHECK(src_m);
3503 VM_PAGE_CHECK(dst_m);
3504 #endif
3505 pmap_copy_part_page(src_m->phys_page, src_pa,
3506 dst_m->phys_page, dst_pa, len);
3507 }
3508
3509 /*
3510 * vm_page_copy:
3511 *
3512 * Copy one page to another
3513 *
3514 * ENCRYPTED SWAP:
3515 * The source page should not be encrypted. The caller should
3516 * make sure the page is decrypted first, if necessary.
3517 */
3518
3519 int vm_page_copy_cs_validations = 0;
3520 int vm_page_copy_cs_tainted = 0;
3521
3522 void
3523 vm_page_copy(
3524 vm_page_t src_m,
3525 vm_page_t dest_m)
3526 {
3527 XPR(XPR_VM_PAGE,
3528 "vm_page_copy, object 0x%X offset 0x%X to object 0x%X offset 0x%X\n",
3529 src_m->object, src_m->offset,
3530 dest_m->object, dest_m->offset,
3531 0);
3532 #if 0
3533 /*
3534 * we don't hold the page queue lock
3535 * so this check isn't safe to make
3536 */
3537 VM_PAGE_CHECK(src_m);
3538 VM_PAGE_CHECK(dest_m);
3539 #endif
3540 vm_object_lock_assert_held(src_m->object);
3541
3542 /*
3543 * ENCRYPTED SWAP:
3544 * The source page should not be encrypted at this point.
3545 * The destination page will therefore not contain encrypted
3546 * data after the copy.
3547 */
3548 if (src_m->encrypted) {
3549 panic("vm_page_copy: source page %p is encrypted\n", src_m);
3550 }
3551 dest_m->encrypted = FALSE;
3552
3553 if (src_m->object != VM_OBJECT_NULL &&
3554 src_m->object->code_signed) {
3555 /*
3556 * We're copying a page from a code-signed object.
3557 * Whoever ends up mapping the copy page might care about
3558 * the original page's integrity, so let's validate the
3559 * source page now.
3560 */
3561 vm_page_copy_cs_validations++;
3562 vm_page_validate_cs(src_m);
3563 }
3564
3565 if (vm_page_is_slideable(src_m)) {
3566 boolean_t was_busy = src_m->busy;
3567 src_m->busy = TRUE;
3568 (void) vm_page_slide(src_m, 0);
3569 assert(src_m->busy);
3570 if (!was_busy) {
3571 PAGE_WAKEUP_DONE(src_m);
3572 }
3573 }
3574
3575 /*
3576 * Propagate the cs_tainted bit to the copy page. Do not propagate
3577 * the cs_validated bit.
3578 */
3579 dest_m->cs_tainted = src_m->cs_tainted;
3580 if (dest_m->cs_tainted) {
3581 vm_page_copy_cs_tainted++;
3582 }
3583 dest_m->slid = src_m->slid;
3584 dest_m->error = src_m->error; /* sliding src_m might have failed... */
3585 pmap_copy_page(src_m->phys_page, dest_m->phys_page);
3586 }
3587
3588 #if MACH_ASSERT
3589 static void
3590 _vm_page_print(
3591 vm_page_t p)
3592 {
3593 printf("vm_page %p: \n", p);
3594 printf(" pageq: next=%p prev=%p\n", p->pageq.next, p->pageq.prev);
3595 printf(" listq: next=%p prev=%p\n", p->listq.next, p->listq.prev);
3596 printf(" next=%p\n", p->next);
3597 printf(" object=%p offset=0x%llx\n", p->object, p->offset);
3598 printf(" wire_count=%u\n", p->wire_count);
3599
3600 printf(" %slocal, %sinactive, %sactive, %spageout_queue, %sspeculative, %slaundry\n",
3601 (p->local ? "" : "!"),
3602 (p->inactive ? "" : "!"),
3603 (p->active ? "" : "!"),
3604 (p->pageout_queue ? "" : "!"),
3605 (p->speculative ? "" : "!"),
3606 (p->laundry ? "" : "!"));
3607 printf(" %sfree, %sref, %sgobbled, %sprivate, %sthrottled\n",
3608 (p->free ? "" : "!"),
3609 (p->reference ? "" : "!"),
3610 (p->gobbled ? "" : "!"),
3611 (p->private ? "" : "!"),
3612 (p->throttled ? "" : "!"));
3613 printf(" %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n",
3614 (p->busy ? "" : "!"),
3615 (p->wanted ? "" : "!"),
3616 (p->tabled ? "" : "!"),
3617 (p->fictitious ? "" : "!"),
3618 (p->pmapped ? "" : "!"),
3619 (p->wpmapped ? "" : "!"));
3620 printf(" %spageout, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n",
3621 (p->pageout ? "" : "!"),
3622 (p->absent ? "" : "!"),
3623 (p->error ? "" : "!"),
3624 (p->dirty ? "" : "!"),
3625 (p->cleaning ? "" : "!"),
3626 (p->precious ? "" : "!"),
3627 (p->clustered ? "" : "!"));
3628 printf(" %soverwriting, %srestart, %sunusual, %sencrypted, %sencrypted_cleaning\n",
3629 (p->overwriting ? "" : "!"),
3630 (p->restart ? "" : "!"),
3631 (p->unusual ? "" : "!"),
3632 (p->encrypted ? "" : "!"),
3633 (p->encrypted_cleaning ? "" : "!"));
3634 printf(" %scs_validated, %scs_tainted, %sno_cache\n",
3635 (p->cs_validated ? "" : "!"),
3636 (p->cs_tainted ? "" : "!"),
3637 (p->no_cache ? "" : "!"));
3638
3639 printf("phys_page=0x%x\n", p->phys_page);
3640 }
3641
3642 /*
3643 * Check that the list of pages is ordered by
3644 * ascending physical address and has no holes.
3645 */
3646 static int
3647 vm_page_verify_contiguous(
3648 vm_page_t pages,
3649 unsigned int npages)
3650 {
3651 register vm_page_t m;
3652 unsigned int page_count;
3653 vm_offset_t prev_addr;
3654
3655 prev_addr = pages->phys_page;
3656 page_count = 1;
3657 for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) {
3658 if (m->phys_page != prev_addr + 1) {
3659 printf("m %p prev_addr 0x%lx, current addr 0x%x\n",
3660 m, (long)prev_addr, m->phys_page);
3661 printf("pages %p page_count %d npages %d\n", pages, page_count, npages);
3662 panic("vm_page_verify_contiguous: not contiguous!");
3663 }
3664 prev_addr = m->phys_page;
3665 ++page_count;
3666 }
3667 if (page_count != npages) {
3668 printf("pages %p actual count 0x%x but requested 0x%x\n",
3669 pages, page_count, npages);
3670 panic("vm_page_verify_contiguous: count error");
3671 }
3672 return 1;
3673 }
3674
3675
3676 /*
3677 * Check the free lists for proper length etc.
3678 */
3679 static unsigned int
3680 vm_page_verify_free_list(
3681 queue_head_t *vm_page_queue,
3682 unsigned int color,
3683 vm_page_t look_for_page,
3684 boolean_t expect_page)
3685 {
3686 unsigned int npages;
3687 vm_page_t m;
3688 vm_page_t prev_m;
3689 boolean_t found_page;
3690
3691 found_page = FALSE;
3692 npages = 0;
3693 prev_m = (vm_page_t) vm_page_queue;
3694 queue_iterate(vm_page_queue,
3695 m,
3696 vm_page_t,
3697 pageq) {
3698
3699 if (m == look_for_page) {
3700 found_page = TRUE;
3701 }
3702 if ((vm_page_t) m->pageq.prev != prev_m)
3703 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p\n",
3704 color, npages, m, m->pageq.prev, prev_m);
3705 if ( ! m->busy )
3706 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy\n",
3707 color, npages, m);
3708 if (color != (unsigned int) -1) {
3709 if ((m->phys_page & vm_color_mask) != color)
3710 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u\n",
3711 color, npages, m, m->phys_page & vm_color_mask, color);
3712 if ( ! m->free )
3713 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not free\n",
3714 color, npages, m);
3715 }
3716 ++npages;
3717 prev_m = m;
3718 }
3719 if (look_for_page != VM_PAGE_NULL) {
3720 unsigned int other_color;
3721
3722 if (expect_page && !found_page) {
3723 printf("vm_page_verify_free_list(color=%u, npages=%u): page %p not found phys=%u\n",
3724 color, npages, look_for_page, look_for_page->phys_page);
3725 _vm_page_print(look_for_page);
3726 for (other_color = 0;
3727 other_color < vm_colors;
3728 other_color++) {
3729 if (other_color == color)
3730 continue;
3731 vm_page_verify_free_list(&vm_page_queue_free[other_color],
3732 other_color, look_for_page, FALSE);
3733 }
3734 if (color == (unsigned int) -1) {
3735 vm_page_verify_free_list(&vm_lopage_queue_free,
3736 (unsigned int) -1, look_for_page, FALSE);
3737 }
3738 panic("vm_page_verify_free_list(color=%u)\n", color);
3739 }
3740 if (!expect_page && found_page) {
3741 printf("vm_page_verify_free_list(color=%u, npages=%u): page %p found phys=%u\n",
3742 color, npages, look_for_page, look_for_page->phys_page);
3743 }
3744 }
3745 return npages;
3746 }
3747
3748 static boolean_t vm_page_verify_free_lists_enabled = FALSE;
3749 static void
3750 vm_page_verify_free_lists( void )
3751 {
3752 unsigned int color, npages, nlopages;
3753
3754 if (! vm_page_verify_free_lists_enabled)
3755 return;
3756
3757 npages = 0;
3758
3759 lck_mtx_lock(&vm_page_queue_free_lock);
3760
3761 for( color = 0; color < vm_colors; color++ ) {
3762 npages += vm_page_verify_free_list(&vm_page_queue_free[color],
3763 color, VM_PAGE_NULL, FALSE);
3764 }
3765 nlopages = vm_page_verify_free_list(&vm_lopage_queue_free,
3766 (unsigned int) -1,
3767 VM_PAGE_NULL, FALSE);
3768 if (npages != vm_page_free_count || nlopages != vm_lopage_free_count)
3769 panic("vm_page_verify_free_lists: "
3770 "npages %u free_count %d nlopages %u lo_free_count %u",
3771 npages, vm_page_free_count, nlopages, vm_lopage_free_count);
3772
3773 lck_mtx_unlock(&vm_page_queue_free_lock);
3774 }
3775
3776 void
3777 vm_page_queues_assert(
3778 vm_page_t mem,
3779 int val)
3780 {
3781 #if DEBUG
3782 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
3783 #endif
3784 if (mem->free + mem->active + mem->inactive + mem->speculative +
3785 mem->throttled + mem->pageout_queue > (val)) {
3786 _vm_page_print(mem);
3787 panic("vm_page_queues_assert(%p, %d)\n", mem, val);
3788 }
3789 if (VM_PAGE_WIRED(mem)) {
3790 assert(!mem->active);
3791 assert(!mem->inactive);
3792 assert(!mem->speculative);
3793 assert(!mem->throttled);
3794 assert(!mem->pageout_queue);
3795 }
3796 }
3797 #endif /* MACH_ASSERT */
3798
3799
3800 /*
3801 * CONTIGUOUS PAGE ALLOCATION
3802 *
3803 * Find a region large enough to contain at least n pages
3804 * of contiguous physical memory.
3805 *
3806 * This is done by traversing the vm_page_t array in a linear fashion...
3807 * we assume that the vm_page_t array has the available physical pages in an
3808 * ordered, ascending list... this is currently true of all our implementations
3809 * and must remain so... there can be 'holes' in the array... we also can
3810 * no longer tolerate the vm_page_t's in the list being 'freed' and reclaimed
3811 * which used to happen via 'vm_page_convert'... that function was no longer
3812 * being called and was removed...
3813 *
3814 * The basic flow consists of stabilizing some of the interesting state of
3815 * a vm_page_t behind the vm_page_queue and vm_page_free locks... we start our
3816 * sweep at the beginning of the array looking for pages that meet our criteria
3817 * for a 'stealable' page... currently we are pretty conservative... if the page
3818 * meets these criteria and is physically contiguous to the previous page in the 'run'
3819 * we keep developing it. If we hit a page that doesn't fit, we reset our state
3820 * and start to develop a new run... if at this point we've already considered
3821 * at least MAX_CONSIDERED_BEFORE_YIELD pages, we'll drop the 2 locks we hold,
3822 * and mutex_pause (which will yield the processor), to keep the latency low w/r
3823 * to other threads trying to acquire free pages (or move pages from q to q),
3824 * and then continue from the spot we left off... we only make 1 pass through the
3825 * array. Once we have a 'run' that is long enough, we'll go into the loop which
3826 * which steals the pages from the queues they're currently on... pages on the free
3827 * queue can be stolen directly... pages that are on any of the other queues
3828 * must be removed from the object they are tabled on... this requires taking the
3829 * object lock... we do this as a 'try' to prevent deadlocks... if the 'try' fails
3830 * or if the state of the page behind the vm_object lock is no longer viable, we'll
3831 * dump the pages we've currently stolen back to the free list, and pick up our
3832 * scan from the point where we aborted the 'current' run.
3833 *
3834 *
3835 * Requirements:
3836 * - neither vm_page_queue nor vm_free_list lock can be held on entry
3837 *
3838 * Returns a pointer to a list of gobbled/wired pages or VM_PAGE_NULL.
3839 *
3840 * Algorithm: described in the flow above.
3841 */
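
/*
 * Editor's note: the sketch below is an illustrative, hypothetical rendering
 * (not part of the original source or the kernel build) of the run-building
 * rule described above: a run may only start on a page that satisfies the
 * alignment mask, and it only grows while each page is physically adjacent
 * to the previous one.  Locking, yielding and the page-stealing pass are
 * intentionally omitted; see vm_page_find_contiguous() below for the real
 * thing.
 */
#if 0	/* illustrative sketch only */
static unsigned int
find_contiguous_run(const unsigned int *phys, unsigned int count,
		    unsigned int contig_pages, unsigned int pnum_mask,
		    unsigned int *run_start)
{
	unsigned int	idx;
	unsigned int	npages = 0;
	unsigned int	prevcontaddr = (unsigned int) -2;

	for (idx = 0; idx < count && npages < contig_pages; idx++) {
		if (phys[idx] != prevcontaddr + 1) {
			/* not adjacent to the previous page... a new run can
			 * only start on a properly aligned page */
			if ((phys[idx] & pnum_mask) != 0) {
				npages = 0;
			} else {
				npages = 1;
				*run_start = idx;
			}
		} else {
			npages++;
		}
		prevcontaddr = phys[idx];
	}
	return (npages);	/* == contig_pages on success */
}
#endif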
3842
3843 #define MAX_CONSIDERED_BEFORE_YIELD 1000
3844
3845
3846 #define RESET_STATE_OF_RUN() \
3847 MACRO_BEGIN \
3848 prevcontaddr = -2; \
3849 start_pnum = -1; \
3850 free_considered = 0; \
3851 substitute_needed = 0; \
3852 npages = 0; \
3853 MACRO_END
3854
3855 /*
3856 * Can we steal in-use (i.e. not free) pages when searching for
3857 * physically-contiguous pages ?
3858 */
3859 #define VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL 1
3860
3861 static unsigned int vm_page_find_contiguous_last_idx = 0, vm_page_lomem_find_contiguous_last_idx = 0;
3862 #if DEBUG
3863 int vm_page_find_contig_debug = 0;
3864 #endif
3865
3866 static vm_page_t
3867 vm_page_find_contiguous(
3868 unsigned int contig_pages,
3869 ppnum_t max_pnum,
3870 ppnum_t pnum_mask,
3871 boolean_t wire,
3872 int flags)
3873 {
3874 vm_page_t m = NULL;
3875 ppnum_t prevcontaddr;
3876 ppnum_t start_pnum;
3877 unsigned int npages, considered, scanned;
3878 unsigned int page_idx, start_idx, last_idx, orig_last_idx;
3879 unsigned int idx_last_contig_page_found = 0;
3880 int free_considered, free_available;
3881 int substitute_needed;
3882 boolean_t wrapped;
3883 #if DEBUG
3884 clock_sec_t tv_start_sec, tv_end_sec;
3885 clock_usec_t tv_start_usec, tv_end_usec;
3886 #endif
3887 #if MACH_ASSERT
3888 int yielded = 0;
3889 int dumped_run = 0;
3890 int stolen_pages = 0;
3891 int compressed_pages = 0;
3892 #endif
3893
3894 if (contig_pages == 0)
3895 return VM_PAGE_NULL;
3896
3897 #if MACH_ASSERT
3898 vm_page_verify_free_lists();
3899 #endif
3900 #if DEBUG
3901 clock_get_system_microtime(&tv_start_sec, &tv_start_usec);
3902 #endif
3903 PAGE_REPLACEMENT_ALLOWED(TRUE);
3904
3905 vm_page_lock_queues();
3906 lck_mtx_lock(&vm_page_queue_free_lock);
3907
3908 RESET_STATE_OF_RUN();
3909
3910 scanned = 0;
3911 considered = 0;
3912 free_available = vm_page_free_count - vm_page_free_reserved;
3913
3914 wrapped = FALSE;
3915
3916 if(flags & KMA_LOMEM)
3917 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx;
3918 else
3919 idx_last_contig_page_found = vm_page_find_contiguous_last_idx;
3920
3921 orig_last_idx = idx_last_contig_page_found;
3922 last_idx = orig_last_idx;
3923
3924 for (page_idx = last_idx, start_idx = last_idx;
3925 npages < contig_pages && page_idx < vm_pages_count;
3926 page_idx++) {
3927 retry:
3928 if (wrapped &&
3929 npages == 0 &&
3930 page_idx >= orig_last_idx) {
3931 /*
3932 * We're back where we started and we haven't
3933 * found any suitable contiguous range. Let's
3934 * give up.
3935 */
3936 break;
3937 }
3938 scanned++;
3939 m = &vm_pages[page_idx];
3940
3941 assert(!m->fictitious);
3942 assert(!m->private);
3943
3944 if (max_pnum && m->phys_page > max_pnum) {
3945 /* no more low pages... */
3946 break;
3947 }
3948 if (!npages && ((m->phys_page & pnum_mask) != 0)) {
3949 /*
3950 * not aligned
3951 */
3952 RESET_STATE_OF_RUN();
3953
3954 } else if (VM_PAGE_WIRED(m) || m->gobbled ||
3955 m->encrypted_cleaning ||
3956 m->pageout_queue || m->laundry || m->wanted ||
3957 m->cleaning || m->overwriting || m->pageout) {
3958 /*
3959 * page is in a transient state
3960 * or a state we don't want to deal
3961 * with, so don't consider it which
3962 * means starting a new run
3963 */
3964 RESET_STATE_OF_RUN();
3965
3966 } else if (!m->free && !m->active && !m->inactive && !m->speculative && !m->throttled && !m->compressor) {
3967 /*
3968 * page needs to be on one of our queues
3969 * or it needs to belong to the compressor pool
3970 * in order for it to be stable behind the
3971 * locks we hold at this point...
3972 * if not, don't consider it which
3973 * means starting a new run
3974 */
3975 RESET_STATE_OF_RUN();
3976
3977 } else if (!m->free && (!m->tabled || m->busy)) {
3978 /*
3979 * pages on the free list are always 'busy'
3980 * so we couldn't test for 'busy' in the check
3981 * for the transient states... pages that are
3982 * 'free' are never 'tabled', so we also couldn't
3983 * test for 'tabled'. So we check here to make
3984 * sure that a non-free page is not busy and is
3985 * tabled on an object...
3986 * if not, don't consider it which
3987 * means starting a new run
3988 */
3989 RESET_STATE_OF_RUN();
3990
3991 } else {
3992 if (m->phys_page != prevcontaddr + 1) {
3993 if ((m->phys_page & pnum_mask) != 0) {
3994 RESET_STATE_OF_RUN();
3995 goto did_consider;
3996 } else {
3997 npages = 1;
3998 start_idx = page_idx;
3999 start_pnum = m->phys_page;
4000 }
4001 } else {
4002 npages++;
4003 }
4004 prevcontaddr = m->phys_page;
4005
4006 VM_PAGE_CHECK(m);
4007 if (m->free) {
4008 free_considered++;
4009 } else {
4010 /*
4011 * This page is not free.
4012 * If we can't steal used pages,
4013 * we have to give up this run
4014 * and keep looking.
4015 * Otherwise, we might need to
4016 * move the contents of this page
4017 * into a substitute page.
4018 */
4019 #if VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
4020 if (m->pmapped || m->dirty || m->precious) {
4021 substitute_needed++;
4022 }
4023 #else
4024 RESET_STATE_OF_RUN();
4025 #endif
4026 }
4027
4028 if ((free_considered + substitute_needed) > free_available) {
4029 /*
4030 * if we let this run continue
4031 * we will end up dropping the vm_page_free_count
4032 * below the reserve limit... we need to abort
4033 * this run, but we can at least re-consider this
4034 * page... thus the jump back to 'retry'
4035 */
4036 RESET_STATE_OF_RUN();
4037
4038 if (free_available && considered <= MAX_CONSIDERED_BEFORE_YIELD) {
4039 considered++;
4040 goto retry;
4041 }
4042 /*
4043 * free_available == 0
4044 * so can't consider any free pages... if
4045 * we went to retry in this case, we'd
4046 * get stuck looking at the same page
4047 * w/o making any forward progress...
4048 * we also want to take this path if we've already
4049 * reached our limit that controls the lock latency
4050 */
4051 }
4052 }
4053 did_consider:
4054 if (considered > MAX_CONSIDERED_BEFORE_YIELD && npages <= 1) {
4055
4056 PAGE_REPLACEMENT_ALLOWED(FALSE);
4057
4058 lck_mtx_unlock(&vm_page_queue_free_lock);
4059 vm_page_unlock_queues();
4060
4061 mutex_pause(0);
4062
4063 PAGE_REPLACEMENT_ALLOWED(TRUE);
4064
4065 vm_page_lock_queues();
4066 lck_mtx_lock(&vm_page_queue_free_lock);
4067
4068 RESET_STATE_OF_RUN();
4069 /*
4070 * reset our free page limit since we
4071 * dropped the lock protecting the vm_page_free_queue
4072 */
4073 free_available = vm_page_free_count - vm_page_free_reserved;
4074 considered = 0;
4075 #if MACH_ASSERT
4076 yielded++;
4077 #endif
4078 goto retry;
4079 }
4080 considered++;
4081 }
4082 m = VM_PAGE_NULL;
4083
4084 if (npages != contig_pages) {
4085 if (!wrapped) {
4086 /*
4087 * We didn't find a contiguous range but we didn't
4088 * start from the very first page.
4089 * Start again from the very first page.
4090 */
4091 RESET_STATE_OF_RUN();
4092 if( flags & KMA_LOMEM)
4093 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx = 0;
4094 else
4095 idx_last_contig_page_found = vm_page_find_contiguous_last_idx = 0;
4096 last_idx = 0;
4097 page_idx = last_idx;
4098 wrapped = TRUE;
4099 goto retry;
4100 }
4101 lck_mtx_unlock(&vm_page_queue_free_lock);
4102 } else {
4103 vm_page_t m1;
4104 vm_page_t m2;
4105 unsigned int cur_idx;
4106 unsigned int tmp_start_idx;
4107 vm_object_t locked_object = VM_OBJECT_NULL;
4108 boolean_t abort_run = FALSE;
4109
4110 assert(page_idx - start_idx == contig_pages);
4111
4112 tmp_start_idx = start_idx;
4113
4114 /*
4115 * first pass through to pull the free pages
4116 * off of the free queue so that in case we
4117 * need substitute pages, we won't grab any
4118 * of the free pages in the run... we'll clear
4119 * the 'free' bit in the 2nd pass, and even in
4120 * an abort_run case, we'll collect all of the
4121 * free pages in this run and return them to the free list
4122 */
4123 while (start_idx < page_idx) {
4124
4125 m1 = &vm_pages[start_idx++];
4126
4127 #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
4128 assert(m1->free);
4129 #endif
4130
4131 if (m1->free) {
4132 unsigned int color;
4133
4134 color = m1->phys_page & vm_color_mask;
4135 #if MACH_ASSERT
4136 vm_page_verify_free_list(&vm_page_queue_free[color], color, m1, TRUE);
4137 #endif
4138 queue_remove(&vm_page_queue_free[color],
4139 m1,
4140 vm_page_t,
4141 pageq);
4142 m1->pageq.next = NULL;
4143 m1->pageq.prev = NULL;
4144 #if MACH_ASSERT
4145 vm_page_verify_free_list(&vm_page_queue_free[color], color, VM_PAGE_NULL, FALSE);
4146 #endif
4147 /*
4148 * Clear the "free" bit so that this page
4149 * does not get considered for another
4150 * concurrent physically-contiguous allocation.
4151 */
4152 m1->free = FALSE;
4153 assert(m1->busy);
4154
4155 vm_page_free_count--;
4156 }
4157 }
4158 /*
4159 * adjust global freelist counts
4160 */
4161 if (vm_page_free_count < vm_page_free_count_minimum)
4162 vm_page_free_count_minimum = vm_page_free_count;
4163
4164 if( flags & KMA_LOMEM)
4165 vm_page_lomem_find_contiguous_last_idx = page_idx;
4166 else
4167 vm_page_find_contiguous_last_idx = page_idx;
4168
4169 /*
4170 * we can drop the free queue lock at this point since
4171 * we've pulled any 'free' candidates off of the list
4172 * we need it dropped so that we can do a vm_page_grab
4173 * when substituting for pmapped/dirty pages
4174 */
4175 lck_mtx_unlock(&vm_page_queue_free_lock);
4176
4177 start_idx = tmp_start_idx;
4178 cur_idx = page_idx - 1;
4179
4180 while (start_idx++ < page_idx) {
4181 /*
4182 * must go through the list from back to front
4183 * so that the page list is created in the
4184 * correct order - low -> high phys addresses
4185 */
4186 m1 = &vm_pages[cur_idx--];
4187
4188 assert(!m1->free);
4189
4190 if (m1->object == VM_OBJECT_NULL) {
4191 /*
4192 * page has already been removed from
4193 * the free list in the 1st pass
4194 */
4195 assert(m1->offset == (vm_object_offset_t) -1);
4196 assert(m1->busy);
4197 assert(!m1->wanted);
4198 assert(!m1->laundry);
4199 } else {
4200 vm_object_t object;
4201 int refmod;
4202 boolean_t disconnected, reusable;
4203
4204 if (abort_run == TRUE)
4205 continue;
4206
4207 object = m1->object;
4208
4209 if (object != locked_object) {
4210 if (locked_object) {
4211 vm_object_unlock(locked_object);
4212 locked_object = VM_OBJECT_NULL;
4213 }
4214 if (vm_object_lock_try(object))
4215 locked_object = object;
4216 }
4217 if (locked_object == VM_OBJECT_NULL ||
4218 (VM_PAGE_WIRED(m1) || m1->gobbled ||
4219 m1->encrypted_cleaning ||
4220 m1->pageout_queue || m1->laundry || m1->wanted ||
4221 m1->cleaning || m1->overwriting || m1->pageout || m1->busy)) {
4222
4223 if (locked_object) {
4224 vm_object_unlock(locked_object);
4225 locked_object = VM_OBJECT_NULL;
4226 }
4227 tmp_start_idx = cur_idx;
4228 abort_run = TRUE;
4229 continue;
4230 }
4231
4232 disconnected = FALSE;
4233 reusable = FALSE;
4234
4235 if ((m1->reusable ||
4236 m1->object->all_reusable) &&
4237 m1->inactive &&
4238 !m1->dirty &&
4239 !m1->reference) {
4240 /* reusable page... */
4241 refmod = pmap_disconnect(m1->phys_page);
4242 disconnected = TRUE;
4243 if (refmod == 0) {
4244 /*
4245 * ... not reused: can steal
4246 * without relocating contents.
4247 */
4248 reusable = TRUE;
4249 }
4250 }
4251
4252 if ((m1->pmapped &&
4253 ! reusable) ||
4254 m1->dirty ||
4255 m1->precious) {
4256 vm_object_offset_t offset;
4257
4258 m2 = vm_page_grab();
4259
4260 if (m2 == VM_PAGE_NULL) {
4261 if (locked_object) {
4262 vm_object_unlock(locked_object);
4263 locked_object = VM_OBJECT_NULL;
4264 }
4265 tmp_start_idx = cur_idx;
4266 abort_run = TRUE;
4267 continue;
4268 }
4269 if (! disconnected) {
4270 if (m1->pmapped)
4271 refmod = pmap_disconnect(m1->phys_page);
4272 else
4273 refmod = 0;
4274 }
4275
4276 /* copy the page's contents */
4277 pmap_copy_page(m1->phys_page, m2->phys_page);
4278 /* copy the page's state */
4279 assert(!VM_PAGE_WIRED(m1));
4280 assert(!m1->free);
4281 assert(!m1->pageout_queue);
4282 assert(!m1->laundry);
4283 m2->reference = m1->reference;
4284 assert(!m1->gobbled);
4285 assert(!m1->private);
4286 m2->no_cache = m1->no_cache;
4287 m2->xpmapped = m1->xpmapped;
4288 assert(!m1->busy);
4289 assert(!m1->wanted);
4290 assert(!m1->fictitious);
4291 m2->pmapped = m1->pmapped; /* should flush cache ? */
4292 m2->wpmapped = m1->wpmapped;
4293 assert(!m1->pageout);
4294 m2->absent = m1->absent;
4295 m2->error = m1->error;
4296 m2->dirty = m1->dirty;
4297 assert(!m1->cleaning);
4298 m2->precious = m1->precious;
4299 m2->clustered = m1->clustered;
4300 assert(!m1->overwriting);
4301 m2->restart = m1->restart;
4302 m2->unusual = m1->unusual;
4303 m2->encrypted = m1->encrypted;
4304 assert(!m1->encrypted_cleaning);
4305 m2->cs_validated = m1->cs_validated;
4306 m2->cs_tainted = m1->cs_tainted;
4307
4308 /*
4309 * If m1 had really been reusable,
4310 * we would have just stolen it, so
4311 * let's not propagate its "reusable"
4312 * bit and assert that m2 is not
4313 * marked as "reusable".
4314 */
4315 // m2->reusable = m1->reusable;
4316 assert(!m2->reusable);
4317
4318 assert(!m1->lopage);
4319 m2->slid = m1->slid;
4320 m2->was_dirty = m1->was_dirty;
4321 m2->compressor = m1->compressor;
4322
4323 /*
4324 * page may need to be flushed if
4325 * it is marshalled into a UPL
4326 * that is going to be used by a device
4327 * that doesn't support coherency
4328 */
4329 m2->written_by_kernel = TRUE;
4330
4331 /*
4332 * make sure we clear the ref/mod state
4333 * from the pmap layer... else we risk
4334 * inheriting state from the last time
4335 * this page was used...
4336 */
4337 pmap_clear_refmod(m2->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
4338
4339 if (refmod & VM_MEM_REFERENCED)
4340 m2->reference = TRUE;
4341 if (refmod & VM_MEM_MODIFIED) {
4342 SET_PAGE_DIRTY(m2, TRUE);
4343 }
4344 offset = m1->offset;
4345
4346 /*
4347 * completely cleans up the state
4348 * of the page so that it is ready
4349 * to be put onto the free list... for
4350 * our purposes it now looks as if it
4351 * just came off of the free list
4352 */
4353 vm_page_free_prepare(m1);
4354
4355 /*
4356 * now put the substitute page
4357 * on the object
4358 */
4359 vm_page_insert_internal(m2, locked_object, offset, TRUE, TRUE, FALSE);
4360
4361 if (m2->compressor) {
4362 m2->pmapped = TRUE;
4363 m2->wpmapped = TRUE;
4364
4365 PMAP_ENTER(kernel_pmap, m2->offset, m2,
4366 VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE);
4367 #if MACH_ASSERT
4368 compressed_pages++;
4369 #endif
4370 } else {
4371 if (m2->reference)
4372 vm_page_activate(m2);
4373 else
4374 vm_page_deactivate(m2);
4375 }
4376 PAGE_WAKEUP_DONE(m2);
4377
4378 } else {
4379 assert(!m1->compressor);
4380
4381 /*
4382 * completely cleans up the state
4383 * of the page so that it is ready
4384 * to be put onto the free list... for
4385 * our purposes it now looks as if it
4386 * just came off of the free list
4387 */
4388 vm_page_free_prepare(m1);
4389 }
4390 #if MACH_ASSERT
4391 stolen_pages++;
4392 #endif
4393 }
4394 m1->pageq.next = (queue_entry_t) m;
4395 m1->pageq.prev = NULL;
4396 m = m1;
4397 }
4398 if (locked_object) {
4399 vm_object_unlock(locked_object);
4400 locked_object = VM_OBJECT_NULL;
4401 }
4402
4403 if (abort_run == TRUE) {
4404 if (m != VM_PAGE_NULL) {
4405 vm_page_free_list(m, FALSE);
4406 }
4407 #if MACH_ASSERT
4408 dumped_run++;
4409 #endif
4410 /*
4411 * tmp_start_idx points one below the page
4412 * that caused the abort (because of the
4413 * auto-decrement on use), so add 1 to get
4414 * back to that page and 1 more to step
4415 * past it before resuming the scan
4416 */
4417 page_idx = tmp_start_idx + 2;
4418 if (page_idx >= vm_pages_count) {
4419 if (wrapped)
4420 goto done_scanning;
4421 page_idx = last_idx = 0;
4422 wrapped = TRUE;
4423 }
4424 abort_run = FALSE;
4425
4426 /*
4427 * Reset the scan state and resume just past the
4428 * page that caused the abort (or from the very
4429 * first page if we just wrapped around).
4430 */
4431 RESET_STATE_OF_RUN();
4432
4433 if( flags & KMA_LOMEM)
4434 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx = page_idx;
4435 else
4436 idx_last_contig_page_found = vm_page_find_contiguous_last_idx = page_idx;
4437
4438 last_idx = page_idx;
4439
4440 lck_mtx_lock(&vm_page_queue_free_lock);
4441 /*
4442 * reset our free page limit since we
4443 * dropped the lock protecting the vm_page_free_queue
4444 */
4445 free_available = vm_page_free_count - vm_page_free_reserved;
4446 goto retry;
4447 }
4448
4449 for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) {
4450
4451 if (wire == TRUE)
4452 m1->wire_count++;
4453 else
4454 m1->gobbled = TRUE;
4455 }
4456 if (wire == FALSE)
4457 vm_page_gobble_count += npages;
4458
4459 /*
4460 * gobbled pages are also counted as wired pages
4461 */
4462 vm_page_wire_count += npages;
4463
4464 assert(vm_page_verify_contiguous(m, npages));
4465 }
4466 done_scanning:
4467 PAGE_REPLACEMENT_ALLOWED(FALSE);
4468
4469 vm_page_unlock_queues();
4470
4471 #if DEBUG
4472 clock_get_system_microtime(&tv_end_sec, &tv_end_usec);
4473
4474 tv_end_sec -= tv_start_sec;
4475 if (tv_end_usec < tv_start_usec) {
4476 tv_end_sec--;
4477 tv_end_usec += 1000000;
4478 }
4479 tv_end_usec -= tv_start_usec;
4480 if (tv_end_usec >= 1000000) {
4481 tv_end_sec++;
4482 tv_end_usec -= 1000000;
4483 }
4484 if (vm_page_find_contig_debug) {
4485 printf("%s(num=%d,low=%d): found %d pages at 0x%llx in %ld.%06ds... started at %d... scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages\n",
4486 __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
4487 (long)tv_end_sec, tv_end_usec, orig_last_idx,
4488 scanned, yielded, dumped_run, stolen_pages, compressed_pages);
4489 }
4490
4491 #endif
4492 #if MACH_ASSERT
4493 vm_page_verify_free_lists();
4494 #endif
4495 return m;
4496 }
4497
4498 /*
4499 * Allocate a list of contiguous, wired pages.
4500 */
4501 kern_return_t
4502 cpm_allocate(
4503 vm_size_t size,
4504 vm_page_t *list,
4505 ppnum_t max_pnum,
4506 ppnum_t pnum_mask,
4507 boolean_t wire,
4508 int flags)
4509 {
4510 vm_page_t pages;
4511 unsigned int npages;
4512
4513 if (size % PAGE_SIZE != 0)
4514 return KERN_INVALID_ARGUMENT;
4515
4516 npages = (unsigned int) (size / PAGE_SIZE);
4517 if (npages != size / PAGE_SIZE) {
4518 /* 32-bit overflow */
4519 return KERN_INVALID_ARGUMENT;
4520 }
4521
4522 /*
4523 * Obtain a pointer to a subset of the free
4524 * list large enough to satisfy the request;
4525 * the region will be physically contiguous.
4526 */
4527 pages = vm_page_find_contiguous(npages, max_pnum, pnum_mask, wire, flags);
4528
4529 if (pages == VM_PAGE_NULL)
4530 return KERN_NO_SPACE;
4531 /*
4532 * determine need for wakeups
4533 */
4534 if ((vm_page_free_count < vm_page_free_min) ||
4535 ((vm_page_free_count < vm_page_free_target) &&
4536 ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min)))
4537 thread_wakeup((event_t) &vm_page_free_wanted);
4538
4539 VM_CHECK_MEMORYSTATUS;
4540
4541 /*
4542 * The CPM pages should now be available and
4543 * ordered by ascending physical address.
4544 */
4545 assert(vm_page_verify_contiguous(pages, npages));
4546
4547 *list = pages;
4548 return KERN_SUCCESS;
4549 }
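
/*
 * Editor's note: hypothetical usage sketch (not from the original source)
 * showing how a caller might obtain a physically contiguous, wired buffer
 * via cpm_allocate() above.  The function name and the choice of arguments
 * (no physical ceiling, no alignment mask) are assumptions for illustration
 * only.
 */
#if 0	/* illustrative sketch only */
static kern_return_t
example_grab_contiguous_buffer(vm_size_t size, ppnum_t *first_ppnum)
{
	vm_page_t	pages;
	kern_return_t	kr;

	kr = cpm_allocate(size, &pages, 0 /* max_pnum */, 0 /* pnum_mask */,
			  TRUE /* wire */, 0 /* flags */);
	if (kr != KERN_SUCCESS)
		return (kr);

	/* the returned list is ordered by ascending physical address */
	*first_ppnum = pages->phys_page;

	return (KERN_SUCCESS);
}
#endif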
4550
4551
4552 unsigned int vm_max_delayed_work_limit = DEFAULT_DELAYED_WORK_LIMIT;
4553
4554 /*
4555 * when working on a 'run' of pages, it is necessary to hold
4556 * the vm_page_queue_lock (a hot global lock) for certain operations
4557 * on the page... however, the majority of the work can be done
4558 * while merely holding the object lock... in fact there are certain
4559 * collections of pages that don't require any work brokered by the
4560 * vm_page_queue_lock... to mitigate the time spent behind the global
4561 * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
4562 * while doing all of the work that doesn't require the vm_page_queue_lock...
4563 * then call vm_page_do_delayed_work to acquire the vm_page_queue_lock and do the
4564 * necessary work for each page... we will grab the busy bit on the page
4565 * if it's not already held so that vm_page_do_delayed_work can drop the object lock
4566 * if it can't immediately take the vm_page_queue_lock in order to compete
4567 * for the locks in the same order that vm_pageout_scan takes them.
4568 * the operation names are modeled after the names of the routines that
4569 * need to be called in order to make the changes very obvious in the
4570 * original loop
4571 */
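
/*
 * Editor's note: hypothetical sketch (not from the original source) of the
 * 2 pass pattern described above.  A caller batches per-page work into an
 * array of struct vm_page_delayed_work while holding only the object lock,
 * then hands the whole batch to vm_page_do_delayed_work() so the page queue
 * lock is taken once per batch instead of once per page.  The helper name,
 * the batch size and the DW_vm_page_activate choice are assumptions for
 * illustration.
 */
#if 0	/* illustrative sketch only */
static void
example_activate_object_pages(vm_object_t object, vm_page_t *page_array, int count)
{
	struct vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
	struct vm_page_delayed_work	*dwp;
	int				dw_count;
	int				i;

	dwp = &dw_array[0];
	dw_count = 0;

	vm_object_lock(object);

	for (i = 0; i < count; i++) {
		/* pass 1: record the work each page needs */
		dwp->dw_m = page_array[i];
		dwp->dw_mask = DW_vm_page_activate;
		dwp++;
		dw_count++;

		if (dw_count >= DEFAULT_DELAYED_WORK_LIMIT) {
			/* pass 2: do the queue-lock work for the batch */
			vm_page_do_delayed_work(object, &dw_array[0], dw_count);
			dwp = &dw_array[0];
			dw_count = 0;
		}
	}
	if (dw_count)
		vm_page_do_delayed_work(object, &dw_array[0], dw_count);

	vm_object_unlock(object);
}
#endif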
4572
4573 void
4574 vm_page_do_delayed_work(
4575 vm_object_t object,
4576 struct vm_page_delayed_work *dwp,
4577 int dw_count)
4578 {
4579 int j;
4580 vm_page_t m;
4581 vm_page_t local_free_q = VM_PAGE_NULL;
4582
4583 /*
4584 * pageout_scan takes the vm_page_lock_queues first
4585 * then tries for the object lock... to avoid what
4586 * is effectively a lock inversion, we'll go to the
4587 * trouble of taking them in that same order... otherwise
4588 * if this object contains the majority of the pages resident
4589 * in the UBC (or a small set of large objects actively being
4590 * worked on contain the majority of the pages), we could
4591 * cause the pageout_scan thread to 'starve' in its attempt
4592 * to find pages to move to the free queue, since it has to
4593 * successfully acquire the object lock of any candidate page
4594 * before it can steal/clean it.
4595 */
4596 if (!vm_page_trylockspin_queues()) {
4597 vm_object_unlock(object);
4598
4599 vm_page_lockspin_queues();
4600
4601 for (j = 0; ; j++) {
4602 if (!vm_object_lock_avoid(object) &&
4603 _vm_object_lock_try(object))
4604 break;
4605 vm_page_unlock_queues();
4606 mutex_pause(j);
4607 vm_page_lockspin_queues();
4608 }
4609 }
4610 for (j = 0; j < dw_count; j++, dwp++) {
4611
4612 m = dwp->dw_m;
4613
4614 if (dwp->dw_mask & DW_vm_pageout_throttle_up)
4615 vm_pageout_throttle_up(m);
4616
4617 if (dwp->dw_mask & DW_vm_page_wire)
4618 vm_page_wire(m);
4619 else if (dwp->dw_mask & DW_vm_page_unwire) {
4620 boolean_t queueit;
4621
4622 queueit = (dwp->dw_mask & DW_vm_page_free) ? FALSE : TRUE;
4623
4624 vm_page_unwire(m, queueit);
4625 }
4626 if (dwp->dw_mask & DW_vm_page_free) {
4627 vm_page_free_prepare_queues(m);
4628
4629 assert(m->pageq.next == NULL && m->pageq.prev == NULL);
4630 /*
4631 * Add this page to our list of reclaimed pages,
4632 * to be freed later.
4633 */
4634 m->pageq.next = (queue_entry_t) local_free_q;
4635 local_free_q = m;
4636 } else {
4637 if (dwp->dw_mask & DW_vm_page_deactivate_internal)
4638 vm_page_deactivate_internal(m, FALSE);
4639 else if (dwp->dw_mask & DW_vm_page_activate) {
4640 if (m->active == FALSE) {
4641 vm_page_activate(m);
4642 }
4643 }
4644 else if (dwp->dw_mask & DW_vm_page_speculate)
4645 vm_page_speculate(m, TRUE);
4646 else if (dwp->dw_mask & DW_enqueue_cleaned) {
4647 /*
4648 * if we didn't hold the object lock and did this,
4649 * we might disconnect the page, then someone might
4650 * soft fault it back in, then we would put it on the
4651 * cleaned queue, and so we would have a referenced (maybe even dirty)
4652 * page on that queue, which we don't want
4653 */
4654 int refmod_state = pmap_disconnect(m->phys_page);
4655
4656 if ((refmod_state & VM_MEM_REFERENCED)) {
4657 /*
4658 * this page has been touched since it got cleaned; let's activate it
4659 * if it hasn't already been
4660 */
4661 vm_pageout_enqueued_cleaned++;
4662 vm_pageout_cleaned_reactivated++;
4663 vm_pageout_cleaned_commit_reactivated++;
4664
4665 if (m->active == FALSE)
4666 vm_page_activate(m);
4667 } else {
4668 m->reference = FALSE;
4669 vm_page_enqueue_cleaned(m);
4670 }
4671 }
4672 else if (dwp->dw_mask & DW_vm_page_lru)
4673 vm_page_lru(m);
4674 else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) {
4675 if ( !m->pageout_queue)
4676 VM_PAGE_QUEUES_REMOVE(m);
4677 }
4678 if (dwp->dw_mask & DW_set_reference)
4679 m->reference = TRUE;
4680 else if (dwp->dw_mask & DW_clear_reference)
4681 m->reference = FALSE;
4682
4683 if (dwp->dw_mask & DW_move_page) {
4684 if ( !m->pageout_queue) {
4685 VM_PAGE_QUEUES_REMOVE(m);
4686
4687 assert(m->object != kernel_object);
4688
4689 VM_PAGE_ENQUEUE_INACTIVE(m, FALSE);
4690 }
4691 }
4692 if (dwp->dw_mask & DW_clear_busy)
4693 m->busy = FALSE;
4694
4695 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4696 PAGE_WAKEUP(m);
4697 }
4698 }
4699 vm_page_unlock_queues();
4700
4701 if (local_free_q)
4702 vm_page_free_list(local_free_q, TRUE);
4703
4704 VM_CHECK_MEMORYSTATUS;
4705
4706 }
4707
4708 kern_return_t
4709 vm_page_alloc_list(
4710 int page_count,
4711 int flags,
4712 vm_page_t *list)
4713 {
4714 vm_page_t lo_page_list = VM_PAGE_NULL;
4715 vm_page_t mem;
4716 int i;
4717
4718 if ( !(flags & KMA_LOMEM))
4719 panic("vm_page_alloc_list: called w/o KMA_LOMEM");
4720
4721 for (i = 0; i < page_count; i++) {
4722
4723 mem = vm_page_grablo();
4724
4725 if (mem == VM_PAGE_NULL) {
4726 if (lo_page_list)
4727 vm_page_free_list(lo_page_list, FALSE);
4728
4729 *list = VM_PAGE_NULL;
4730
4731 return (KERN_RESOURCE_SHORTAGE);
4732 }
4733 mem->pageq.next = (queue_entry_t) lo_page_list;
4734 lo_page_list = mem;
4735 }
4736 *list = lo_page_list;
4737
4738 return (KERN_SUCCESS);
4739 }
4740
4741 void
4742 vm_page_set_offset(vm_page_t page, vm_object_offset_t offset)
4743 {
4744 page->offset = offset;
4745 }
4746
4747 vm_page_t
4748 vm_page_get_next(vm_page_t page)
4749 {
4750 return ((vm_page_t) page->pageq.next);
4751 }
4752
4753 vm_object_offset_t
4754 vm_page_get_offset(vm_page_t page)
4755 {
4756 return (page->offset);
4757 }
4758
4759 ppnum_t
4760 vm_page_get_phys_page(vm_page_t page)
4761 {
4762 return (page->phys_page);
4763 }
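
/*
 * Editor's note: hypothetical sketch (not from the original source) tying
 * together vm_page_alloc_list() and the accessor routines above: allocate a
 * batch of low-memory pages and collect their physical page numbers.  The
 * function name and the caller-supplied output array are assumptions for
 * illustration.
 */
#if 0	/* illustrative sketch only */
static kern_return_t
example_collect_lo_ppnums(int page_count, ppnum_t *ppnum_array)
{
	vm_page_t	list;
	vm_page_t	m;
	kern_return_t	kr;
	int		i = 0;

	/* vm_page_alloc_list() only supports KMA_LOMEM allocations */
	kr = vm_page_alloc_list(page_count, KMA_LOMEM, &list);

	if (kr != KERN_SUCCESS)
		return (kr);

	for (m = list; m != VM_PAGE_NULL; m = vm_page_get_next(m))
		ppnum_array[i++] = vm_page_get_phys_page(m);

	return (KERN_SUCCESS);
}
#endif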
4764
4765
4766 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
4767
4768 #if HIBERNATION
4769
4770 static vm_page_t hibernate_gobble_queue;
4771
4772 extern boolean_t (* volatile consider_buffer_cache_collect)(int);
4773
4774 static int hibernate_drain_pageout_queue(struct vm_pageout_queue *);
4775 static int hibernate_flush_dirty_pages(int);
4776 static int hibernate_flush_queue(queue_head_t *, int);
4777
4778 void hibernate_flush_wait(void);
4779 void hibernate_mark_in_progress(void);
4780 void hibernate_clear_in_progress(void);
4781
4782 void hibernate_free_range(int, int);
4783 void hibernate_hash_insert_page(vm_page_t);
4784 uint32_t hibernate_mark_as_unneeded(addr64_t, addr64_t, hibernate_page_list_t *, hibernate_page_list_t *);
4785 void hibernate_rebuild_vm_structs(void);
4786 uint32_t hibernate_teardown_vm_structs(hibernate_page_list_t *, hibernate_page_list_t *);
4787 ppnum_t hibernate_lookup_paddr(unsigned int);
4788
4789 struct hibernate_statistics {
4790 int hibernate_considered;
4791 int hibernate_reentered_on_q;
4792 int hibernate_found_dirty;
4793 int hibernate_skipped_cleaning;
4794 int hibernate_skipped_transient;
4795 int hibernate_skipped_precious;
4796 int hibernate_skipped_external;
4797 int hibernate_queue_nolock;
4798 int hibernate_queue_paused;
4799 int hibernate_throttled;
4800 int hibernate_throttle_timeout;
4801 int hibernate_drained;
4802 int hibernate_drain_timeout;
4803 int cd_lock_failed;
4804 int cd_found_precious;
4805 int cd_found_wired;
4806 int cd_found_busy;
4807 int cd_found_unusual;
4808 int cd_found_cleaning;
4809 int cd_found_laundry;
4810 int cd_found_dirty;
4811 int cd_found_xpmapped;
4812 int cd_local_free;
4813 int cd_total_free;
4814 int cd_vm_page_wire_count;
4815 int cd_vm_struct_pages_unneeded;
4816 int cd_pages;
4817 int cd_discarded;
4818 int cd_count_wire;
4819 } hibernate_stats;
4820
4821
4822
4823 static int
4824 hibernate_drain_pageout_queue(struct vm_pageout_queue *q)
4825 {
4826 wait_result_t wait_result;
4827
4828 vm_page_lock_queues();
4829
4830 while ( !queue_empty(&q->pgo_pending) ) {
4831
4832 q->pgo_draining = TRUE;
4833
4834 assert_wait_timeout((event_t) (&q->pgo_laundry+1), THREAD_INTERRUPTIBLE, 5000, 1000*NSEC_PER_USEC);
4835
4836 vm_page_unlock_queues();
4837
4838 wait_result = thread_block(THREAD_CONTINUE_NULL);
4839
4840 if (wait_result == THREAD_TIMED_OUT && !queue_empty(&q->pgo_pending)) {
4841 hibernate_stats.hibernate_drain_timeout++;
4842
4843 if (q == &vm_pageout_queue_external)
4844 return (0);
4845
4846 return (1);
4847 }
4848 vm_page_lock_queues();
4849
4850 hibernate_stats.hibernate_drained++;
4851 }
4852 vm_page_unlock_queues();
4853
4854 return (0);
4855 }
4856
4857
4858 boolean_t hibernate_skip_external = FALSE;
4859
4860 static int
4861 hibernate_flush_queue(queue_head_t *q, int qcount)
4862 {
4863 vm_page_t m;
4864 vm_object_t l_object = NULL;
4865 vm_object_t m_object = NULL;
4866 int refmod_state = 0;
4867 int try_failed_count = 0;
4868 int retval = 0;
4869 int current_run = 0;
4870 struct vm_pageout_queue *iq;
4871 struct vm_pageout_queue *eq;
4872 struct vm_pageout_queue *tq;
4873
4874
4875 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_START, q, qcount, 0, 0, 0);
4876
4877 iq = &vm_pageout_queue_internal;
4878 eq = &vm_pageout_queue_external;
4879
4880 vm_page_lock_queues();
4881
4882 while (qcount && !queue_empty(q)) {
4883
4884 if (current_run++ == 1000) {
4885 if (hibernate_should_abort()) {
4886 retval = 1;
4887 break;
4888 }
4889 current_run = 0;
4890 }
4891
4892 m = (vm_page_t) queue_first(q);
4893 m_object = m->object;
4894
4895 /*
4896 * check to see if we currently are working
4897 * with the same object... if so, we've
4898 * already got the lock
4899 */
4900 if (m_object != l_object) {
4901 /*
4902 * the object associated with candidate page is
4903 * different from the one we were just working
4904 * with... dump the lock if we still own it
4905 */
4906 if (l_object != NULL) {
4907 vm_object_unlock(l_object);
4908 l_object = NULL;
4909 }
4910 /*
4911 * Try to lock object; since we've already got the
4912 * page queues lock, we can only 'try' for this one.
4913 * if the 'try' fails, we need to do a mutex_pause
4914 * to allow the owner of the object lock a chance to
4915 * run...
4916 */
4917 if ( !vm_object_lock_try_scan(m_object)) {
4918
4919 if (try_failed_count > 20) {
4920 hibernate_stats.hibernate_queue_nolock++;
4921
4922 goto reenter_pg_on_q;
4923 }
4924 vm_pageout_scan_wants_object = m_object;
4925
4926 vm_page_unlock_queues();
4927 mutex_pause(try_failed_count++);
4928 vm_page_lock_queues();
4929
4930 hibernate_stats.hibernate_queue_paused++;
4931 continue;
4932 } else {
4933 l_object = m_object;
4934 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
4935 }
4936 }
4937 if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error) {
4938 /*
4939 * page is not to be cleaned
4940 * put it back on the head of its queue
4941 */
4942 if (m->cleaning)
4943 hibernate_stats.hibernate_skipped_cleaning++;
4944 else
4945 hibernate_stats.hibernate_skipped_transient++;
4946
4947 goto reenter_pg_on_q;
4948 }
4949 if (m_object->copy == VM_OBJECT_NULL) {
4950 if (m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) {
4951 /*
4952 * let the normal hibernate image path
4953 * deal with these
4954 */
4955 goto reenter_pg_on_q;
4956 }
4957 }
4958 if ( !m->dirty && m->pmapped) {
4959 refmod_state = pmap_get_refmod(m->phys_page);
4960
4961 if ((refmod_state & VM_MEM_MODIFIED)) {
4962 SET_PAGE_DIRTY(m, FALSE);
4963 }
4964 } else
4965 refmod_state = 0;
4966
4967 if ( !m->dirty) {
4968 /*
4969 * page is not to be cleaned
4970 * put it back on the head of its queue
4971 */
4972 if (m->precious)
4973 hibernate_stats.hibernate_skipped_precious++;
4974
4975 goto reenter_pg_on_q;
4976 }
4977
4978 if (hibernate_skip_external == TRUE && !m_object->internal) {
4979
4980 hibernate_stats.hibernate_skipped_external++;
4981
4982 goto reenter_pg_on_q;
4983 }
4984 tq = NULL;
4985
4986 if (m_object->internal) {
4987 if (VM_PAGE_Q_THROTTLED(iq))
4988 tq = iq;
4989 } else if (VM_PAGE_Q_THROTTLED(eq))
4990 tq = eq;
4991
4992 if (tq != NULL) {
4993 wait_result_t wait_result;
4994 int wait_count = 5;
4995
4996 if (l_object != NULL) {
4997 vm_object_unlock(l_object);
4998 l_object = NULL;
4999 }
5000 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
5001
5002 while (retval == 0) {
5003
5004 tq->pgo_throttled = TRUE;
5005
5006 assert_wait_timeout((event_t) &tq->pgo_laundry, THREAD_INTERRUPTIBLE, 1000, 1000*NSEC_PER_USEC);
5007
5008 vm_page_unlock_queues();
5009
5010 wait_result = thread_block(THREAD_CONTINUE_NULL);
5011
5012 vm_page_lock_queues();
5013
5014 if (wait_result != THREAD_TIMED_OUT)
5015 break;
5016 if (!VM_PAGE_Q_THROTTLED(tq))
5017 break;
5018
5019 if (hibernate_should_abort())
5020 retval = 1;
5021
5022 if (--wait_count == 0) {
5023
5024 hibernate_stats.hibernate_throttle_timeout++;
5025
5026 if (tq == eq) {
5027 hibernate_skip_external = TRUE;
5028 break;
5029 }
5030 retval = 1;
5031 }
5032 }
5033 if (retval)
5034 break;
5035
5036 hibernate_stats.hibernate_throttled++;
5037
5038 continue;
5039 }
5040 /*
5041 * we've already factored out pages in the laundry which
5042 * means this page can't be on the pageout queue so it's
5043 * safe to do the VM_PAGE_QUEUES_REMOVE
5044 */
5045 assert(!m->pageout_queue);
5046
5047 VM_PAGE_QUEUES_REMOVE(m);
5048
5049 if (COMPRESSED_PAGER_IS_ACTIVE)
5050 pmap_disconnect(m->phys_page);
5051
5052 vm_pageout_cluster(m, FALSE);
5053
5054 hibernate_stats.hibernate_found_dirty++;
5055
5056 goto next_pg;
5057
5058 reenter_pg_on_q:
5059 queue_remove(q, m, vm_page_t, pageq);
5060 queue_enter(q, m, vm_page_t, pageq);
5061
5062 hibernate_stats.hibernate_reentered_on_q++;
5063 next_pg:
5064 hibernate_stats.hibernate_considered++;
5065
5066 qcount--;
5067 try_failed_count = 0;
5068 }
5069 if (l_object != NULL) {
5070 vm_object_unlock(l_object);
5071 l_object = NULL;
5072 }
5073 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
5074
5075 vm_page_unlock_queues();
5076
5077 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_END, hibernate_stats.hibernate_found_dirty, retval, 0, 0, 0);
5078
5079 return (retval);
5080 }
5081
5082
5083 static int
5084 hibernate_flush_dirty_pages(int pass)
5085 {
5086 struct vm_speculative_age_q *aq;
5087 uint32_t i;
5088
5089 bzero(&hibernate_stats, sizeof(struct hibernate_statistics));
5090
5091 if (vm_page_local_q) {
5092 for (i = 0; i < vm_page_local_q_count; i++)
5093 vm_page_reactivate_local(i, TRUE, FALSE);
5094 }
5095
5096 for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
5097 int qcount;
5098 vm_page_t m;
5099
5100 aq = &vm_page_queue_speculative[i];
5101
5102 if (queue_empty(&aq->age_q))
5103 continue;
5104 qcount = 0;
5105
5106 vm_page_lockspin_queues();
5107
5108 queue_iterate(&aq->age_q,
5109 m,
5110 vm_page_t,
5111 pageq)
5112 {
5113 qcount++;
5114 }
5115 vm_page_unlock_queues();
5116
5117 if (qcount) {
5118 if (hibernate_flush_queue(&aq->age_q, qcount))
5119 return (1);
5120 }
5121 }
5122 if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_page_anonymous_count - vm_page_cleaned_count))
5123 return (1);
5124 if (hibernate_flush_queue(&vm_page_queue_anonymous, vm_page_anonymous_count))
5125 return (1);
5126 if (hibernate_flush_queue(&vm_page_queue_cleaned, vm_page_cleaned_count))
5127 return (1);
5128 if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal))
5129 return (1);
5130
5131 if (COMPRESSED_PAGER_IS_ACTIVE && pass == 1)
5132 vm_compressor_record_warmup_start();
5133
5134 if (hibernate_flush_queue(&vm_page_queue_active, vm_page_active_count)) {
5135 if (COMPRESSED_PAGER_IS_ACTIVE && pass == 1)
5136 vm_compressor_record_warmup_end();
5137 return (1);
5138 }
5139 if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
5140 if (COMPRESSED_PAGER_IS_ACTIVE && pass == 1)
5141 vm_compressor_record_warmup_end();
5142 return (1);
5143 }
5144 if (COMPRESSED_PAGER_IS_ACTIVE && pass == 1)
5145 vm_compressor_record_warmup_end();
5146
5147 if (hibernate_skip_external == FALSE && hibernate_drain_pageout_queue(&vm_pageout_queue_external))
5148 return (1);
5149
5150 return (0);
5151 }
5152
5153
5154 int
5155 hibernate_flush_memory()
5156 {
5157 int retval;
5158
5159 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_START, vm_page_free_count, 0, 0, 0, 0);
5160
5161 hibernate_cleaning_in_progress = TRUE;
5162 hibernate_skip_external = FALSE;
5163
5164 if ((retval = hibernate_flush_dirty_pages(1)) == 0) {
5165
5166 if (COMPRESSED_PAGER_IS_ACTIVE) {
5167
5168 if ((retval = hibernate_flush_dirty_pages(2)) == 0) {
5169
5170 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_START, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);
5171
5172 vm_compressor_flush();
5173
5174 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_END, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);
5175 }
5176 }
5177 if (retval == 0 && consider_buffer_cache_collect != NULL) {
5178 unsigned int orig_wire_count;
5179
5180 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_START, 0, 0, 0, 0, 0);
5181 orig_wire_count = vm_page_wire_count;
5182
5183 (void)(*consider_buffer_cache_collect)(1);
5184 consider_zone_gc(TRUE);
5185
5186 HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count);
5187
5188 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_END, orig_wire_count - vm_page_wire_count, 0, 0, 0, 0);
5189 }
5190 }
5191 hibernate_cleaning_in_progress = FALSE;
5192
5193 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_END, vm_page_free_count, hibernate_stats.hibernate_found_dirty, retval, 0, 0);
5194
5195 if (retval && COMPRESSED_PAGER_IS_ACTIVE)
5196 HIBLOG("hibernate_flush_memory() failed to finish - vm_page_compressor_count(%d)\n", VM_PAGE_COMPRESSOR_COUNT);
5197
5198
5199 HIBPRINT("hibernate_flush_memory() considered(%d) reentered_on_q(%d) found_dirty(%d)\n",
5200 hibernate_stats.hibernate_considered,
5201 hibernate_stats.hibernate_reentered_on_q,
5202 hibernate_stats.hibernate_found_dirty);
5203 HIBPRINT(" skipped_cleaning(%d) skipped_transient(%d) skipped_precious(%d) skipped_external(%d) queue_nolock(%d)\n",
5204 hibernate_stats.hibernate_skipped_cleaning,
5205 hibernate_stats.hibernate_skipped_transient,
5206 hibernate_stats.hibernate_skipped_precious,
5207 hibernate_stats.hibernate_skipped_external,
5208 hibernate_stats.hibernate_queue_nolock);
5209 HIBPRINT(" queue_paused(%d) throttled(%d) throttle_timeout(%d) drained(%d) drain_timeout(%d)\n",
5210 hibernate_stats.hibernate_queue_paused,
5211 hibernate_stats.hibernate_throttled,
5212 hibernate_stats.hibernate_throttle_timeout,
5213 hibernate_stats.hibernate_drained,
5214 hibernate_stats.hibernate_drain_timeout);
5215
5216 return (retval);
5217 }
5218
5219
5220 static void
5221 hibernate_page_list_zero(hibernate_page_list_t *list)
5222 {
5223 uint32_t bank;
5224 hibernate_bitmap_t * bitmap;
5225
5226 bitmap = &list->bank_bitmap[0];
5227 for (bank = 0; bank < list->bank_count; bank++)
5228 {
5229 uint32_t last_bit;
5230
5231 bzero((void *) &bitmap->bitmap[0], bitmap->bitmapwords << 2);
5232 // set out-of-bound bits at end of bitmap.
5233 last_bit = ((bitmap->last_page - bitmap->first_page + 1) & 31);
5234 if (last_bit)
5235 bitmap->bitmap[bitmap->bitmapwords - 1] = (0xFFFFFFFF >> last_bit);
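		/*
		 * Editor's note (worked example; bit ordering inferred from the
		 * mask above): a bank spanning 35 pages has last_bit = 35 & 31 = 3,
		 * so the last word becomes 0xFFFFFFFF >> 3 = 0x1FFFFFFF.  The top
		 * 3 bits, which represent real pages, stay zero ("needs saving" by
		 * default), while the 29 low, out-of-range bits are set so they are
		 * never treated as pages to save.
		 */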
5236
5237 bitmap = (hibernate_bitmap_t *) &bitmap->bitmap[bitmap->bitmapwords];
5238 }
5239 }
5240
5241 void
5242 hibernate_gobble_pages(uint32_t gobble_count, uint32_t free_page_time)
5243 {
5244 uint32_t i;
5245 vm_page_t m;
5246 uint64_t start, end, timeout, nsec;
5247 clock_interval_to_deadline(free_page_time, 1000 * 1000 /*ms*/, &timeout);
5248 clock_get_uptime(&start);
5249
5250 for (i = 0; i < gobble_count; i++)
5251 {
5252 while (VM_PAGE_NULL == (m = vm_page_grab()))
5253 {
5254 clock_get_uptime(&end);
5255 if (end >= timeout)
5256 break;
5257 VM_PAGE_WAIT();
5258 }
5259 if (!m)
5260 break;
5261 m->busy = FALSE;
5262 vm_page_gobble(m);
5263
5264 m->pageq.next = (queue_entry_t) hibernate_gobble_queue;
5265 hibernate_gobble_queue = m;
5266 }
5267
5268 clock_get_uptime(&end);
5269 absolutetime_to_nanoseconds(end - start, &nsec);
5270 HIBLOG("Gobbled %d pages, time: %qd ms\n", i, nsec / 1000000ULL);
5271 }
5272
5273 void
5274 hibernate_free_gobble_pages(void)
5275 {
5276 vm_page_t m, next;
5277 uint32_t count = 0;
5278
5279 m = (vm_page_t) hibernate_gobble_queue;
5280 while(m)
5281 {
5282 next = (vm_page_t) m->pageq.next;
5283 vm_page_free(m);
5284 count++;
5285 m = next;
5286 }
5287 hibernate_gobble_queue = VM_PAGE_NULL;
5288
5289 if (count)
5290 HIBLOG("Freed %d pages\n", count);
5291 }
5292
5293 static boolean_t
5294 hibernate_consider_discard(vm_page_t m, boolean_t preflight)
5295 {
5296 vm_object_t object = NULL;
5297 int refmod_state;
5298 boolean_t discard = FALSE;
5299
5300 do
5301 {
5302 if (m->private)
5303 panic("hibernate_consider_discard: private");
5304
5305 if (!vm_object_lock_try(m->object)) {
5306 if (!preflight) hibernate_stats.cd_lock_failed++;
5307 break;
5308 }
5309 object = m->object;
5310
5311 if (VM_PAGE_WIRED(m)) {
5312 if (!preflight) hibernate_stats.cd_found_wired++;
5313 break;
5314 }
5315 if (m->precious) {
5316 if (!preflight) hibernate_stats.cd_found_precious++;
5317 break;
5318 }
5319 if (m->busy || !object->alive) {
5320 /*
5321 * Somebody is playing with this page.
5322 */
5323 if (!preflight) hibernate_stats.cd_found_busy++;
5324 break;
5325 }
5326 if (m->absent || m->unusual || m->error) {
5327 /*
5328 * If it's unusual in any way, ignore it
5329 */
5330 if (!preflight) hibernate_stats.cd_found_unusual++;
5331 break;
5332 }
5333 if (m->cleaning) {
5334 if (!preflight) hibernate_stats.cd_found_cleaning++;
5335 break;
5336 }
5337 if (m->laundry) {
5338 if (!preflight) hibernate_stats.cd_found_laundry++;
5339 break;
5340 }
5341 if (!m->dirty)
5342 {
5343 refmod_state = pmap_get_refmod(m->phys_page);
5344
5345 if (refmod_state & VM_MEM_REFERENCED)
5346 m->reference = TRUE;
5347 if (refmod_state & VM_MEM_MODIFIED) {
5348 SET_PAGE_DIRTY(m, FALSE);
5349 }
5350 }
5351
5352 /*
5353 * If it's clean or purgeable we can discard the page on wakeup.
5354 */
5355 discard = (!m->dirty)
5356 || (VM_PURGABLE_VOLATILE == object->purgable)
5357 || (VM_PURGABLE_EMPTY == object->purgable);
5358
5359
5360 if (discard == FALSE) {
5361 if (!preflight)
5362 hibernate_stats.cd_found_dirty++;
5363 } else if (m->xpmapped && m->reference) {
5364 if (!preflight)
5365 hibernate_stats.cd_found_xpmapped++;
5366 discard = FALSE;
5367 }
5368 }
5369 while (FALSE);
5370
5371 if (object)
5372 vm_object_unlock(object);
5373
5374 return (discard);
5375 }
5376
5377
5378 static void
5379 hibernate_discard_page(vm_page_t m)
5380 {
5381 if (m->absent || m->unusual || m->error)
5382 /*
5383 * If it's unusual in any way, ignore it
5384 */
5385 return;
5386
5387 #if DEBUG
5388 vm_object_t object = m->object;
5389 if (!vm_object_lock_try(m->object))
5390 panic("hibernate_discard_page(%p) !vm_object_lock_try", m);
5391 #else
5392 /* No need to lock page queue for token delete, hibernate_vm_unlock()
5393 makes sure these locks are uncontended before sleep */
5394 #endif /* !DEBUG */
5395
5396 if (m->pmapped == TRUE)
5397 {
5398 __unused int refmod_state = pmap_disconnect(m->phys_page);
5399 }
5400
5401 if (m->laundry)
5402 panic("hibernate_discard_page(%p) laundry", m);
5403 if (m->private)
5404 panic("hibernate_discard_page(%p) private", m);
5405 if (m->fictitious)
5406 panic("hibernate_discard_page(%p) fictitious", m);
5407
5408 if (VM_PURGABLE_VOLATILE == m->object->purgable)
5409 {
5410 /* object should be on a queue */
5411 assert((m->object->objq.next != NULL) && (m->object->objq.prev != NULL));
5412 purgeable_q_t old_queue = vm_purgeable_object_remove(m->object);
5413 assert(old_queue);
5414 if (m->object->purgeable_when_ripe) {
5415 vm_purgeable_token_delete_first(old_queue);
5416 }
5417 m->object->purgable = VM_PURGABLE_EMPTY;
5418 }
5419
5420 vm_page_free(m);
5421
5422 #if DEBUG
5423 vm_object_unlock(object);
5424 #endif /* DEBUG */
5425 }
5426
5427 /*
5428 Grab locks for hibernate_page_list_setall()
5429 */
5430 void
5431 hibernate_vm_lock_queues(void)
5432 {
5433 vm_object_lock(compressor_object);
5434 vm_page_lock_queues();
5435 lck_mtx_lock(&vm_page_queue_free_lock);
5436
5437 if (vm_page_local_q) {
5438 uint32_t i;
5439 for (i = 0; i < vm_page_local_q_count; i++) {
5440 struct vpl *lq;
5441 lq = &vm_page_local_q[i].vpl_un.vpl;
5442 VPL_LOCK(&lq->vpl_lock);
5443 }
5444 }
5445 }
5446
5447 void
5448 hibernate_vm_unlock_queues(void)
5449 {
5450 if (vm_page_local_q) {
5451 uint32_t i;
5452 for (i = 0; i < vm_page_local_q_count; i++) {
5453 struct vpl *lq;
5454 lq = &vm_page_local_q[i].vpl_un.vpl;
5455 VPL_UNLOCK(&lq->vpl_lock);
5456 }
5457 }
5458 lck_mtx_unlock(&vm_page_queue_free_lock);
5459 vm_page_unlock_queues();
5460 vm_object_unlock(compressor_object);
5461 }
5462
5463 /*
5464 Bits zero in the bitmaps => page needs to be saved. All pages default to being saved;
5465 pages known to VM not to need saving are subtracted.
5466 Wired pages to be saved are present in page_list_wired, pageable ones in page_list.
5467 */
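
/*
 * Editor's note: minimal sketch (hypothetical helper, not from the original
 * source) of the marking convention used by hibernate_page_list_setall()
 * below: setting a page's bit in page_list means it does not need to be
 * preserved in the image at all, and setting its bit in page_list_wired
 * removes it from the set of pages that must be saved as wired.
 */
#if 0	/* illustrative sketch only */
static void
example_subtract_page(hibernate_page_list_t *page_list,
		      hibernate_page_list_t *page_list_wired,
		      vm_page_t m, boolean_t discardable)
{
	if (discardable) {
		/* clean/discardable: no need to save its contents at all */
		hibernate_page_bitset(page_list, TRUE, m->phys_page);
	}
	/* this page was found on a pageable queue, so it is not wired */
	hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
}
#endif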
5468
5469 void
5470 hibernate_page_list_setall(hibernate_page_list_t * page_list,
5471 hibernate_page_list_t * page_list_wired,
5472 hibernate_page_list_t * page_list_pal,
5473 boolean_t preflight,
5474 boolean_t will_discard,
5475 uint32_t * pagesOut)
5476 {
5477 uint64_t start, end, nsec;
5478 vm_page_t m;
5479 vm_page_t next;
5480 uint32_t pages = page_list->page_count;
5481 uint32_t count_anonymous = 0, count_throttled = 0, count_compressor = 0;
5482 uint32_t count_inactive = 0, count_active = 0, count_speculative = 0, count_cleaned = 0;
5483 uint32_t count_wire = pages;
5484 uint32_t count_discard_active = 0;
5485 uint32_t count_discard_inactive = 0;
5486 uint32_t count_discard_cleaned = 0;
5487 uint32_t count_discard_purgeable = 0;
5488 uint32_t count_discard_speculative = 0;
5489 uint32_t count_discard_vm_struct_pages = 0;
5490 uint32_t i;
5491 uint32_t bank;
5492 hibernate_bitmap_t * bitmap;
5493 hibernate_bitmap_t * bitmap_wired;
5494 boolean_t discard_all;
5495 boolean_t discard;
5496
5497 HIBLOG("hibernate_page_list_setall(preflight %d) start %p, %p\n", preflight, page_list, page_list_wired);
5498
5499 if (preflight) {
5500 page_list = NULL;
5501 page_list_wired = NULL;
5502 page_list_pal = NULL;
5503 discard_all = FALSE;
5504 } else {
5505 discard_all = will_discard;
5506 }
5507
5508 #if DEBUG
5509 if (!preflight)
5510 {
5511 vm_page_lock_queues();
5512 if (vm_page_local_q) {
5513 for (i = 0; i < vm_page_local_q_count; i++) {
5514 struct vpl *lq;
5515 lq = &vm_page_local_q[i].vpl_un.vpl;
5516 VPL_LOCK(&lq->vpl_lock);
5517 }
5518 }
5519 }
5520 #endif /* DEBUG */
5521
5522
5523 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_START, count_wire, 0, 0, 0, 0);
5524
5525 clock_get_uptime(&start);
5526
5527 if (!preflight) {
5528 hibernate_page_list_zero(page_list);
5529 hibernate_page_list_zero(page_list_wired);
5530 hibernate_page_list_zero(page_list_pal);
5531
5532 hibernate_stats.cd_vm_page_wire_count = vm_page_wire_count;
5533 hibernate_stats.cd_pages = pages;
5534 }
5535
5536 if (vm_page_local_q) {
5537 for (i = 0; i < vm_page_local_q_count; i++)
5538 vm_page_reactivate_local(i, TRUE, !preflight);
5539 }
5540
5541 if (preflight) {
5542 vm_object_lock(compressor_object);
5543 vm_page_lock_queues();
5544 lck_mtx_lock(&vm_page_queue_free_lock);
5545 }
5546
5547 m = (vm_page_t) hibernate_gobble_queue;
5548 while (m)
5549 {
5550 pages--;
5551 count_wire--;
5552 if (!preflight) {
5553 hibernate_page_bitset(page_list, TRUE, m->phys_page);
5554 hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5555 }
5556 m = (vm_page_t) m->pageq.next;
5557 }
5558
5559 if (!preflight) for( i = 0; i < real_ncpus; i++ )
5560 {
5561 if (cpu_data_ptr[i] && cpu_data_ptr[i]->cpu_processor)
5562 {
5563 for (m = PROCESSOR_DATA(cpu_data_ptr[i]->cpu_processor, free_pages); m; m = (vm_page_t)m->pageq.next)
5564 {
5565 pages--;
5566 count_wire--;
5567 hibernate_page_bitset(page_list, TRUE, m->phys_page);
5568 hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5569
5570 hibernate_stats.cd_local_free++;
5571 hibernate_stats.cd_total_free++;
5572 }
5573 }
5574 }
5575
5576 for( i = 0; i < vm_colors; i++ )
5577 {
5578 queue_iterate(&vm_page_queue_free[i],
5579 m,
5580 vm_page_t,
5581 pageq)
5582 {
5583 pages--;
5584 count_wire--;
5585 if (!preflight) {
5586 hibernate_page_bitset(page_list, TRUE, m->phys_page);
5587 hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5588
5589 hibernate_stats.cd_total_free++;
5590 }
5591 }
5592 }
5593
5594 queue_iterate(&vm_lopage_queue_free,
5595 m,
5596 vm_page_t,
5597 pageq)
5598 {
5599 pages--;
5600 count_wire--;
5601 if (!preflight) {
5602 hibernate_page_bitset(page_list, TRUE, m->phys_page);
5603 hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5604
5605 hibernate_stats.cd_total_free++;
5606 }
5607 }
5608
5609 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
5610 while (m && !queue_end(&vm_page_queue_throttled, (queue_entry_t)m))
5611 {
5612 next = (vm_page_t) m->pageq.next;
5613 discard = FALSE;
5614 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
5615 && hibernate_consider_discard(m, preflight))
5616 {
5617 if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page);
5618 count_discard_inactive++;
5619 discard = discard_all;
5620 }
5621 else
5622 count_throttled++;
5623 count_wire--;
5624 if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5625
5626 if (discard) hibernate_discard_page(m);
5627 m = next;
5628 }
5629
5630 m = (vm_page_t) queue_first(&vm_page_queue_anonymous);
5631 while (m && !queue_end(&vm_page_queue_anonymous, (queue_entry_t)m))
5632 {
5633 next = (vm_page_t) m->pageq.next;
5634 discard = FALSE;
5635 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
5636 && hibernate_consider_discard(m, preflight))
5637 {
5638 if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page);
5639 if (m->dirty)
5640 count_discard_purgeable++;
5641 else
5642 count_discard_inactive++;
5643 discard = discard_all;
5644 }
5645 else
5646 count_anonymous++;
5647 count_wire--;
5648 if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5649 if (discard) hibernate_discard_page(m);
5650 m = next;
5651 }
5652
5653 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
5654 while (m && !queue_end(&vm_page_queue_inactive, (queue_entry_t)m))
5655 {
5656 next = (vm_page_t) m->pageq.next;
5657 discard = FALSE;
5658 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
5659 && hibernate_consider_discard(m, preflight))
5660 {
5661 if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page);
5662 if (m->dirty)
5663 count_discard_purgeable++;
5664 else
5665 count_discard_inactive++;
5666 discard = discard_all;
5667 }
5668 else
5669 count_inactive++;
5670 count_wire--;
5671 if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5672 if (discard) hibernate_discard_page(m);
5673 m = next;
5674 }
5675
5676 m = (vm_page_t) queue_first(&vm_page_queue_cleaned);
5677 while (m && !queue_end(&vm_page_queue_cleaned, (queue_entry_t)m))
5678 {
5679 next = (vm_page_t) m->pageq.next;
5680 discard = FALSE;
5681 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
5682 && hibernate_consider_discard(m, preflight))
5683 {
5684 if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page);
5685 if (m->dirty)
5686 count_discard_purgeable++;
5687 else
5688 count_discard_cleaned++;
5689 discard = discard_all;
5690 }
5691 else
5692 count_cleaned++;
5693 count_wire--;
5694 if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5695 if (discard) hibernate_discard_page(m);
5696 m = next;
5697 }
5698
5699 for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ )
5700 {
5701 m = (vm_page_t) queue_first(&vm_page_queue_speculative[i].age_q);
5702 while (m && !queue_end(&vm_page_queue_speculative[i].age_q, (queue_entry_t)m))
5703 {
5704 next = (vm_page_t) m->pageq.next;
5705 discard = FALSE;
5706 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
5707 && hibernate_consider_discard(m, preflight))
5708 {
5709 if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page);
5710 count_discard_speculative++;
5711 discard = discard_all;
5712 }
5713 else
5714 count_speculative++;
5715 count_wire--;
5716 if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5717 if (discard) hibernate_discard_page(m);
5718 m = next;
5719 }
5720 }
5721
5722 m = (vm_page_t) queue_first(&vm_page_queue_active);
5723 while (m && !queue_end(&vm_page_queue_active, (queue_entry_t)m))
5724 {
5725 next = (vm_page_t) m->pageq.next;
5726 discard = FALSE;
5727 if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode)
5728 && hibernate_consider_discard(m, preflight))
5729 {
5730 if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page);
5731 if (m->dirty)
5732 count_discard_purgeable++;
5733 else
5734 count_discard_active++;
5735 discard = discard_all;
5736 }
5737 else
5738 count_active++;
5739 count_wire--;
5740 if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5741 if (discard) hibernate_discard_page(m);
5742 m = next;
5743 }
5744
5745 queue_iterate(&compressor_object->memq, m, vm_page_t, listq)
5746 {
5747 count_compressor++;
5748 count_wire--;
5749 if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
5750 }
5751
5752 if (preflight == FALSE && discard_all == TRUE) {
5753 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_START, 0, 0, 0, 0, 0);
5754
5755 HIBLOG("hibernate_teardown started\n");
5756 count_discard_vm_struct_pages = hibernate_teardown_vm_structs(page_list, page_list_wired);
5757 HIBLOG("hibernate_teardown completed - discarded %d\n", count_discard_vm_struct_pages);
5758
5759 pages -= count_discard_vm_struct_pages;
5760 count_wire -= count_discard_vm_struct_pages;
5761
5762 hibernate_stats.cd_vm_struct_pages_unneeded = count_discard_vm_struct_pages;
5763
5764 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_END, 0, 0, 0, 0, 0);
5765 }
5766
5767 if (!preflight) {
5768 // pull wired from hibernate_bitmap
5769 bitmap = &page_list->bank_bitmap[0];
5770 bitmap_wired = &page_list_wired->bank_bitmap[0];
5771 for (bank = 0; bank < page_list->bank_count; bank++)
5772 {
5773 for (i = 0; i < bitmap->bitmapwords; i++)
5774 bitmap->bitmap[i] = bitmap->bitmap[i] | ~bitmap_wired->bitmap[i];
5775 bitmap = (hibernate_bitmap_t *) &bitmap->bitmap [bitmap->bitmapwords];
5776 bitmap_wired = (hibernate_bitmap_t *) &bitmap_wired->bitmap[bitmap_wired->bitmapwords];
5777 }
5778 }
5779
5780 // machine dependent adjustments
5781 hibernate_page_list_setall_machine(page_list, page_list_wired, preflight, &pages);
5782
5783 if (!preflight) {
5784 hibernate_stats.cd_count_wire = count_wire;
5785 hibernate_stats.cd_discarded = count_discard_active + count_discard_inactive + count_discard_purgeable +
5786 count_discard_speculative + count_discard_cleaned + count_discard_vm_struct_pages;
5787 }
5788
5789 clock_get_uptime(&end);
5790 absolutetime_to_nanoseconds(end - start, &nsec);
5791 HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL);
5792
5793 HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, zf %d, throt %d, compr %d, xpmapped %d\n %s discard act %d inact %d purgeable %d spec %d cleaned %d\n",
5794 pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative, count_anonymous, count_throttled, count_compressor, hibernate_stats.cd_found_xpmapped,
5795 discard_all ? "did" : "could",
5796 count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned);
5797
5798 *pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable - count_discard_speculative - count_discard_cleaned;
5799
5800 if (preflight && will_discard) *pagesOut -= count_compressor + count_throttled + count_anonymous + count_inactive + count_cleaned + count_speculative + count_active;
5801
5802 #if DEBUG
5803 if (!preflight)
5804 {
5805 if (vm_page_local_q) {
5806 for (i = 0; i < vm_page_local_q_count; i++) {
5807 struct vpl *lq;
5808 lq = &vm_page_local_q[i].vpl_un.vpl;
5809 VPL_UNLOCK(&lq->vpl_lock);
5810 }
5811 }
5812 vm_page_unlock_queues();
5813 }
5814 #endif /* DEBUG */
5815
5816 if (preflight) {
5817 lck_mtx_unlock(&vm_page_queue_free_lock);
5818 vm_page_unlock_queues();
5819 vm_object_unlock(compressor_object);
5820 }
5821
5822 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_END, count_wire, *pagesOut, 0, 0, 0);
5823 }
5824
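/*
 * Routine:	hibernate_page_list_discard
 * Purpose:
 *	Walk the anonymous, speculative, inactive, active and
 *	cleaned page queues and discard every page whose bit is
 *	set in "page_list", counting the discards per queue
 *	(dirty pages are counted as purgeable).
 * Conditions:
 *	The page queues must be stable for the duration of the
 *	walk; under DEBUG they are locked here.
 */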
5825 void
5826 hibernate_page_list_discard(hibernate_page_list_t * page_list)
5827 {
5828 uint64_t start, end, nsec;
5829 vm_page_t m;
5830 vm_page_t next;
5831 uint32_t i;
5832 uint32_t count_discard_active = 0;
5833 uint32_t count_discard_inactive = 0;
5834 uint32_t count_discard_purgeable = 0;
5835 uint32_t count_discard_cleaned = 0;
5836 uint32_t count_discard_speculative = 0;
5837
5838
5839 #if DEBUG
5840 vm_page_lock_queues();
5841 if (vm_page_local_q) {
5842 for (i = 0; i < vm_page_local_q_count; i++) {
5843 struct vpl *lq;
5844 lq = &vm_page_local_q[i].vpl_un.vpl;
5845 VPL_LOCK(&lq->vpl_lock);
5846 }
5847 }
5848 #endif /* DEBUG */
5849
5850 clock_get_uptime(&start);
5851
5852 m = (vm_page_t) queue_first(&vm_page_queue_anonymous);
5853 while (m && !queue_end(&vm_page_queue_anonymous, (queue_entry_t)m))
5854 {
5855 next = (vm_page_t) m->pageq.next;
5856 if (hibernate_page_bittst(page_list, m->phys_page))
5857 {
5858 if (m->dirty)
5859 count_discard_purgeable++;
5860 else
5861 count_discard_inactive++;
5862 hibernate_discard_page(m);
5863 }
5864 m = next;
5865 }
5866
5867 for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ )
5868 {
5869 m = (vm_page_t) queue_first(&vm_page_queue_speculative[i].age_q);
5870 while (m && !queue_end(&vm_page_queue_speculative[i].age_q, (queue_entry_t)m))
5871 {
5872 next = (vm_page_t) m->pageq.next;
5873 if (hibernate_page_bittst(page_list, m->phys_page))
5874 {
5875 count_discard_speculative++;
5876 hibernate_discard_page(m);
5877 }
5878 m = next;
5879 }
5880 }
5881
5882 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
5883 while (m && !queue_end(&vm_page_queue_inactive, (queue_entry_t)m))
5884 {
5885 next = (vm_page_t) m->pageq.next;
5886 if (hibernate_page_bittst(page_list, m->phys_page))
5887 {
5888 if (m->dirty)
5889 count_discard_purgeable++;
5890 else
5891 count_discard_inactive++;
5892 hibernate_discard_page(m);
5893 }
5894 m = next;
5895 }
5896
5897 m = (vm_page_t) queue_first(&vm_page_queue_active);
5898 while (m && !queue_end(&vm_page_queue_active, (queue_entry_t)m))
5899 {
5900 next = (vm_page_t) m->pageq.next;
5901 if (hibernate_page_bittst(page_list, m->phys_page))
5902 {
5903 if (m->dirty)
5904 count_discard_purgeable++;
5905 else
5906 count_discard_active++;
5907 hibernate_discard_page(m);
5908 }
5909 m = next;
5910 }
5911
5912 m = (vm_page_t) queue_first(&vm_page_queue_cleaned);
5913 while (m && !queue_end(&vm_page_queue_cleaned, (queue_entry_t)m))
5914 {
5915 next = (vm_page_t) m->pageq.next;
5916 if (hibernate_page_bittst(page_list, m->phys_page))
5917 {
5918 if (m->dirty)
5919 count_discard_purgeable++;
5920 else
5921 count_discard_cleaned++;
5922 hibernate_discard_page(m);
5923 }
5924 m = next;
5925 }
5926
5927 #if DEBUG
5928 if (vm_page_local_q) {
5929 for (i = 0; i < vm_page_local_q_count; i++) {
5930 struct vpl *lq;
5931 lq = &vm_page_local_q[i].vpl_un.vpl;
5932 VPL_UNLOCK(&lq->vpl_lock);
5933 }
5934 }
5935 vm_page_unlock_queues();
5936 #endif /* DEBUG */
5937
5938 clock_get_uptime(&end);
5939 absolutetime_to_nanoseconds(end - start, &nsec);
5940 HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d cleaned %d\n",
5941 nsec / 1000000ULL,
5942 count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned);
5943 }
5944
5945 boolean_t hibernate_paddr_map_inited = FALSE;
5946 boolean_t hibernate_rebuild_needed = FALSE;
5947 unsigned int hibernate_teardown_last_valid_compact_indx = -1;
5948 vm_page_t hibernate_rebuild_hash_list = NULL;
5949
5950 unsigned int hibernate_teardown_found_tabled_pages = 0;
5951 unsigned int hibernate_teardown_found_created_pages = 0;
5952 unsigned int hibernate_teardown_found_free_pages = 0;
5953 unsigned int hibernate_teardown_vm_page_free_count;
5954
5955
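/*
 * A ppnum_mapping describes one physically contiguous run of
 * pages within the vm_pages array: entries ppnm_sindx up to
 * (but not including) ppnm_eindx start at physical page
 * ppnm_base_paddr.  For example (illustrative values only): if
 * vm_pages[10..14] cover physical pages 0x2000..0x2004, the run
 * is recorded as
 *	{ ppnm_base_paddr = 0x2000, ppnm_sindx = 10, ppnm_eindx = 15 }
 * and hibernate_lookup_paddr(12) returns 0x2000 + (12 - 10).
 */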
5956 struct ppnum_mapping {
5957 struct ppnum_mapping *ppnm_next;
5958 ppnum_t ppnm_base_paddr;
5959 unsigned int ppnm_sindx;
5960 unsigned int ppnm_eindx;
5961 };
5962
5963 struct ppnum_mapping *ppnm_head;
5964 struct ppnum_mapping *ppnm_last_found = NULL;
5965
5966
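/*
 * Routine:	hibernate_create_paddr_map
 * Purpose:
 *	Build, once, the singly linked list of ppnum_mapping
 *	records headed by ppnm_head, one per physically
 *	contiguous run in the vm_pages array.  The list lets
 *	hibernate_lookup_paddr() recover a page's physical page
 *	number from its vm_pages index even when the vm_page_t
 *	itself can no longer be consulted (for instance in
 *	hibernate_free_range()).
 */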
5967 void
5968 hibernate_create_paddr_map()
5969 {
5970 unsigned int i;
5971 ppnum_t next_ppnum_in_run = 0;
5972 struct ppnum_mapping *ppnm = NULL;
5973
5974 if (hibernate_paddr_map_inited == FALSE) {
5975
5976 for (i = 0; i < vm_pages_count; i++) {
5977
5978 if (ppnm)
5979 ppnm->ppnm_eindx = i;
5980
5981 if (ppnm == NULL || vm_pages[i].phys_page != next_ppnum_in_run) {
5982
5983 ppnm = kalloc(sizeof(struct ppnum_mapping));
5984
5985 ppnm->ppnm_next = ppnm_head;
5986 ppnm_head = ppnm;
5987
5988 ppnm->ppnm_sindx = i;
5989 ppnm->ppnm_base_paddr = vm_pages[i].phys_page;
5990 }
5991 next_ppnum_in_run = vm_pages[i].phys_page + 1;
5992 }
5993 ppnm->ppnm_eindx++;
5994
5995 hibernate_paddr_map_inited = TRUE;
5996 }
5997 }
5998
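/*
 * Routine:	hibernate_lookup_paddr
 * Purpose:
 *	Translate an index into the vm_pages array to the
 *	physical page number it describes, using the run list
 *	built by hibernate_create_paddr_map().  The most
 *	recently matched run is cached in ppnm_last_found.
 *	Panics if no run covers the index.
 */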
5999 ppnum_t
6000 hibernate_lookup_paddr(unsigned int indx)
6001 {
6002 struct ppnum_mapping *ppnm = NULL;
6003
6004 ppnm = ppnm_last_found;
6005
6006 if (ppnm) {
6007 if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx)
6008 goto done;
6009 }
6010 for (ppnm = ppnm_head; ppnm; ppnm = ppnm->ppnm_next) {
6011
6012 if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
6013 ppnm_last_found = ppnm;
6014 break;
6015 }
6016 }
6017 if (ppnm == NULL)
6018 panic("hibernate_lookup_paddr of %d failed\n", indx);
6019 done:
6020 return (ppnm->ppnm_base_paddr + (indx - ppnm->ppnm_sindx));
6021 }
6022
6023
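/*
 * Routine:	hibernate_mark_as_unneeded
 * Purpose:
 *	For every page fully contained in the kernel virtual
 *	range [saddr, eaddr), look up its physical page and set
 *	the corresponding bit in both "page_list" and
 *	"page_list_wired", marking the page as not needing to be
 *	preserved in the hibernation image.  Returns the number
 *	of pages marked.
 */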
6024 uint32_t
6025 hibernate_mark_as_unneeded(addr64_t saddr, addr64_t eaddr, hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
6026 {
6027 addr64_t saddr_aligned;
6028 addr64_t eaddr_aligned;
6029 addr64_t addr;
6030 ppnum_t paddr;
6031 unsigned int mark_as_unneeded_pages = 0;
6032
6033 saddr_aligned = (saddr + PAGE_MASK_64) & ~PAGE_MASK_64;
6034 eaddr_aligned = eaddr & ~PAGE_MASK_64;
6035
6036 for (addr = saddr_aligned; addr < eaddr_aligned; addr += PAGE_SIZE_64) {
6037
6038 paddr = pmap_find_phys(kernel_pmap, addr);
6039
6040 assert(paddr);
6041
6042 hibernate_page_bitset(page_list, TRUE, paddr);
6043 hibernate_page_bitset(page_list_wired, TRUE, paddr);
6044
6045 mark_as_unneeded_pages++;
6046 }
6047 return (mark_as_unneeded_pages);
6048 }
6049
6050
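/*
 * Routine:	hibernate_hash_insert_page
 * Purpose:
 *	Re-insert a hashed page at the head of its object/offset
 *	hash bucket; used by hibernate_rebuild_vm_structs() while
 *	the page hash is being rebuilt after hibernation.
 */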
6051 void
6052 hibernate_hash_insert_page(vm_page_t mem)
6053 {
6054 vm_page_bucket_t *bucket;
6055 int hash_id;
6056
6057 assert(mem->hashed);
6058 assert(mem->object);
6059 assert(mem->offset != (vm_object_offset_t) -1);
6060
6061 /*
6062 * Insert it into the object/offset hash table
6063 */
6064 hash_id = vm_page_hash(mem->object, mem->offset);
6065 bucket = &vm_page_buckets[hash_id];
6066
6067 mem->next = bucket->pages;
6068 bucket->pages = mem;
6069 }
6070
6071
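/*
 * Routine:	hibernate_free_range
 * Purpose:
 *	Reinitialize vm_pages[sindx..eindx) as free pages: each
 *	entry is re-initialized with its physical page number,
 *	marked free, and placed at the head of the free queue for
 *	its color, bumping vm_page_free_count.
 */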
6072 void
6073 hibernate_free_range(int sindx, int eindx)
6074 {
6075 vm_page_t mem;
6076 unsigned int color;
6077
6078 while (sindx < eindx) {
6079 mem = &vm_pages[sindx];
6080
6081 vm_page_init(mem, hibernate_lookup_paddr(sindx), FALSE);
6082
6083 mem->lopage = FALSE;
6084 mem->free = TRUE;
6085
6086 color = mem->phys_page & vm_color_mask;
6087 queue_enter_first(&vm_page_queue_free[color],
6088 mem,
6089 vm_page_t,
6090 pageq);
6091 vm_page_free_count++;
6092
6093 sindx++;
6094 }
6095 }
6096
6097
6098 extern void hibernate_rebuild_pmap_structs(void);
6099
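/*
 * Routine:	hibernate_rebuild_vm_structs
 * Purpose:
 *	Undo hibernate_teardown_vm_structs() after wake: rebuild
 *	the pmap structures, clear the page hash buckets, move
 *	each compacted vm_page_t back to its original slot (saved
 *	in its "next" field), re-hash the pages that were hashed,
 *	recreate the intervening free ranges, and finally re-hash
 *	the fictitious pages saved on hibernate_rebuild_hash_list.
 */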
6100 void
6101 hibernate_rebuild_vm_structs(void)
6102 {
6103 int cindx, sindx, eindx;
6104 vm_page_t mem, tmem, mem_next;
6105 AbsoluteTime startTime, endTime;
6106 uint64_t nsec;
6107
6108 if (hibernate_rebuild_needed == FALSE)
6109 return;
6110
6111 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_START, 0, 0, 0, 0, 0);
6112 HIBLOG("hibernate_rebuild started\n");
6113
6114 clock_get_uptime(&startTime);
6115
6116 hibernate_rebuild_pmap_structs();
6117
6118 bzero(&vm_page_buckets[0], vm_page_bucket_count * sizeof(vm_page_bucket_t));
6119 eindx = vm_pages_count;
6120
6121 for (cindx = hibernate_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
6122
6123 mem = &vm_pages[cindx];
6124 /*
6125 * hibernate_teardown_vm_structs leaves the location where
6126 * this vm_page_t must be located in "next".
6127 */
6128 tmem = mem->next;
6129 mem->next = NULL;
6130
6131 sindx = (int)(tmem - &vm_pages[0]);
6132
6133 if (mem != tmem) {
6134 /*
6135 * this vm_page_t was moved by hibernate_teardown_vm_structs,
6136 * so move it back to its real location
6137 */
6138 *tmem = *mem;
6139 mem = tmem;
6140 }
6141 if (mem->hashed)
6142 hibernate_hash_insert_page(mem);
6143 /*
6144 * the 'hole' between this vm_page_t and the previous
6145 * vm_page_t we moved needs to be initialized as
6146 * a range of free vm_page_t's
6147 */
6148 hibernate_free_range(sindx + 1, eindx);
6149
6150 eindx = sindx;
6151 }
6152 if (sindx)
6153 hibernate_free_range(0, sindx);
6154
6155 assert(vm_page_free_count == hibernate_teardown_vm_page_free_count);
6156
6157 /*
6158 * process the list of vm_page_t's that were entered in the hash,
6159 * but were not located in the vm_pages array... these are
6160 * vm_page_t's that were created on the fly (i.e. fictitious)
6161 */
6162 for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) {
6163 mem_next = mem->next;
6164
6165 mem->next = NULL;
6166 hibernate_hash_insert_page(mem);
6167 }
6168 hibernate_rebuild_hash_list = NULL;
6169
6170 clock_get_uptime(&endTime);
6171 SUB_ABSOLUTETIME(&endTime, &startTime);
6172 absolutetime_to_nanoseconds(endTime, &nsec);
6173
6174 HIBLOG("hibernate_rebuild completed - took %qd msecs\n", nsec / 1000000ULL);
6175
6176 hibernate_rebuild_needed = FALSE;
6177
6178 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_END, 0, 0, 0, 0, 0);
6179 }
6180
6181
6182 extern void hibernate_teardown_pmap_structs(addr64_t *, addr64_t *);
6183
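/*
 * Routine:	hibernate_teardown_vm_structs
 * Purpose:
 *	Shrink the VM bookkeeping that need not be written to the
 *	hibernation image: pages that are hashed but not backed by
 *	the vm_pages array are moved to hibernate_rebuild_hash_list,
 *	vm_pages is compacted by sliding in-use entries down over
 *	free ones (each entry's original location is remembered in
 *	its "next" field), and the hash buckets, the unused tail of
 *	vm_pages and the teardown-able pmap structures are marked
 *	as unneeded in both bitmaps.  Returns the number of pages
 *	marked unneeded; hibernate_rebuild_vm_structs() reverses
 *	this on wake.
 */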
6184 uint32_t
6185 hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
6186 {
6187 unsigned int i;
6188 unsigned int compact_target_indx;
6189 vm_page_t mem, mem_next;
6190 vm_page_bucket_t *bucket;
6191 unsigned int mark_as_unneeded_pages = 0;
6192 unsigned int unneeded_vm_page_bucket_pages = 0;
6193 unsigned int unneeded_vm_pages_pages = 0;
6194 unsigned int unneeded_pmap_pages = 0;
6195 addr64_t start_of_unneeded = 0;
6196 addr64_t end_of_unneeded = 0;
6197
6198
6199 if (hibernate_should_abort())
6200 return (0);
6201
6202 HIBLOG("hibernate_teardown: wired_pages %d, free_pages %d, active_pages %d, inactive_pages %d, speculative_pages %d, cleaned_pages %d, compressor_pages %d\n",
6203 vm_page_wire_count, vm_page_free_count, vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count,
6204 vm_page_cleaned_count, compressor_object->resident_page_count);
6205
6206 for (i = 0; i < vm_page_bucket_count; i++) {
6207
6208 bucket = &vm_page_buckets[i];
6209
6210 for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem_next) {
6211 assert(mem->hashed);
6212
6213 mem_next = mem->next;
6214
6215 if (mem < &vm_pages[0] || mem >= &vm_pages[vm_pages_count]) {
6216 mem->next = hibernate_rebuild_hash_list;
6217 hibernate_rebuild_hash_list = mem;
6218 }
6219 }
6220 }
6221 unneeded_vm_page_bucket_pages = hibernate_mark_as_unneeded((addr64_t)&vm_page_buckets[0], (addr64_t)&vm_page_buckets[vm_page_bucket_count], page_list, page_list_wired);
6222 mark_as_unneeded_pages += unneeded_vm_page_bucket_pages;
6223
6224 hibernate_teardown_vm_page_free_count = vm_page_free_count;
6225
6226 compact_target_indx = 0;
6227
6228 for (i = 0; i < vm_pages_count; i++) {
6229
6230 mem = &vm_pages[i];
6231
6232 if (mem->free) {
6233 unsigned int color;
6234
6235 assert(mem->busy);
6236 assert(!mem->lopage);
6237
6238 color = mem->phys_page & vm_color_mask;
6239
6240 queue_remove(&vm_page_queue_free[color],
6241 mem,
6242 vm_page_t,
6243 pageq);
6244 mem->pageq.next = NULL;
6245 mem->pageq.prev = NULL;
6246
6247 vm_page_free_count--;
6248
6249 hibernate_teardown_found_free_pages++;
6250
6251 if ( !vm_pages[compact_target_indx].free)
6252 compact_target_indx = i;
6253 } else {
6254 /*
6255 * record this vm_page_t's original location;
6256 * we need this even if it doesn't get moved,
6257 * as an indicator to the rebuild function that
6258 * it doesn't have to be moved back
6259 */
6260 mem->next = mem;
6261
6262 if (vm_pages[compact_target_indx].free) {
6263 /*
6264 * we've got a hole to fill, so
6265 * move this vm_page_t to its new home
6266 */
6267 vm_pages[compact_target_indx] = *mem;
6268 mem->free = TRUE;
6269
6270 hibernate_teardown_last_valid_compact_indx = compact_target_indx;
6271 compact_target_indx++;
6272 } else
6273 hibernate_teardown_last_valid_compact_indx = i;
6274 }
6275 }
6276 unneeded_vm_pages_pages = hibernate_mark_as_unneeded((addr64_t)&vm_pages[hibernate_teardown_last_valid_compact_indx+1],
6277 (addr64_t)&vm_pages[vm_pages_count-1], page_list, page_list_wired);
6278 mark_as_unneeded_pages += unneeded_vm_pages_pages;
6279
6280 hibernate_teardown_pmap_structs(&start_of_unneeded, &end_of_unneeded);
6281
6282 if (start_of_unneeded) {
6283 unneeded_pmap_pages = hibernate_mark_as_unneeded(start_of_unneeded, end_of_unneeded, page_list, page_list_wired);
6284 mark_as_unneeded_pages += unneeded_pmap_pages;
6285 }
6286 HIBLOG("hibernate_teardown: mark_as_unneeded_pages %d, %d, %d\n", unneeded_vm_page_bucket_pages, unneeded_vm_pages_pages, unneeded_pmap_pages);
6287
6288 hibernate_rebuild_needed = TRUE;
6289
6290 return (mark_as_unneeded_pages);
6291 }
6292
6293
6294 #endif /* HIBERNATION */
6295
6296 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
6297
6298 #include <mach_vm_debug.h>
6299 #if MACH_VM_DEBUG
6300
6301 #include <mach_debug/hash_info.h>
6302 #include <vm/vm_debug.h>
6303
6304 /*
6305 * Routine: vm_page_info
6306 * Purpose:
6307 * Return information about the global VP table.
6308 * Fills the buffer with as much information as possible
6309 * and returns the desired size of the buffer.
6310 * Conditions:
6311 * Nothing locked. The caller should provide
6312 * possibly-pageable memory.
6313 */
6314
6315 unsigned int
6316 vm_page_info(
6317 hash_info_bucket_t *info,
6318 unsigned int count)
6319 {
6320 unsigned int i;
6321 lck_spin_t *bucket_lock;
6322
6323 if (vm_page_bucket_count < count)
6324 count = vm_page_bucket_count;
6325
6326 for (i = 0; i < count; i++) {
6327 vm_page_bucket_t *bucket = &vm_page_buckets[i];
6328 unsigned int bucket_count = 0;
6329 vm_page_t m;
6330
6331 bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
6332 lck_spin_lock(bucket_lock);
6333
6334 for (m = bucket->pages; m != VM_PAGE_NULL; m = m->next)
6335 bucket_count++;
6336
6337 lck_spin_unlock(bucket_lock);
6338
6339 /* don't touch pageable memory while holding locks */
6340 info[i].hib_count = bucket_count;
6341 }
6342
6343 return vm_page_bucket_count;
6344 }
6345 #endif /* MACH_VM_DEBUG */
6346
6347 #if VM_PAGE_BUCKETS_CHECK
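/*
 * Routine:	vm_page_buckets_check
 * Purpose:
 *	Sanity check the object/offset page hash: every page on a
 *	bucket's chain must be marked "hashed" and must hash to
 *	that bucket.  Also verifies the 0x5a fill pattern of the
 *	fake buckets and panics if a hibernation teardown/rebuild
 *	is still in flight.
 */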
6348 void
6349 vm_page_buckets_check(void)
6350 {
6351 unsigned int i;
6352 vm_page_t p;
6353 unsigned int p_hash;
6354 vm_page_bucket_t *bucket;
6355 lck_spin_t *bucket_lock;
6356
6357 if (!vm_page_buckets_check_ready) {
6358 return;
6359 }
6360
6361 #if HIBERNATION
6362 if (hibernate_rebuild_needed ||
6363 hibernate_rebuild_hash_list) {
6364 panic("BUCKET_CHECK: hibernation in progress: "
6365 "rebuild_needed=%d rebuild_hash_list=%p\n",
6366 hibernate_rebuild_needed,
6367 hibernate_rebuild_hash_list);
6368 }
6369 #endif /* HIBERNATION */
6370
6371 #if VM_PAGE_FAKE_BUCKETS
6372 char *cp;
6373 for (cp = (char *) vm_page_fake_buckets_start;
6374 cp < (char *) vm_page_fake_buckets_end;
6375 cp++) {
6376 if (*cp != 0x5a) {
6377 panic("BUCKET_CHECK: corruption at %p in fake buckets "
6378 "[0x%llx:0x%llx]\n",
6379 cp,
6380 vm_page_fake_buckets_start,
6381 vm_page_fake_buckets_end);
6382 }
6383 }
6384 #endif /* VM_PAGE_FAKE_BUCKETS */
6385
6386 for (i = 0; i < vm_page_bucket_count; i++) {
6387 bucket = &vm_page_buckets[i];
6388 if (bucket->pages == VM_PAGE_NULL) {
6389 continue;
6390 }
6391
6392 bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
6393 lck_spin_lock(bucket_lock);
6394 p = bucket->pages;
6395 while (p != VM_PAGE_NULL) {
6396 p_hash = vm_page_hash(p->object, p->offset);
6397 if (!p->hashed) {
6398 panic("BUCKET_CHECK: page %p (%p,0x%llx) "
6399 "hash %d in bucket %d at %p "
6400 "is not hashed\n",
6401 p, p->object, p->offset,
6402 p_hash, i, bucket);
6403 }
6404 if (p_hash != i) {
6405 panic("BUCKET_CHECK: corruption in bucket %d "
6406 "at %p: page %p object %p offset 0x%llx "
6407 "hash %d\n",
6408 i, bucket, p, p->object, p->offset,
6409 p_hash);
6410 }
6411 p = p->next;
6412 }
6413 lck_spin_unlock(bucket_lock);
6414 }
6415
6416 // printf("BUCKET_CHECK: checked buckets\n");
6417 }
6418 #endif /* VM_PAGE_BUCKETS_CHECK */