osfmk/vm/vm_resident.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm/vm_page.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Resident memory management module.
  63  */
  64
  65 #include <debug.h>
  66 #include <libkern/OSAtomic.h>
  67
  68 #include <mach/clock_types.h>
  69 #include <mach/vm_prot.h>
  70 #include <mach/vm_statistics.h>
  71 #include <mach/sdt.h>
  72 #include <kern/counters.h>
  73 #include <kern/sched_prim.h>
  74 #include <kern/task.h>
  75 #include <kern/thread.h>
  76 #include <kern/zalloc.h>
  77 #include <kern/xpr.h>
  78 #include <vm/pmap.h>
  79 #include <vm/vm_init.h>
  80 #include <vm/vm_map.h>
  81 #include <vm/vm_page.h>
  82 #include <vm/vm_pageout.h>
  83 #include <vm/vm_kern.h>                 /* kernel_memory_allocate() */
  84 #include <kern/misc_protos.h>
  85 #include <zone_debug.h>
  86 #include <vm/cpm.h>
  87 #include <ppc/mappings.h>               /* (BRINGUP) */
  88 #include <pexpert/pexpert.h>    /* (BRINGUP) */
  89
  90 #include <vm/vm_protos.h>
  91 #include <vm/memory_object.h>
  92 #include <vm/vm_purgeable_internal.h>
  93
  94 #if CONFIG_EMBEDDED
  95 #include <sys/kern_memorystatus.h>
  96 #endif
  97
  98 int                     speculative_age_index = 0;
  99 int                     speculative_steal_index = 0;
 100
 101 struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1];
 102
 103
 104 /*
 105  *      Associated with page of user-allocatable memory is a
 106  *      page structure.
 107  */
 108
 109 /*
 110  *      These variables record the values returned by vm_page_bootstrap,
 111  *      for debugging purposes.  The implementation of pmap_steal_memory
 112  *      and pmap_startup here also uses them internally.
 113  */
 114
 115 vm_offset_t virtual_space_start;
 116 vm_offset_t virtual_space_end;
 117 int     vm_page_pages;
 118
 119 /*
 120  *      The vm_page_lookup() routine, which provides for fast
 121  *      (virtual memory object, offset) to page lookup, employs
 122  *      the following hash table.  The vm_page_{insert,remove}
 123  *      routines install and remove associations in the table.
 124  *      [This table is often called the virtual-to-physical,
 125  *      or VP, table.]
 126  */
 127 typedef struct {
 128         vm_page_t       pages;
 129 #if     MACH_PAGE_HASH_STATS
 130         int             cur_count;              /* current count */
 131         int             hi_count;               /* high water mark */
 132 #endif /* MACH_PAGE_HASH_STATS */
 133 } vm_page_bucket_t;
 134
 135 vm_page_bucket_t *vm_page_buckets;              /* Array of buckets */
 136 unsigned int    vm_page_bucket_count = 0;       /* How big is array? */
 137 unsigned int    vm_page_hash_mask;              /* Mask for hash function */
 138 unsigned int    vm_page_hash_shift;             /* Shift for hash function */
 139 uint32_t        vm_page_bucket_hash;            /* Basic bucket hash */
 140 decl_simple_lock_data(,vm_page_bucket_lock)
 141
 142
 143 #if     MACH_PAGE_HASH_STATS
 144 /* This routine is only for debug.  It is intended to be called by
 145  * hand by a developer using a kernel debugger.  This routine prints
 146  * out vm_page_hash table statistics to the kernel debug console.
 147  */
 148 void
 149 hash_debug(void)
 150 {
 151         int     i;
 152         int     numbuckets = 0;
 153         int     highsum = 0;
 154         int     maxdepth = 0;
 155
 156         for (i = 0; i < vm_page_bucket_count; i++) {
 157                 if (vm_page_buckets[i].hi_count) {
 158                         numbuckets++;
 159                         highsum += vm_page_buckets[i].hi_count;
 160                         if (vm_page_buckets[i].hi_count > maxdepth)
 161                                 maxdepth = vm_page_buckets[i].hi_count;
 162                 }
 163         }
 164         printf("Total number of buckets: %d\n", vm_page_bucket_count);
 165         printf("Number used buckets:     %d = %d%%\n",
 166                 numbuckets, 100*numbuckets/vm_page_bucket_count);
 167         printf("Number unused buckets:   %d = %d%%\n",
 168                 vm_page_bucket_count - numbuckets,
 169                 100*(vm_page_bucket_count-numbuckets)/vm_page_bucket_count);
 170         printf("Sum of bucket max depth: %d\n", highsum);
 171         printf("Average bucket depth:    %d.%2d\n",
 172                 highsum/vm_page_bucket_count,
 173                 highsum%vm_page_bucket_count);
 174         printf("Maximum bucket depth:    %d\n", maxdepth);
 175 }
 176 #endif /* MACH_PAGE_HASH_STATS */
 177
 178 /*
 179  *      The virtual page size is currently implemented as a runtime
 180  *      variable, but is constant once initialized using vm_set_page_size.
 181  *      This initialization must be done in the machine-dependent
 182  *      bootstrap sequence, before calling other machine-independent
 183  *      initializations.
 184  *
 185  *      All references to the virtual page size outside this
 186  *      module must use the PAGE_SIZE, PAGE_MASK and PAGE_SHIFT
 187  *      constants.
 188  */
 189 vm_size_t       page_size  = PAGE_SIZE;
 190 vm_size_t       page_mask  = PAGE_MASK;
 191 int             page_shift = PAGE_SHIFT;
 192
 193 /*
 194  *      Resident page structures are initialized from
 195  *      a template (see vm_page_alloc).
 196  *
 197  *      When adding a new field to the virtual memory
 198  *      object structure, be sure to add initialization
 199  *      (see vm_page_bootstrap).
 200  */
 201 struct vm_page  vm_page_template;
 202
 203 vm_page_t       vm_pages = VM_PAGE_NULL;
 204 unsigned int    vm_pages_count = 0;
 205
 206 /*
 207  *      Resident pages that represent real memory
 208  *      are allocated from a set of free lists,
 209  *      one per color.
 210  */
 211 unsigned int    vm_colors;
 212 unsigned int    vm_color_mask;                  /* mask is == (vm_colors-1) */
 213 unsigned int    vm_cache_geometry_colors = 0;   /* set by hw dependent code during startup */
 214 queue_head_t    vm_page_queue_free[MAX_COLORS];
 215 vm_page_t       vm_page_queue_fictitious;
 216 unsigned int    vm_page_free_wanted;
 217 unsigned int    vm_page_free_wanted_privileged;
 218 unsigned int    vm_page_free_count;
 219 unsigned int    vm_page_fictitious_count;
 220
 221 unsigned int    vm_page_free_count_minimum;     /* debugging */
 222
 223 /*
 224  *      Occasionally, the virtual memory system uses
 225  *      resident page structures that do not refer to
 226  *      real pages, for example to leave a page with
 227  *      important state information in the VP table.
 228  *
 229  *      These page structures are allocated the way
 230  *      most other kernel structures are.
 231  */
 232 zone_t  vm_page_zone;
 233 decl_mutex_data(,vm_page_alloc_lock)
 234 unsigned int io_throttle_zero_fill;
 235
 236 /*
 237  *      Fictitious pages don't have a physical address,
 238  *      but we must initialize phys_page to something.
 239  *      For debugging, this should be a strange value
 240  *      that the pmap module can recognize in assertions.
 241  */
 242 vm_offset_t vm_page_fictitious_addr = (vm_offset_t) -1;
 243
 244 /*
 245  *      Guard pages are not accessible so they don't
 246  *      need a physical address, but we need to enter
 247  *      one in the pmap.
 248  *      Let's make it recognizable and make sure that
 249  *      we don't use a real physical page with that
 250  *      physical address.
 251  */
 252 vm_offset_t vm_page_guard_addr = (vm_offset_t) -2;
 253
 254 /*
 255  *      Resident page structures are also chained on
 256  *      queues that are used by the page replacement
 257  *      system (pageout daemon).  These queues are
 258  *      defined here, but are shared by the pageout
 259  *      module.  The inactive queue is broken into
 260  *      inactive and zf for convenience as the
 261  *      pageout daemon often assignes a higher
 262  *      affinity to zf pages
 263  */
 264 queue_head_t    vm_page_queue_active;
 265 queue_head_t    vm_page_queue_inactive;
 266 queue_head_t    vm_page_queue_zf;       /* inactive memory queue for zero fill */
 267
 268 unsigned int    vm_page_active_count;
 269 unsigned int    vm_page_inactive_count;
 270 unsigned int    vm_page_throttled_count;
 271 unsigned int    vm_page_speculative_count;
 272 unsigned int    vm_page_wire_count;
 273 unsigned int    vm_page_gobble_count = 0;
 274 unsigned int    vm_page_wire_count_warning = 0;
 275 unsigned int    vm_page_gobble_count_warning = 0;
 276
 277 unsigned int    vm_page_purgeable_count = 0; /* # of pages purgeable now */
 278 uint64_t        vm_page_purged_count = 0;    /* total count of purged pages */
 279
 280 unsigned int    vm_page_speculative_recreated = 0;
 281 unsigned int    vm_page_speculative_created = 0;
 282 unsigned int    vm_page_speculative_used = 0;
 283
 284 ppnum_t         vm_lopage_poolstart = 0;
 285 ppnum_t         vm_lopage_poolend = 0;
 286 int             vm_lopage_poolsize = 0;
 287 uint64_t        max_valid_dma_address = 0xffffffffffffffffULL;
 288
 289
 290 /*
 291  *      Several page replacement parameters are also
 292  *      shared with this module, so that page allocation
 293  *      (done here in vm_page_alloc) can trigger the
 294  *      pageout daemon.
 295  */
 296 unsigned int    vm_page_free_target = 0;
 297 unsigned int    vm_page_free_min = 0;
 298 unsigned int    vm_page_inactive_target = 0;
 299 unsigned int    vm_page_inactive_min = 0;
 300 unsigned int    vm_page_free_reserved = 0;
 301 unsigned int    vm_page_zfill_throttle_count = 0;
 302
 303 /*
 304  *      The VM system has a couple of heuristics for deciding
 305  *      that pages are "uninteresting" and should be placed
 306  *      on the inactive queue as likely candidates for replacement.
 307  *      These variables let the heuristics be controlled at run-time
 308  *      to make experimentation easier.
 309  */
 310
 311 boolean_t vm_page_deactivate_hint = TRUE;
 312
 313 /*
 314  *      vm_set_page_size:
 315  *
 316  *      Sets the page size, perhaps based upon the memory
 317  *      size.  Must be called before any use of page-size
 318  *      dependent functions.
 319  *
 320  *      Sets page_shift and page_mask from page_size.
 321  */
 322 void
 323 vm_set_page_size(void)
 324 {
 325         page_mask = page_size - 1;
 326
 327         if ((page_mask & page_size) != 0)
 328                 panic("vm_set_page_size: page size not a power of two");
 329
 330         for (page_shift = 0; ; page_shift++)
 331                 if ((1U << page_shift) == page_size)
 332                         break;
 333 }
 334
 335
 336 /* Called once during statup, once the cache geometry is known.
 337  */
 338 static void
 339 vm_page_set_colors( void )
 340 {
 341         unsigned int    n, override;
 342
 343         if ( PE_parse_boot_argn("colors", &override, sizeof (override)) )               /* colors specified as a boot-arg? */
 344                 n = override;
 345         else if ( vm_cache_geometry_colors )                    /* do we know what the cache geometry is? */
 346                 n = vm_cache_geometry_colors;
 347         else    n = DEFAULT_COLORS;                             /* use default if all else fails */
 348
 349         if ( n == 0 )
 350                 n = 1;
 351         if ( n > MAX_COLORS )
 352                 n = MAX_COLORS;
 353
 354         /* the count must be a power of 2  */
 355         if ( ( n & (n - 1)) !=0  )
 356                 panic("vm_page_set_colors");
 357
 358         vm_colors = n;
 359         vm_color_mask = n - 1;
 360 }
 361
 362
 363 /*
 364  *      vm_page_bootstrap:
 365  *
 366  *      Initializes the resident memory module.
 367  *
 368  *      Allocates memory for the page cells, and
 369  *      for the object/offset-to-page hash table headers.
 370  *      Each page cell is initialized and placed on the free list.
 371  *      Returns the range of available kernel virtual memory.
 372  */
 373
 374 void
 375 vm_page_bootstrap(
 376         vm_offset_t             *startp,
 377         vm_offset_t             *endp)
 378 {
 379         register vm_page_t      m;
 380         unsigned int            i;
 381         unsigned int            log1;
 382         unsigned int            log2;
 383         unsigned int            size;
 384
 385         /*
 386          *      Initialize the vm_page template.
 387          */
 388
 389         m = &vm_page_template;
 390         m->object = VM_OBJECT_NULL;             /* reset later */
 391         m->offset = (vm_object_offset_t) -1;    /* reset later */
 392         m->wire_count = 0;
 393
 394         m->pageq.next = NULL;
 395         m->pageq.prev = NULL;
 396         m->listq.next = NULL;
 397         m->listq.prev = NULL;
 398
 399         m->speculative = FALSE;
 400         m->throttled = FALSE;
 401         m->inactive = FALSE;
 402         m->active = FALSE;
 403         m->no_cache = FALSE;
 404         m->laundry = FALSE;
 405         m->free = FALSE;
 406         m->pmapped = FALSE;
 407         m->wpmapped = FALSE;
 408         m->reference = FALSE;
 409         m->pageout = FALSE;
 410         m->dump_cleaning = FALSE;
 411         m->list_req_pending = FALSE;
 412
 413         m->busy = TRUE;
 414         m->wanted = FALSE;
 415         m->tabled = FALSE;
 416         m->fictitious = FALSE;
 417         m->private = FALSE;
 418         m->absent = FALSE;
 419         m->error = FALSE;
 420         m->dirty = FALSE;
 421         m->cleaning = FALSE;
 422         m->precious = FALSE;
 423         m->clustered = FALSE;
 424         m->unusual = FALSE;
 425         m->restart = FALSE;
 426         m->zero_fill = FALSE;
 427         m->encrypted = FALSE;
 428         m->encrypted_cleaning = FALSE;
 429         m->deactivated = FALSE;
 430
 431         m->phys_page = 0;               /* reset later */
 432
 433         /*
 434          *      Initialize the page queues.
 435          */
 436
 437         mutex_init(&vm_page_queue_free_lock, 0);
 438         mutex_init(&vm_page_queue_lock, 0);
 439
 440         mutex_init(&vm_purgeable_queue_lock, 0);
 441
 442         for (i = 0; i < PURGEABLE_Q_TYPE_MAX; i++) {
 443                 int group;
 444
 445                 purgeable_queues[i].token_q_head = 0;
 446                 purgeable_queues[i].token_q_tail = 0;
 447                 for (group = 0; group < NUM_VOLATILE_GROUPS; group++)
 448                         queue_init(&purgeable_queues[i].objq[group]);
 449
 450                 purgeable_queues[i].type = i;
 451                 purgeable_queues[i].new_pages = 0;
 452 #if MACH_ASSERT
 453                 purgeable_queues[i].debug_count_tokens = 0;
 454                 purgeable_queues[i].debug_count_objects = 0;
 455 #endif
 456         };
 457
 458         for (i = 0; i < MAX_COLORS; i++ )
 459                 queue_init(&vm_page_queue_free[i]);
 460         queue_init(&vm_lopage_queue_free);
 461         vm_page_queue_fictitious = VM_PAGE_NULL;
 462         queue_init(&vm_page_queue_active);
 463         queue_init(&vm_page_queue_inactive);
 464         queue_init(&vm_page_queue_throttled);
 465         queue_init(&vm_page_queue_zf);
 466
 467         for ( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) {
 468                 queue_init(&vm_page_queue_speculative[i].age_q);
 469
 470                 vm_page_queue_speculative[i].age_ts.tv_sec = 0;
 471                 vm_page_queue_speculative[i].age_ts.tv_nsec = 0;
 472         }
 473         vm_page_free_wanted = 0;
 474         vm_page_free_wanted_privileged = 0;
 475
 476         vm_page_set_colors();
 477
 478
 479         /*
 480          *      Steal memory for the map and zone subsystems.
 481          */
 482
 483         vm_map_steal_memory();
 484         zone_steal_memory();
 485
 486         /*
 487          *      Allocate (and initialize) the virtual-to-physical
 488          *      table hash buckets.
 489          *
 490          *      The number of buckets should be a power of two to
 491          *      get a good hash function.  The following computation
 492          *      chooses the first power of two that is greater
 493          *      than the number of physical pages in the system.
 494          */
 495
 496         simple_lock_init(&vm_page_bucket_lock, 0);
 497
 498         if (vm_page_bucket_count == 0) {
 499                 unsigned int npages = pmap_free_pages();
 500
 501                 vm_page_bucket_count = 1;
 502                 while (vm_page_bucket_count < npages)
 503                         vm_page_bucket_count <<= 1;
 504         }
 505
 506         vm_page_hash_mask = vm_page_bucket_count - 1;
 507
 508         /*
 509          *      Calculate object shift value for hashing algorithm:
 510          *              O = log2(sizeof(struct vm_object))
 511          *              B = log2(vm_page_bucket_count)
 512          *              hash shifts the object left by
 513          *              B/2 - O
 514          */
 515         size = vm_page_bucket_count;
 516         for (log1 = 0; size > 1; log1++)
 517                 size /= 2;
 518         size = sizeof(struct vm_object);
 519         for (log2 = 0; size > 1; log2++)
 520                 size /= 2;
 521         vm_page_hash_shift = log1/2 - log2 + 1;
 522
 523         vm_page_bucket_hash = 1 << ((log1 + 1) >> 1);           /* Get (ceiling of sqrt of table size) */
 524         vm_page_bucket_hash |= 1 << ((log1 + 1) >> 2);          /* Get (ceiling of quadroot of table size) */
 525         vm_page_bucket_hash |= 1;                                                       /* Set bit and add 1 - always must be 1 to insure unique series */
 526
 527         if (vm_page_hash_mask & vm_page_bucket_count)
 528                 printf("vm_page_bootstrap: WARNING -- strange page hash\n");
 529
 530         vm_page_buckets = (vm_page_bucket_t *)
 531                 pmap_steal_memory(vm_page_bucket_count *
 532                                   sizeof(vm_page_bucket_t));
 533
 534         for (i = 0; i < vm_page_bucket_count; i++) {
 535                 register vm_page_bucket_t *bucket = &vm_page_buckets[i];
 536
 537                 bucket->pages = VM_PAGE_NULL;
 538 #if     MACH_PAGE_HASH_STATS
 539                 bucket->cur_count = 0;
 540                 bucket->hi_count = 0;
 541 #endif /* MACH_PAGE_HASH_STATS */
 542         }
 543
 544         /*
 545          *      Machine-dependent code allocates the resident page table.
 546          *      It uses vm_page_init to initialize the page frames.
 547          *      The code also returns to us the virtual space available
 548          *      to the kernel.  We don't trust the pmap module
 549          *      to get the alignment right.
 550          */
 551
 552         pmap_startup(&virtual_space_start, &virtual_space_end);
 553         virtual_space_start = round_page(virtual_space_start);
 554         virtual_space_end = trunc_page(virtual_space_end);
 555
 556         *startp = virtual_space_start;
 557         *endp = virtual_space_end;
 558
 559         /*
 560          *      Compute the initial "wire" count.
 561          *      Up until now, the pages which have been set aside are not under
 562          *      the VM system's control, so although they aren't explicitly
 563          *      wired, they nonetheless can't be moved. At this moment,
 564          *      all VM managed pages are "free", courtesy of pmap_startup.
 565          */
 566         vm_page_wire_count = atop_64(max_mem) - vm_page_free_count;     /* initial value */
 567         vm_page_free_count_minimum = vm_page_free_count;
 568
 569         printf("vm_page_bootstrap: %d free pages and %d wired pages\n",
 570                vm_page_free_count, vm_page_wire_count);
 571
 572         simple_lock_init(&vm_paging_lock, 0);
 573 }
 574
 575 #ifndef MACHINE_PAGES
 576 /*
 577  *      We implement pmap_steal_memory and pmap_startup with the help
 578  *      of two simpler functions, pmap_virtual_space and pmap_next_page.
 579  */
 580
 581 void *
 582 pmap_steal_memory(
 583         vm_size_t size)
 584 {
 585         vm_offset_t addr, vaddr;
 586         ppnum_t phys_page;
 587
 588         /*
 589          *      We round the size to a round multiple.
 590          */
 591
 592         size = (size + sizeof (void *) - 1) &~ (sizeof (void *) - 1);
 593
 594         /*
 595          *      If this is the first call to pmap_steal_memory,
 596          *      we have to initialize ourself.
 597          */
 598
 599         if (virtual_space_start == virtual_space_end) {
 600                 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
 601
 602                 /*
 603                  *      The initial values must be aligned properly, and
 604                  *      we don't trust the pmap module to do it right.
 605                  */
 606
 607                 virtual_space_start = round_page(virtual_space_start);
 608                 virtual_space_end = trunc_page(virtual_space_end);
 609         }
 610
 611         /*
 612          *      Allocate virtual memory for this request.
 613          */
 614
 615         addr = virtual_space_start;
 616         virtual_space_start += size;
 617
 618         kprintf("pmap_steal_memory: %08X - %08X; size=%08X\n", addr, virtual_space_start, size);        /* (TEST/DEBUG) */
 619
 620         /*
 621          *      Allocate and map physical pages to back new virtual pages.
 622          */
 623
 624         for (vaddr = round_page(addr);
 625              vaddr < addr + size;
 626              vaddr += PAGE_SIZE) {
 627                 if (!pmap_next_page(&phys_page))
 628                         panic("pmap_steal_memory");
 629
 630                 /*
 631                  *      XXX Logically, these mappings should be wired,
 632                  *      but some pmap modules barf if they are.
 633                  */
 634
 635                 pmap_enter(kernel_pmap, vaddr, phys_page,
 636                            VM_PROT_READ|VM_PROT_WRITE,
 637                                 VM_WIMG_USE_DEFAULT, FALSE);
 638                 /*
 639                  * Account for newly stolen memory
 640                  */
 641                 vm_page_wire_count++;
 642
 643         }
 644
 645         return (void *) addr;
 646 }
 647
 648 void
 649 pmap_startup(
 650         vm_offset_t *startp,
 651         vm_offset_t *endp)
 652 {
 653         unsigned int i, npages, pages_initialized, fill, fillval;
 654         ppnum_t         phys_page;
 655         addr64_t        tmpaddr;
 656         unsigned int    num_of_lopages = 0;
 657         unsigned int    last_index;
 658
 659         /*
 660          *      We calculate how many page frames we will have
 661          *      and then allocate the page structures in one chunk.
 662          */
 663
 664         tmpaddr = (addr64_t)pmap_free_pages() * (addr64_t)PAGE_SIZE;    /* Get the amount of memory left */
 665         tmpaddr = tmpaddr + (addr64_t)(round_page_32(virtual_space_start) - virtual_space_start);       /* Account for any slop */
 666         npages = (unsigned int)(tmpaddr / (addr64_t)(PAGE_SIZE + sizeof(*vm_pages)));   /* Figure size of all vm_page_ts, including enough to hold the vm_page_ts */
 667
 668         vm_pages = (vm_page_t) pmap_steal_memory(npages * sizeof *vm_pages);
 669
 670         /*
 671          *      Initialize the page frames.
 672          */
 673         for (i = 0, pages_initialized = 0; i < npages; i++) {
 674                 if (!pmap_next_page(&phys_page))
 675                         break;
 676
 677                 vm_page_init(&vm_pages[i], phys_page);
 678                 vm_page_pages++;
 679                 pages_initialized++;
 680         }
 681         vm_pages_count = pages_initialized;
 682
 683         /*
 684          * Check if we want to initialize pages to a known value
 685          */
 686         fill = 0;                                                               /* Assume no fill */
 687         if (PE_parse_boot_argn("fill", &fillval, sizeof (fillval))) fill = 1;                   /* Set fill */
 688
 689
 690         /*
 691          * if vm_lopage_poolsize is non-zero, than we need to reserve
 692          * a pool of pages whose addresess are less than 4G... this pool
 693          * is used by drivers whose hardware can't DMA beyond 32 bits...
 694          *
 695          * note that I'm assuming that the page list is ascending and
 696          * ordered w/r to the physical address
 697          */
 698         for (i = 0, num_of_lopages = vm_lopage_poolsize; num_of_lopages && i < pages_initialized; num_of_lopages--, i++) {
 699                 vm_page_t m;
 700
 701                 m = &vm_pages[i];
 702
 703                 if (m->phys_page >= (1 << (32 - PAGE_SHIFT)))
 704                         panic("couldn't reserve the lopage pool: not enough lo pages\n");
 705
 706                 if (m->phys_page < vm_lopage_poolend)
 707                         panic("couldn't reserve the lopage pool: page list out of order\n");
 708
 709                 vm_lopage_poolend = m->phys_page;
 710
 711                 if (vm_lopage_poolstart == 0)
 712                         vm_lopage_poolstart = m->phys_page;
 713                 else {
 714                         if (m->phys_page < vm_lopage_poolstart)
 715                                 panic("couldn't reserve the lopage pool: page list out of order\n");
 716                 }
 717
 718                 if (fill)
 719                         fillPage(m->phys_page, fillval);                /* Fill the page with a know value if requested at boot */
 720
 721                 vm_page_release(m);
 722         }
 723         last_index = i;
 724
 725         // -debug code remove
 726         if (2 == vm_himemory_mode) {
 727                 // free low -> high so high is preferred
 728                 for (i = last_index + 1; i <= pages_initialized; i++) {
 729                         if(fill) fillPage(vm_pages[i - 1].phys_page, fillval);          /* Fill the page with a know value if requested at boot */
 730                         vm_page_release(&vm_pages[i - 1]);
 731                 }
 732         }
 733         else
 734         // debug code remove-
 735
 736         /*
 737          * Release pages in reverse order so that physical pages
 738          * initially get allocated in ascending addresses. This keeps
 739          * the devices (which must address physical memory) happy if
 740          * they require several consecutive pages.
 741          */
 742         for (i = pages_initialized; i > last_index; i--) {
 743                 if(fill) fillPage(vm_pages[i - 1].phys_page, fillval);          /* Fill the page with a know value if requested at boot */
 744                 vm_page_release(&vm_pages[i - 1]);
 745         }
 746
 747 #if 0
 748         {
 749                 vm_page_t xx, xxo, xxl;
 750                 int i, j, k, l;
 751
 752                 j = 0;                                                                                                  /* (BRINGUP) */
 753                 xxl = 0;
 754
 755                 for( i = 0; i < vm_colors; i++ ) {
 756                         queue_iterate(&vm_page_queue_free[i],
 757                                       xx,
 758                                       vm_page_t,
 759                                       pageq) {  /* BRINGUP */
 760                                 j++;                                                                                            /* (BRINGUP) */
 761                                 if(j > vm_page_free_count) {                                            /* (BRINGUP) */
 762                                         panic("pmap_startup: too many pages, xx = %08X, xxl = %08X\n", xx, xxl);
 763                                 }
 764
 765                                 l = vm_page_free_count - j;                                                     /* (BRINGUP) */
 766                                 k = 0;                                                                                          /* (BRINGUP) */
 767
 768                                 if(((j - 1) & 0xFFFF) == 0) kprintf("checking number %d of %d\n", j, vm_page_free_count);
 769
 770                                 for(xxo = xx->pageq.next; xxo != &vm_page_queue_free[i]; xxo = xxo->pageq.next) {       /* (BRINGUP) */
 771                                         k++;
 772                                         if(k > l) panic("pmap_startup: too many in secondary check %d %d\n", k, l);
 773                                         if((xx->phys_page & 0xFFFFFFFF) == (xxo->phys_page & 0xFFFFFFFF)) {     /* (BRINGUP) */
 774                                                 panic("pmap_startup: duplicate physaddr, xx = %08X, xxo = %08X\n", xx, xxo);
 775                                         }
 776                                 }
 777
 778                                 xxl = xx;
 779                         }
 780                 }
 781
 782                 if(j != vm_page_free_count) {                                           /* (BRINGUP) */
 783                         panic("pmap_startup: vm_page_free_count does not match, calc =  %d, vm_page_free_count = %08X\n", j, vm_page_free_count);
 784                 }
 785         }
 786 #endif
 787
 788
 789         /*
 790          *      We have to re-align virtual_space_start,
 791          *      because pmap_steal_memory has been using it.
 792          */
 793
 794         virtual_space_start = round_page_32(virtual_space_start);
 795
 796         *startp = virtual_space_start;
 797         *endp = virtual_space_end;
 798 }
 799 #endif  /* MACHINE_PAGES */
 800
 801 /*
 802  *      Routine:        vm_page_module_init
 803  *      Purpose:
 804  *              Second initialization pass, to be done after
 805  *              the basic VM system is ready.
 806  */
 807 void
 808 vm_page_module_init(void)
 809 {
 810         vm_page_zone = zinit((vm_size_t) sizeof(struct vm_page),
 811                              0, PAGE_SIZE, "vm pages");
 812
 813 #if     ZONE_DEBUG
 814         zone_debug_disable(vm_page_zone);
 815 #endif  /* ZONE_DEBUG */
 816
 817         zone_change(vm_page_zone, Z_EXPAND, FALSE);
 818         zone_change(vm_page_zone, Z_EXHAUST, TRUE);
 819         zone_change(vm_page_zone, Z_FOREIGN, TRUE);
 820
 821         /*
 822          * Adjust zone statistics to account for the real pages allocated
 823          * in vm_page_create(). [Q: is this really what we want?]
 824          */
 825         vm_page_zone->count += vm_page_pages;
 826         vm_page_zone->cur_size += vm_page_pages * vm_page_zone->elem_size;
 827
 828         mutex_init(&vm_page_alloc_lock, 0);
 829 }
 830
 831 /*
 832  *      Routine:        vm_page_create
 833  *      Purpose:
 834  *              After the VM system is up, machine-dependent code
 835  *              may stumble across more physical memory.  For example,
 836  *              memory that it was reserving for a frame buffer.
 837  *              vm_page_create turns this memory into available pages.
 838  */
 839
 840 void
 841 vm_page_create(
 842         ppnum_t start,
 843         ppnum_t end)
 844 {
 845         ppnum_t         phys_page;
 846         vm_page_t       m;
 847
 848         for (phys_page = start;
 849              phys_page < end;
 850              phys_page++) {
 851                 while ((m = (vm_page_t) vm_page_grab_fictitious())
 852                         == VM_PAGE_NULL)
 853                         vm_page_more_fictitious();
 854
 855                 vm_page_init(m, phys_page);
 856                 vm_page_pages++;
 857                 vm_page_release(m);
 858         }
 859 }
 860
 861 /*
 862  *      vm_page_hash:
 863  *
 864  *      Distributes the object/offset key pair among hash buckets.
 865  *
 866  *      NOTE:   The bucket count must be a power of 2
 867  */
 868 #define vm_page_hash(object, offset) (\
 869         ( (natural_t)((uint32_t)object * vm_page_bucket_hash) + ((uint32_t)atop_64(offset) ^ vm_page_bucket_hash))\
 870          & vm_page_hash_mask)
 871
 872
 873 /*
 874  *      vm_page_insert:         [ internal use only ]
 875  *
 876  *      Inserts the given mem entry into the object/object-page
 877  *      table and object list.
 878  *
 879  *      The object must be locked.
 880  */
 881 void
 882 vm_page_insert(
 883         vm_page_t               mem,
 884         vm_object_t             object,
 885         vm_object_offset_t      offset)
 886 {
 887         vm_page_insert_internal(mem, object, offset, FALSE);
 888 }
 889
 890
 891 void
 892 vm_page_insert_internal(
 893         vm_page_t               mem,
 894         vm_object_t             object,
 895         vm_object_offset_t      offset,
 896         boolean_t       queues_lock_held)
 897 {
 898         register vm_page_bucket_t *bucket;
 899
 900         XPR(XPR_VM_PAGE,
 901                 "vm_page_insert, object 0x%X offset 0x%X page 0x%X\n",
 902                 (integer_t)object, (integer_t)offset, (integer_t)mem, 0,0);
 903
 904         VM_PAGE_CHECK(mem);
 905
 906         if (object == vm_submap_object) {
 907                 /* the vm_submap_object is only a placeholder for submaps */
 908                 panic("vm_page_insert(vm_submap_object,0x%llx)\n", offset);
 909         }
 910
 911         vm_object_lock_assert_exclusive(object);
 912 #if DEBUG
 913         if (mem->tabled || mem->object != VM_OBJECT_NULL)
 914                 panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) "
 915                       "already in (obj=%p,off=0x%llx)",
 916                       mem, object, offset, mem->object, mem->offset);
 917 #endif
 918         assert(!object->internal || offset < object->size);
 919
 920         /* only insert "pageout" pages into "pageout" objects,
 921          * and normal pages into normal objects */
 922         assert(object->pageout == mem->pageout);
 923
 924         assert(vm_page_lookup(object, offset) == VM_PAGE_NULL);
 925
 926         /*
 927          *      Record the object/offset pair in this page
 928          */
 929
 930         mem->object = object;
 931         mem->offset = offset;
 932
 933         /*
 934          *      Insert it into the object_object/offset hash table
 935          */
 936
 937         bucket = &vm_page_buckets[vm_page_hash(object, offset)];
 938         simple_lock(&vm_page_bucket_lock);
 939         mem->next = bucket->pages;
 940         bucket->pages = mem;
 941 #if     MACH_PAGE_HASH_STATS
 942         if (++bucket->cur_count > bucket->hi_count)
 943                 bucket->hi_count = bucket->cur_count;
 944 #endif /* MACH_PAGE_HASH_STATS */
 945         simple_unlock(&vm_page_bucket_lock);
 946
 947         /*
 948          *      Now link into the object's list of backed pages.
 949          */
 950
 951         VM_PAGE_INSERT(mem, object);
 952         mem->tabled = TRUE;
 953
 954         /*
 955          *      Show that the object has one more resident page.
 956          */
 957
 958         object->resident_page_count++;
 959
 960         if (object->purgable == VM_PURGABLE_VOLATILE) {
 961                 if (queues_lock_held == FALSE)
 962                         vm_page_lockspin_queues();
 963
 964                 vm_page_purgeable_count++;
 965
 966                 if (queues_lock_held == FALSE)
 967                         vm_page_unlock_queues();
 968         } else if (object->purgable == VM_PURGABLE_EMPTY &&
 969                    mem->throttled) {
 970                 if (queues_lock_held == FALSE)
 971                         vm_page_lock_queues();
 972                 vm_page_deactivate(mem);
 973                 if (queues_lock_held == FALSE)
 974                         vm_page_unlock_queues();
 975         }
 976 }
 977
 978 /*
 979  *      vm_page_replace:
 980  *
 981  *      Exactly like vm_page_insert, except that we first
 982  *      remove any existing page at the given offset in object.
 983  *
 984  *      The object and page queues must be locked.
 985  */
 986
 987 void
 988 vm_page_replace(
 989         register vm_page_t              mem,
 990         register vm_object_t            object,
 991         register vm_object_offset_t     offset)
 992 {
 993         vm_page_bucket_t *bucket;
 994         vm_page_t        found_m = VM_PAGE_NULL;
 995
 996         VM_PAGE_CHECK(mem);
 997         vm_object_lock_assert_exclusive(object);
 998 #if DEBUG
 999         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
1000
1001         if (mem->tabled || mem->object != VM_OBJECT_NULL)
1002                 panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) "
1003                       "already in (obj=%p,off=0x%llx)",
1004                       mem, object, offset, mem->object, mem->offset);
1005 #endif
1006         /*
1007          *      Record the object/offset pair in this page
1008          */
1009
1010         mem->object = object;
1011         mem->offset = offset;
1012
1013         /*
1014          *      Insert it into the object_object/offset hash table,
1015          *      replacing any page that might have been there.
1016          */
1017
1018         bucket = &vm_page_buckets[vm_page_hash(object, offset)];
1019         simple_lock(&vm_page_bucket_lock);
1020
1021         if (bucket->pages) {
1022                 vm_page_t *mp = &bucket->pages;
1023                 register vm_page_t m = *mp;
1024
1025                 do {
1026                         if (m->object == object && m->offset == offset) {
1027                                 /*
1028                                  * Remove old page from hash list
1029                                  */
1030                                 *mp = m->next;
1031
1032                                 found_m = m;
1033                                 break;
1034                         }
1035                         mp = &m->next;
1036                 } while ((m = *mp));
1037
1038                 mem->next = bucket->pages;
1039         } else {
1040                 mem->next = VM_PAGE_NULL;
1041         }
1042         /*
1043          * insert new page at head of hash list
1044          */
1045         bucket->pages = mem;
1046
1047         simple_unlock(&vm_page_bucket_lock);
1048
1049         if (found_m) {
1050                 /*
1051                  * there was already a page at the specified
1052                  * offset for this object... remove it from
1053                  * the object and free it back to the free list
1054                  */
1055                 VM_PAGE_REMOVE(found_m);
1056                 found_m->tabled = FALSE;
1057
1058                 found_m->object = VM_OBJECT_NULL;
1059                 found_m->offset = (vm_object_offset_t) -1;
1060                 object->resident_page_count--;
1061
1062                 if (object->purgable == VM_PURGABLE_VOLATILE) {
1063                         assert(vm_page_purgeable_count > 0);
1064                         vm_page_purgeable_count--;
1065                 }
1066
1067                 /*
1068                  * Return page to the free list.
1069                  * Note the page is not tabled now
1070                  */
1071                 vm_page_free(found_m);
1072         }
1073         /*
1074          *      Now link into the object's list of backed pages.
1075          */
1076
1077         VM_PAGE_INSERT(mem, object);
1078         mem->tabled = TRUE;
1079
1080         /*
1081          *      And show that the object has one more resident
1082          *      page.
1083          */
1084
1085         object->resident_page_count++;
1086
1087         if (object->purgable == VM_PURGABLE_VOLATILE) {
1088                 vm_page_purgeable_count++;
1089         } else if (object->purgable == VM_PURGABLE_EMPTY) {
1090                 if (mem->throttled) {
1091                         vm_page_deactivate(mem);
1092                 }
1093         }
1094 }
1095
1096 /*
1097  *      vm_page_remove:         [ internal use only ]
1098  *
1099  *      Removes the given mem entry from the object/offset-page
1100  *      table and the object page list.
1101  *
1102  *      The object and page queues must be locked.
1103  */
1104
1105 void
1106 vm_page_remove(
1107         register vm_page_t      mem)
1108 {
1109         register vm_page_bucket_t       *bucket;
1110         register vm_page_t      this;
1111
1112         XPR(XPR_VM_PAGE,
1113                 "vm_page_remove, object 0x%X offset 0x%X page 0x%X\n",
1114                 (integer_t)mem->object, (integer_t)mem->offset,
1115                 (integer_t)mem, 0,0);
1116 #if DEBUG
1117         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
1118 #endif
1119         vm_object_lock_assert_exclusive(mem->object);
1120         assert(mem->tabled);
1121         assert(!mem->cleaning);
1122         VM_PAGE_CHECK(mem);
1123
1124
1125         /*
1126          *      Remove from the object_object/offset hash table
1127          */
1128
1129         bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->offset)];
1130         simple_lock(&vm_page_bucket_lock);
1131         if ((this = bucket->pages) == mem) {
1132                 /* optimize for common case */
1133
1134                 bucket->pages = mem->next;
1135         } else {
1136                 register vm_page_t      *prev;
1137
1138                 for (prev = &this->next;
1139                      (this = *prev) != mem;
1140                      prev = &this->next)
1141                         continue;
1142                 *prev = this->next;
1143         }
1144 #if     MACH_PAGE_HASH_STATS
1145         bucket->cur_count--;
1146 #endif /* MACH_PAGE_HASH_STATS */
1147         simple_unlock(&vm_page_bucket_lock);
1148
1149         /*
1150          *      Now remove from the object's list of backed pages.
1151          */
1152
1153         VM_PAGE_REMOVE(mem);
1154
1155         /*
1156          *      And show that the object has one fewer resident
1157          *      page.
1158          */
1159
1160         mem->object->resident_page_count--;
1161
1162         if (mem->object->purgable == VM_PURGABLE_VOLATILE) {
1163                 assert(vm_page_purgeable_count > 0);
1164                 vm_page_purgeable_count--;
1165         }
1166         mem->tabled = FALSE;
1167         mem->object = VM_OBJECT_NULL;
1168         mem->offset = (vm_object_offset_t) -1;
1169 }
1170
1171 /*
1172  *      vm_page_lookup:
1173  *
1174  *      Returns the page associated with the object/offset
1175  *      pair specified; if none is found, VM_PAGE_NULL is returned.
1176  *
1177  *      The object must be locked.  No side effects.
1178  */
1179
1180 unsigned long vm_page_lookup_hint = 0;
1181 unsigned long vm_page_lookup_hint_next = 0;
1182 unsigned long vm_page_lookup_hint_prev = 0;
1183 unsigned long vm_page_lookup_hint_miss = 0;
1184 unsigned long vm_page_lookup_bucket_NULL = 0;
1185 unsigned long vm_page_lookup_miss = 0;
1186
1187
1188 vm_page_t
1189 vm_page_lookup(
1190         register vm_object_t            object,
1191         register vm_object_offset_t     offset)
1192 {
1193         register vm_page_t      mem;
1194         register vm_page_bucket_t *bucket;
1195         queue_entry_t           qe;
1196
1197         vm_object_lock_assert_held(object);
1198         mem = object->memq_hint;
1199
1200         if (mem != VM_PAGE_NULL) {
1201                 assert(mem->object == object);
1202
1203                 if (mem->offset == offset) {
1204                         vm_page_lookup_hint++;
1205                         return mem;
1206                 }
1207                 qe = queue_next(&mem->listq);
1208
1209                 if (! queue_end(&object->memq, qe)) {
1210                         vm_page_t       next_page;
1211
1212                         next_page = (vm_page_t) qe;
1213                         assert(next_page->object == object);
1214
1215                         if (next_page->offset == offset) {
1216                                 vm_page_lookup_hint_next++;
1217                                 object->memq_hint = next_page; /* new hint */
1218                                 return next_page;
1219                         }
1220                 }
1221                 qe = queue_prev(&mem->listq);
1222
1223                 if (! queue_end(&object->memq, qe)) {
1224                         vm_page_t prev_page;
1225
1226                         prev_page = (vm_page_t) qe;
1227                         assert(prev_page->object == object);
1228
1229                         if (prev_page->offset == offset) {
1230                                 vm_page_lookup_hint_prev++;
1231                                 object->memq_hint = prev_page; /* new hint */
1232                                 return prev_page;
1233                         }
1234                 }
1235         }
1236         /*
1237          * Search the hash table for this object/offset pair
1238          */
1239         bucket = &vm_page_buckets[vm_page_hash(object, offset)];
1240
1241         /*
1242          * since we hold the object lock, we are guaranteed that no
1243          * new pages can be inserted into this object... this in turn
1244          * guarantess that the page we're looking for can't exist
1245          * if the bucket it hashes to is currently NULL even when looked
1246          * at outside the scope of the hash bucket lock... this is a
1247          * really cheap optimiztion to avoid taking the lock
1248          */
1249         if (bucket->pages == VM_PAGE_NULL) {
1250                 vm_page_lookup_bucket_NULL++;
1251
1252                 return (VM_PAGE_NULL);
1253         }
1254         simple_lock(&vm_page_bucket_lock);
1255
1256         for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem->next) {
1257                 VM_PAGE_CHECK(mem);
1258                 if ((mem->object == object) && (mem->offset == offset))
1259                         break;
1260         }
1261         simple_unlock(&vm_page_bucket_lock);
1262
1263         if (mem != VM_PAGE_NULL) {
1264                 if (object->memq_hint != VM_PAGE_NULL) {
1265                         vm_page_lookup_hint_miss++;
1266                 }
1267                 assert(mem->object == object);
1268                 object->memq_hint = mem;
1269         } else
1270                 vm_page_lookup_miss++;
1271
1272         return(mem);
1273 }
1274
1275
1276 /*
1277  *      vm_page_rename:
1278  *
1279  *      Move the given memory entry from its
1280  *      current object to the specified target object/offset.
1281  *
1282  *      The object must be locked.
1283  */
1284 void
1285 vm_page_rename(
1286         register vm_page_t              mem,
1287         register vm_object_t            new_object,
1288         vm_object_offset_t              new_offset,
1289         boolean_t                       encrypted_ok)
1290 {
1291         assert(mem->object != new_object);
1292
1293         /*
1294          * ENCRYPTED SWAP:
1295          * The encryption key is based on the page's memory object
1296          * (aka "pager") and paging offset.  Moving the page to
1297          * another VM object changes its "pager" and "paging_offset"
1298          * so it has to be decrypted first, or we would lose the key.
1299          *
1300          * One exception is VM object collapsing, where we transfer pages
1301          * from one backing object to its parent object.  This operation also
1302          * transfers the paging information, so the <pager,paging_offset> info
1303          * should remain consistent.  The caller (vm_object_do_collapse())
1304          * sets "encrypted_ok" in this case.
1305          */
1306         if (!encrypted_ok && mem->encrypted) {
1307                 panic("vm_page_rename: page %p is encrypted\n", mem);
1308         }
1309
1310         /*
1311          *      Changes to mem->object require the page lock because
1312          *      the pageout daemon uses that lock to get the object.
1313          */
1314
1315         XPR(XPR_VM_PAGE,
1316                 "vm_page_rename, new object 0x%X, offset 0x%X page 0x%X\n",
1317                 (integer_t)new_object, (integer_t)new_offset,
1318                 (integer_t)mem, 0,0);
1319
1320         vm_page_lockspin_queues();
1321         vm_page_remove(mem);
1322         vm_page_insert(mem, new_object, new_offset);
1323         vm_page_unlock_queues();
1324 }
1325
1326 /*
1327  *      vm_page_init:
1328  *
1329  *      Initialize the fields in a new page.
1330  *      This takes a structure with random values and initializes it
1331  *      so that it can be given to vm_page_release or vm_page_insert.
1332  */
1333 void
1334 vm_page_init(
1335         vm_page_t       mem,
1336         ppnum_t phys_page)
1337 {
1338         assert(phys_page);
1339         *mem = vm_page_template;
1340         mem->phys_page = phys_page;
1341 }
1342
1343 /*
1344  *      vm_page_grab_fictitious:
1345  *
1346  *      Remove a fictitious page from the free list.
1347  *      Returns VM_PAGE_NULL if there are no free pages.
1348  */
1349 int     c_vm_page_grab_fictitious = 0;
1350 int     c_vm_page_release_fictitious = 0;
1351 int     c_vm_page_more_fictitious = 0;
1352
1353 extern vm_page_t vm_page_grab_fictitious_common(vm_offset_t phys_addr);
1354
1355 vm_page_t
1356 vm_page_grab_fictitious_common(
1357         vm_offset_t phys_addr)
1358 {
1359         register vm_page_t m;
1360
1361         m = (vm_page_t)zget(vm_page_zone);
1362         if (m) {
1363                 vm_page_init(m, phys_addr);
1364                 m->fictitious = TRUE;
1365         }
1366
1367         c_vm_page_grab_fictitious++;
1368         return m;
1369 }
1370
1371 vm_page_t
1372 vm_page_grab_fictitious(void)
1373 {
1374         return vm_page_grab_fictitious_common(vm_page_fictitious_addr);
1375 }
1376
1377 vm_page_t
1378 vm_page_grab_guard(void)
1379 {
1380         return vm_page_grab_fictitious_common(vm_page_guard_addr);
1381 }
1382
1383 /*
1384  *      vm_page_release_fictitious:
1385  *
1386  *      Release a fictitious page to the free list.
1387  */
1388
1389 void
1390 vm_page_release_fictitious(
1391         register vm_page_t m)
1392 {
1393         assert(!m->free);
1394         assert(m->busy);
1395         assert(m->fictitious);
1396         assert(m->phys_page == vm_page_fictitious_addr ||
1397                m->phys_page == vm_page_guard_addr);
1398
1399         c_vm_page_release_fictitious++;
1400 #if DEBUG
1401         if (m->free)
1402                 panic("vm_page_release_fictitious");
1403 #endif
1404         m->free = TRUE;
1405         zfree(vm_page_zone, m);
1406 }
1407
1408 /*
1409  *      vm_page_more_fictitious:
1410  *
1411  *      Add more fictitious pages to the free list.
1412  *      Allowed to block. This routine is way intimate
1413  *      with the zones code, for several reasons:
1414  *      1. we need to carve some page structures out of physical
1415  *         memory before zones work, so they _cannot_ come from
1416  *         the zone_map.
1417  *      2. the zone needs to be collectable in order to prevent
1418  *         growth without bound. These structures are used by
1419  *         the device pager (by the hundreds and thousands), as
1420  *         private pages for pageout, and as blocking pages for
1421  *         pagein. Temporary bursts in demand should not result in
1422  *         permanent allocation of a resource.
1423  *      3. To smooth allocation humps, we allocate single pages
1424  *         with kernel_memory_allocate(), and cram them into the
1425  *         zone. This also allows us to initialize the vm_page_t's
1426  *         on the way into the zone, so that zget() always returns
1427  *         an initialized structure. The zone free element pointer
1428  *         and the free page pointer are both the first item in the
1429  *         vm_page_t.
1430  *      4. By having the pages in the zone pre-initialized, we need
1431  *         not keep 2 levels of lists. The garbage collector simply
1432  *         scans our list, and reduces physical memory usage as it
1433  *         sees fit.
1434  */
1435
1436 void vm_page_more_fictitious(void)
1437 {
1438         register vm_page_t m;
1439         vm_offset_t addr;
1440         kern_return_t retval;
1441         int i;
1442
1443         c_vm_page_more_fictitious++;
1444
1445         /*
1446          * Allocate a single page from the zone_map. Do not wait if no physical
1447          * pages are immediately available, and do not zero the space. We need
1448          * our own blocking lock here to prevent having multiple,
1449          * simultaneous requests from piling up on the zone_map lock. Exactly
1450          * one (of our) threads should be potentially waiting on the map lock.
1451          * If winner is not vm-privileged, then the page allocation will fail,
1452          * and it will temporarily block here in the vm_page_wait().
1453          */
1454         mutex_lock(&vm_page_alloc_lock);
1455         /*
1456          * If another thread allocated space, just bail out now.
1457          */
1458         if (zone_free_count(vm_page_zone) > 5) {
1459                 /*
1460                  * The number "5" is a small number that is larger than the
1461                  * number of fictitious pages that any single caller will
1462                  * attempt to allocate. Otherwise, a thread will attempt to
1463                  * acquire a fictitious page (vm_page_grab_fictitious), fail,
1464                  * release all of the resources and locks already acquired,
1465                  * and then call this routine. This routine finds the pages
1466                  * that the caller released, so fails to allocate new space.
1467                  * The process repeats infinitely. The largest known number
1468                  * of fictitious pages required in this manner is 2. 5 is
1469                  * simply a somewhat larger number.
1470                  */
1471                 mutex_unlock(&vm_page_alloc_lock);
1472                 return;
1473         }
1474
1475         retval = kernel_memory_allocate(zone_map,
1476                                         &addr, PAGE_SIZE, VM_PROT_ALL,
1477                                         KMA_KOBJECT|KMA_NOPAGEWAIT);
1478         if (retval != KERN_SUCCESS) {
1479                 /*
1480                  * No page was available. Tell the pageout daemon, drop the
1481                  * lock to give another thread a chance at it, and
1482                  * wait for the pageout daemon to make progress.
1483                  */
1484                 mutex_unlock(&vm_page_alloc_lock);
1485                 vm_page_wait(THREAD_UNINT);
1486                 return;
1487         }
1488         /*
1489          * Initialize as many vm_page_t's as will fit on this page. This
1490          * depends on the zone code disturbing ONLY the first item of
1491          * each zone element.
1492          */
1493         m = (vm_page_t)addr;
1494         for (i = PAGE_SIZE/sizeof(struct vm_page); i > 0; i--) {
1495                 vm_page_init(m, vm_page_fictitious_addr);
1496                 m->fictitious = TRUE;
1497                 m++;
1498         }
1499         zcram(vm_page_zone, (void *) addr, PAGE_SIZE);
1500         mutex_unlock(&vm_page_alloc_lock);
1501 }
1502
1503
1504 /*
1505  *      vm_pool_low():
1506  *
1507  *      Return true if it is not likely that a non-vm_privileged thread
1508  *      can get memory without blocking.  Advisory only, since the
1509  *      situation may change under us.
1510  */
1511 int
1512 vm_pool_low(void)
1513 {
1514         /* No locking, at worst we will fib. */
1515         return( vm_page_free_count < vm_page_free_reserved );
1516 }
1517
1518
1519
1520 /*
1521  * this is an interface to support bring-up of drivers
1522  * on platforms with physical memory > 4G...
1523  */
1524 int             vm_himemory_mode = 0;
1525
1526
1527 /*
1528  * this interface exists to support hardware controllers
1529  * incapable of generating DMAs with more than 32 bits
1530  * of address on platforms with physical memory > 4G...
1531  */
1532 unsigned int    vm_lopage_free_count = 0;
1533 unsigned int    vm_lopage_max_count = 0;
1534 queue_head_t    vm_lopage_queue_free;
1535
1536 vm_page_t
1537 vm_page_grablo(void)
1538 {
1539         register vm_page_t      mem;
1540         unsigned int vm_lopage_alloc_count;
1541
1542         if (vm_lopage_poolsize == 0)
1543                 return (vm_page_grab());
1544
1545         mutex_lock(&vm_page_queue_free_lock);
1546
1547         if (! queue_empty(&vm_lopage_queue_free)) {
1548                 queue_remove_first(&vm_lopage_queue_free,
1549                                    mem,
1550                                    vm_page_t,
1551                                    pageq);
1552                 assert(mem->free);
1553                 assert(mem->busy);
1554                 assert(!mem->pmapped);
1555                 assert(!mem->wpmapped);
1556
1557                 mem->pageq.next = NULL;
1558                 mem->pageq.prev = NULL;
1559                 mem->free = FALSE;
1560
1561                 vm_lopage_free_count--;
1562                 vm_lopage_alloc_count = (vm_lopage_poolend - vm_lopage_poolstart) - vm_lopage_free_count;
1563                 if (vm_lopage_alloc_count > vm_lopage_max_count)
1564                         vm_lopage_max_count = vm_lopage_alloc_count;
1565         } else {
1566                 mem = VM_PAGE_NULL;
1567         }
1568         mutex_unlock(&vm_page_queue_free_lock);
1569
1570         return (mem);
1571 }
1572
1573
1574 /*
1575  *      vm_page_grab:
1576  *
1577  *      first try to grab a page from the per-cpu free list...
1578  *      this must be done while pre-emption is disabled... if
1579  *      a page is available, we're done...
1580  *      if no page is available, grab the vm_page_queue_free_lock
1581  *      and see if current number of free pages would allow us
1582  *      to grab at least 1... if not, return VM_PAGE_NULL as before...
1583  *      if there are pages available, disable preemption and
1584  *      recheck the state of the per-cpu free list... we could
1585  *      have been preempted and moved to a different cpu, or
1586  *      some other thread could have re-filled it... if still
1587  *      empty, figure out how many pages we can steal from the
1588  *      global free queue and move to the per-cpu queue...
1589  *      return 1 of these pages when done... only wakeup the
1590  *      pageout_scan thread if we moved pages from the global
1591  *      list... no need for the wakeup if we've satisfied the
1592  *      request from the per-cpu queue.
1593  */
1594
1595 #define COLOR_GROUPS_TO_STEAL   4
1596
1597
1598 vm_page_t
1599 vm_page_grab( void )
1600 {
1601         vm_page_t       mem;
1602
1603
1604         disable_preemption();
1605
1606         if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) {
1607 return_page_from_cpu_list:
1608                 PROCESSOR_DATA(current_processor(), page_grab_count) += 1;
1609                 PROCESSOR_DATA(current_processor(), free_pages) = mem->pageq.next;
1610                 mem->pageq.next = NULL;
1611
1612                 enable_preemption();
1613
1614                 assert(mem->listq.next == NULL && mem->listq.prev == NULL);
1615                 assert(mem->tabled == FALSE);
1616                 assert(mem->object == VM_OBJECT_NULL);
1617                 assert(!mem->laundry);
1618                 assert(!mem->free);
1619                 assert(pmap_verify_free(mem->phys_page));
1620                 assert(mem->busy);
1621                 assert(!mem->encrypted);
1622                 assert(!mem->pmapped);
1623                 assert(!mem->wpmapped);
1624
1625                 return mem;
1626         }
1627         enable_preemption();
1628
1629
1630         mutex_lock(&vm_page_queue_free_lock);
1631
1632         /*
1633          *      Optionally produce warnings if the wire or gobble
1634          *      counts exceed some threshold.
1635          */
1636         if (vm_page_wire_count_warning > 0
1637             && vm_page_wire_count >= vm_page_wire_count_warning) {
1638                 printf("mk: vm_page_grab(): high wired page count of %d\n",
1639                         vm_page_wire_count);
1640                 assert(vm_page_wire_count < vm_page_wire_count_warning);
1641         }
1642         if (vm_page_gobble_count_warning > 0
1643             && vm_page_gobble_count >= vm_page_gobble_count_warning) {
1644                 printf("mk: vm_page_grab(): high gobbled page count of %d\n",
1645                         vm_page_gobble_count);
1646                 assert(vm_page_gobble_count < vm_page_gobble_count_warning);
1647         }
1648
1649         /*
1650          *      Only let privileged threads (involved in pageout)
1651          *      dip into the reserved pool.
1652          */
1653         if ((vm_page_free_count < vm_page_free_reserved) &&
1654             !(current_thread()->options & TH_OPT_VMPRIV)) {
1655                 mutex_unlock(&vm_page_queue_free_lock);
1656                 mem = VM_PAGE_NULL;
1657         }
1658         else {
1659                vm_page_t        head;
1660                vm_page_t        tail;
1661                unsigned int     pages_to_steal;
1662                unsigned int     color;
1663
1664                while ( vm_page_free_count == 0 ) {
1665
1666                         mutex_unlock(&vm_page_queue_free_lock);
1667                         /*
1668                          * must be a privileged thread to be
1669                          * in this state since a non-privileged
1670                          * thread would have bailed if we were
1671                          * under the vm_page_free_reserved mark
1672                          */
1673                         VM_PAGE_WAIT();
1674                         mutex_lock(&vm_page_queue_free_lock);
1675                 }
1676
1677                 disable_preemption();
1678
1679                 if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) {
1680                         mutex_unlock(&vm_page_queue_free_lock);
1681
1682                         /*
1683                          * we got preempted and moved to another processor
1684                          * or we got preempted and someone else ran and filled the cache
1685                          */
1686                         goto return_page_from_cpu_list;
1687                 }
1688                 if (vm_page_free_count <= vm_page_free_reserved)
1689                         pages_to_steal = 1;
1690                 else {
1691                         pages_to_steal = COLOR_GROUPS_TO_STEAL * vm_colors;
1692
1693                         if (pages_to_steal > (vm_page_free_count - vm_page_free_reserved))
1694                                 pages_to_steal = (vm_page_free_count - vm_page_free_reserved);
1695                 }
1696                 color = PROCESSOR_DATA(current_processor(), start_color);
1697                 head = tail = NULL;
1698
1699                 while (pages_to_steal--) {
1700                         if (--vm_page_free_count < vm_page_free_count_minimum)
1701                                 vm_page_free_count_minimum = vm_page_free_count;
1702
1703                         while (queue_empty(&vm_page_queue_free[color]))
1704                                 color = (color + 1) & vm_color_mask;
1705
1706                         queue_remove_first(&vm_page_queue_free[color],
1707                                            mem,
1708                                            vm_page_t,
1709                                            pageq);
1710                         mem->pageq.next = NULL;
1711                         mem->pageq.prev = NULL;
1712
1713                         color = (color + 1) & vm_color_mask;
1714
1715                         if (head == NULL)
1716                                 head = mem;
1717                         else
1718                                 tail->pageq.next = (queue_t)mem;
1719                         tail = mem;
1720
1721                         mem->pageq.prev = NULL;
1722                         assert(mem->listq.next == NULL && mem->listq.prev == NULL);
1723                         assert(mem->tabled == FALSE);
1724                         assert(mem->object == VM_OBJECT_NULL);
1725                         assert(!mem->laundry);
1726                         assert(mem->free);
1727                         mem->free = FALSE;
1728
1729                         assert(pmap_verify_free(mem->phys_page));
1730                         assert(mem->busy);
1731                         assert(!mem->free);
1732                         assert(!mem->encrypted);
1733                         assert(!mem->pmapped);
1734                         assert(!mem->wpmapped);
1735                 }
1736                 PROCESSOR_DATA(current_processor(), free_pages) = head->pageq.next;
1737                 PROCESSOR_DATA(current_processor(), start_color) = color;
1738
1739                 /*
1740                  * satisfy this request
1741                  */
1742                 PROCESSOR_DATA(current_processor(), page_grab_count) += 1;
1743                 mem = head;
1744                 mem->pageq.next = NULL;
1745
1746                 mutex_unlock(&vm_page_queue_free_lock);
1747
1748                 enable_preemption();
1749         }
1750         /*
1751          *      Decide if we should poke the pageout daemon.
1752          *      We do this if the free count is less than the low
1753          *      water mark, or if the free count is less than the high
1754          *      water mark (but above the low water mark) and the inactive
1755          *      count is less than its target.
1756          *
1757          *      We don't have the counts locked ... if they change a little,
1758          *      it doesn't really matter.
1759          */
1760         if ((vm_page_free_count < vm_page_free_min) ||
1761             ((vm_page_free_count < vm_page_free_target) &&
1762              ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min)))
1763                 thread_wakeup((event_t) &vm_page_free_wanted);
1764
1765 #if CONFIG_EMBEDDED
1766         {
1767         int     percent_avail;
1768
1769         /*
1770          * Decide if we need to poke the memorystatus notification thread.
1771          */
1772         percent_avail =
1773                 (vm_page_active_count + vm_page_inactive_count +
1774                  vm_page_speculative_count + vm_page_free_count +
1775                  (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
1776                 atop_64(max_mem);
1777         if (percent_avail <= (kern_memorystatus_level - 5)) {
1778                 kern_memorystatus_level = percent_avail;
1779                 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1780         }
1781         }
1782 #endif
1783
1784 //      dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 4);      /* (TEST/DEBUG) */
1785
1786         return mem;
1787 }
1788
1789 /*
1790  *      vm_page_release:
1791  *
1792  *      Return a page to the free list.
1793  */
1794
1795 void
1796 vm_page_release(
1797         register vm_page_t      mem)
1798 {
1799         unsigned int    color;
1800 #if 0
1801         unsigned int pindex;
1802         phys_entry *physent;
1803
1804         physent = mapping_phys_lookup(mem->phys_page, &pindex);         /* (BRINGUP) */
1805         if(physent->ppLink & ppN) {                                                                                     /* (BRINGUP) */
1806                 panic("vm_page_release: already released - %08X %08X\n", mem, mem->phys_page);
1807         }
1808         physent->ppLink = physent->ppLink | ppN;                                                        /* (BRINGUP) */
1809 #endif
1810         assert(!mem->private && !mem->fictitious);
1811
1812 //      dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 5);      /* (TEST/DEBUG) */
1813
1814         mutex_lock(&vm_page_queue_free_lock);
1815 #if DEBUG
1816         if (mem->free)
1817                 panic("vm_page_release");
1818 #endif
1819         mem->free = TRUE;
1820
1821         assert(mem->busy);
1822         assert(!mem->laundry);
1823         assert(mem->object == VM_OBJECT_NULL);
1824         assert(mem->pageq.next == NULL &&
1825                mem->pageq.prev == NULL);
1826         assert(mem->listq.next == NULL &&
1827                mem->listq.prev == NULL);
1828
1829         if (mem->phys_page <= vm_lopage_poolend && mem->phys_page >= vm_lopage_poolstart) {
1830                 /*
1831                  * this exists to support hardware controllers
1832                  * incapable of generating DMAs with more than 32 bits
1833                  * of address on platforms with physical memory > 4G...
1834                  */
1835                 queue_enter_first(&vm_lopage_queue_free,
1836                                   mem,
1837                                   vm_page_t,
1838                                   pageq);
1839                 vm_lopage_free_count++;
1840         } else {
1841                 color = mem->phys_page & vm_color_mask;
1842                 queue_enter_first(&vm_page_queue_free[color],
1843                                   mem,
1844                                   vm_page_t,
1845                                   pageq);
1846                 vm_page_free_count++;
1847                 /*
1848                  *      Check if we should wake up someone waiting for page.
1849                  *      But don't bother waking them unless they can allocate.
1850                  *
1851                  *      We wakeup only one thread, to prevent starvation.
1852                  *      Because the scheduling system handles wait queues FIFO,
1853                  *      if we wakeup all waiting threads, one greedy thread
1854                  *      can starve multiple niceguy threads.  When the threads
1855                  *      all wakeup, the greedy threads runs first, grabs the page,
1856                  *      and waits for another page.  It will be the first to run
1857                  *      when the next page is freed.
1858                  *
1859                  *      However, there is a slight danger here.
1860                  *      The thread we wake might not use the free page.
1861                  *      Then the other threads could wait indefinitely
1862                  *      while the page goes unused.  To forestall this,
1863                  *      the pageout daemon will keep making free pages
1864                  *      as long as vm_page_free_wanted is non-zero.
1865                  */
1866
1867                 if ((vm_page_free_wanted_privileged > 0) && vm_page_free_count) {
1868                         vm_page_free_wanted_privileged--;
1869                         thread_wakeup_one((event_t) &vm_page_free_wanted_privileged);
1870                 } else if ((vm_page_free_wanted > 0) &&
1871                            (vm_page_free_count >= vm_page_free_reserved)) {
1872                         vm_page_free_wanted--;
1873                         thread_wakeup_one((event_t) &vm_page_free_count);
1874                 }
1875         }
1876         mutex_unlock(&vm_page_queue_free_lock);
1877
1878 #if CONFIG_EMBEDDED
1879         {
1880         int     percent_avail;
1881
1882         /*
1883          * Decide if we need to poke the memorystatus notification thread.
1884          * Locking is not a big issue, as only a single thread delivers these.
1885          */
1886         percent_avail =
1887                 (vm_page_active_count + vm_page_inactive_count +
1888                  vm_page_speculative_count + vm_page_free_count +
1889                  (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count)  ) * 100 /
1890                 atop_64(max_mem);
1891         if (percent_avail >= (kern_memorystatus_level + 5)) {
1892                 kern_memorystatus_level = percent_avail;
1893                 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1894         }
1895         }
1896 #endif
1897 }
1898
1899 /*
1900  *      vm_page_wait:
1901  *
1902  *      Wait for a page to become available.
1903  *      If there are plenty of free pages, then we don't sleep.
1904  *
1905  *      Returns:
1906  *              TRUE:  There may be another page, try again
1907  *              FALSE: We were interrupted out of our wait, don't try again
1908  */
1909
1910 boolean_t
1911 vm_page_wait(
1912         int     interruptible )
1913 {
1914         /*
1915          *      We can't use vm_page_free_reserved to make this
1916          *      determination.  Consider: some thread might
1917          *      need to allocate two pages.  The first allocation
1918          *      succeeds, the second fails.  After the first page is freed,
1919          *      a call to vm_page_wait must really block.
1920          */
1921         kern_return_t   wait_result;
1922         int             need_wakeup = 0;
1923         int             is_privileged = current_thread()->options & TH_OPT_VMPRIV;
1924
1925         mutex_lock(&vm_page_queue_free_lock);
1926
1927         if (is_privileged && vm_page_free_count) {
1928                 mutex_unlock(&vm_page_queue_free_lock);
1929                 return TRUE;
1930         }
1931         if (vm_page_free_count < vm_page_free_target) {
1932
1933                 if (is_privileged) {
1934                         if (vm_page_free_wanted_privileged++ == 0)
1935                                 need_wakeup = 1;
1936                         wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible);
1937                 } else {
1938                         if (vm_page_free_wanted++ == 0)
1939                                 need_wakeup = 1;
1940                         wait_result = assert_wait((event_t)&vm_page_free_count, interruptible);
1941                 }
1942                 mutex_unlock(&vm_page_queue_free_lock);
1943                 counter(c_vm_page_wait_block++);
1944
1945                 if (need_wakeup)
1946                         thread_wakeup((event_t)&vm_page_free_wanted);
1947
1948                 if (wait_result == THREAD_WAITING)
1949                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1950
1951                 return(wait_result == THREAD_AWAKENED);
1952         } else {
1953                 mutex_unlock(&vm_page_queue_free_lock);
1954                 return TRUE;
1955         }
1956 }
1957
1958 /*
1959  *      vm_page_alloc:
1960  *
1961  *      Allocate and return a memory cell associated
1962  *      with this VM object/offset pair.
1963  *
1964  *      Object must be locked.
1965  */
1966
1967 vm_page_t
1968 vm_page_alloc(
1969         vm_object_t             object,
1970         vm_object_offset_t      offset)
1971 {
1972         register vm_page_t      mem;
1973
1974         vm_object_lock_assert_exclusive(object);
1975         mem = vm_page_grab();
1976         if (mem == VM_PAGE_NULL)
1977                 return VM_PAGE_NULL;
1978
1979         vm_page_insert(mem, object, offset);
1980
1981         return(mem);
1982 }
1983
1984 vm_page_t
1985 vm_page_alloclo(
1986         vm_object_t             object,
1987         vm_object_offset_t      offset)
1988 {
1989         register vm_page_t      mem;
1990
1991         vm_object_lock_assert_exclusive(object);
1992         mem = vm_page_grablo();
1993         if (mem == VM_PAGE_NULL)
1994                 return VM_PAGE_NULL;
1995
1996         vm_page_insert(mem, object, offset);
1997
1998         return(mem);
1999 }
2000
2001
2002 /*
2003  *      vm_page_alloc_guard:
2004  *
2005  *      Allocate a ficticious page which will be used
2006  *      as a guard page.  The page will be inserted into
2007  *      the object and returned to the caller.
2008  */
2009
2010 vm_page_t
2011 vm_page_alloc_guard(
2012         vm_object_t             object,
2013         vm_object_offset_t      offset)
2014 {
2015         register vm_page_t      mem;
2016
2017         vm_object_lock_assert_exclusive(object);
2018         mem = vm_page_grab_guard();
2019         if (mem == VM_PAGE_NULL)
2020                 return VM_PAGE_NULL;
2021
2022         vm_page_insert(mem, object, offset);
2023
2024         return(mem);
2025 }
2026
2027
2028 counter(unsigned int c_laundry_pages_freed = 0;)
2029
2030 boolean_t       vm_page_free_verify = TRUE;
2031 /*
2032  *      vm_page_free:
2033  *
2034  *      Returns the given page to the free list,
2035  *      disassociating it with any VM object.
2036  *
2037  *      Object and page queues must be locked prior to entry.
2038  */
2039 void
2040 vm_page_free_prepare(
2041         register vm_page_t      mem)
2042 {
2043         VM_PAGE_CHECK(mem);
2044         assert(!mem->free);
2045         assert(!mem->cleaning);
2046         assert(!mem->pageout);
2047
2048 #if DEBUG
2049         if (vm_page_free_verify && !mem->fictitious && !mem->private) {
2050                 assert(pmap_verify_free(mem->phys_page));
2051         }
2052         if (mem->object)
2053                 vm_object_lock_assert_exclusive(mem->object);
2054         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
2055
2056         if (mem->free)
2057                panic("vm_page_free: freeing page on free list\n");
2058 #endif
2059
2060         if (mem->laundry) {
2061                 /*
2062                  * We may have to free a page while it's being laundered
2063                  * if we lost its pager (due to a forced unmount, for example).
2064                  * We need to call vm_pageout_throttle_up() before removing
2065                  * the page from its VM object, so that we can find out on
2066                  * which pageout queue the page is.
2067                  */
2068                 vm_pageout_throttle_up(mem);
2069                 counter(++c_laundry_pages_freed);
2070         }
2071
2072         if (mem->tabled)
2073                 vm_page_remove(mem);    /* clears tabled, object, offset */
2074
2075         VM_PAGE_QUEUES_REMOVE(mem);     /* clears active/inactive/throttled/speculative */
2076
2077         if (mem->wire_count) {
2078                 if (!mem->private && !mem->fictitious)
2079                         vm_page_wire_count--;
2080                 mem->wire_count = 0;
2081                 assert(!mem->gobbled);
2082         } else if (mem->gobbled) {
2083                 if (!mem->private && !mem->fictitious)
2084                         vm_page_wire_count--;
2085                 vm_page_gobble_count--;
2086         }
2087         mem->gobbled = FALSE;
2088
2089         PAGE_WAKEUP(mem);       /* clears wanted */
2090
2091         /* Some of these may be unnecessary */
2092         mem->busy = TRUE;
2093         mem->absent = FALSE;
2094         mem->error = FALSE;
2095         mem->dirty = FALSE;
2096         mem->precious = FALSE;
2097         mem->reference = FALSE;
2098         mem->encrypted = FALSE;
2099         mem->encrypted_cleaning = FALSE;
2100         mem->deactivated = FALSE;
2101         mem->pmapped = FALSE;
2102         mem->wpmapped = FALSE;
2103
2104         if (mem->private) {
2105                 mem->private = FALSE;
2106                 mem->fictitious = TRUE;
2107                 mem->phys_page = vm_page_fictitious_addr;
2108         }
2109         if (!mem->fictitious) {
2110                 if (mem->zero_fill == TRUE) {
2111                         mem->zero_fill = FALSE;
2112                         OSAddAtomic(-1, (SInt32 *)&vm_zf_count);
2113                 }
2114                 vm_page_init(mem, mem->phys_page);
2115         }
2116 }
2117
2118 void
2119 vm_page_free(
2120         vm_page_t       mem)
2121 {
2122         vm_page_free_prepare(mem);
2123         if (mem->fictitious) {
2124                 vm_page_release_fictitious(mem);
2125         } else {
2126                 vm_page_release(mem);
2127         }
2128 }
2129
2130 /*
2131  * Free a list of pages.  The list can be up to several hundred pages,
2132  * as blocked up by vm_pageout_scan().
2133  * The big win is not having to take the page q and free list locks once
2134  * per page.  We sort the incoming pages into n lists, one for
2135  * each color.
2136  *
2137  * The page queues must be locked, and are kept locked.
2138  */
2139 void
2140 vm_page_free_list(
2141         vm_page_t       mem)
2142 {
2143         vm_page_t       nxt;
2144         int             pg_count = 0;
2145         int             color;
2146         int             inuse_list_head = -1;
2147
2148         queue_head_t    free_list[MAX_COLORS];
2149         int             inuse[MAX_COLORS];
2150
2151         for (color = 0; color < (signed) vm_colors; color++) {
2152                 queue_init(&free_list[color]);
2153         }
2154
2155 #if DEBUG
2156         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
2157 #endif
2158         while (mem) {
2159 #if DEBUG
2160                 if (mem->tabled || mem->object)
2161                         panic("vm_page_free_list: freeing tabled page\n");
2162                 if (mem->inactive || mem->active || mem->throttled || mem->free)
2163                         panic("vm_page_free_list: freeing page on list\n");
2164                 if (vm_page_free_verify && !mem->fictitious && !mem->private) {
2165                         assert(pmap_verify_free(mem->phys_page));
2166                 }
2167 #endif
2168                 assert(mem->pageq.prev == NULL);
2169                 assert(mem->busy);
2170                 assert(!mem->free);
2171                 nxt = (vm_page_t)(mem->pageq.next);
2172
2173                 if (!mem->fictitious) {
2174                         if (mem->phys_page <= vm_lopage_poolend && mem->phys_page >= vm_lopage_poolstart) {
2175                                 mem->pageq.next = NULL;
2176                                 vm_page_release(mem);
2177                         } else {
2178                                 mem->free = TRUE;
2179
2180                                 color = mem->phys_page & vm_color_mask;
2181                                 if (queue_empty(&free_list[color])) {
2182                                         inuse[color] = inuse_list_head;
2183                                         inuse_list_head = color;
2184                                 }
2185                                 queue_enter_first(&free_list[color],
2186                                                   mem,
2187                                                   vm_page_t,
2188                                                   pageq);
2189                                 pg_count++;
2190                         }
2191                 } else {
2192                         assert(mem->phys_page == vm_page_fictitious_addr ||
2193                                mem->phys_page == vm_page_guard_addr);
2194                         vm_page_release_fictitious(mem);
2195                 }
2196                 mem = nxt;
2197         }
2198         if (pg_count) {
2199                 unsigned int    avail_free_count;
2200
2201                 mutex_lock(&vm_page_queue_free_lock);
2202
2203                 color = inuse_list_head;
2204
2205                 while( color != -1 ) {
2206                         vm_page_t first, last;
2207                         vm_page_t first_free;
2208
2209                         first = (vm_page_t) queue_first(&free_list[color]);
2210                         last = (vm_page_t) queue_last(&free_list[color]);
2211                         first_free = (vm_page_t) queue_first(&vm_page_queue_free[color]);
2212
2213                         if (queue_empty(&vm_page_queue_free[color])) {
2214                                 queue_last(&vm_page_queue_free[color]) =
2215                                         (queue_entry_t) last;
2216                         } else {
2217                                 queue_prev(&first_free->pageq) =
2218                                         (queue_entry_t) last;
2219                         }
2220                         queue_first(&vm_page_queue_free[color]) =
2221                                 (queue_entry_t) first;
2222                         queue_prev(&first->pageq) =
2223                                 (queue_entry_t) &vm_page_queue_free[color];
2224                         queue_next(&last->pageq) =
2225                                 (queue_entry_t) first_free;
2226                         color = inuse[color];
2227                 }
2228
2229                 vm_page_free_count += pg_count;
2230                 avail_free_count = vm_page_free_count;
2231
2232                 while ((vm_page_free_wanted_privileged > 0) && avail_free_count) {
2233                         vm_page_free_wanted_privileged--;
2234                         avail_free_count--;
2235
2236                         thread_wakeup_one((event_t) &vm_page_free_wanted_privileged);
2237                 }
2238
2239                 if ((vm_page_free_wanted > 0) &&
2240                     (avail_free_count >= vm_page_free_reserved)) {
2241                         unsigned int  available_pages;
2242
2243                         if (avail_free_count >= vm_page_free_reserved) {
2244                                 available_pages = (avail_free_count - vm_page_free_reserved);
2245                         } else {
2246                                 available_pages = 0;
2247                         }
2248
2249                         if (available_pages >= vm_page_free_wanted) {
2250                                 vm_page_free_wanted = 0;
2251                                 thread_wakeup((event_t) &vm_page_free_count);
2252                         } else {
2253                                 while (available_pages--) {
2254                                         vm_page_free_wanted--;
2255                                         thread_wakeup_one((event_t) &vm_page_free_count);
2256                                 }
2257                         }
2258                 }
2259                 mutex_unlock(&vm_page_queue_free_lock);
2260
2261 #if CONFIG_EMBEDDED
2262                 {
2263                 int percent_avail;
2264
2265                 /*
2266                  * Decide if we need to poke the memorystatus notification thread.
2267                  */
2268                 percent_avail =
2269                         (vm_page_active_count + vm_page_inactive_count +
2270                          vm_page_speculative_count + vm_page_free_count +
2271                          (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count)  ) * 100 /
2272                         atop_64(max_mem);
2273                 if (percent_avail >= (kern_memorystatus_level + 5)) {
2274                         kern_memorystatus_level = percent_avail;
2275                         thread_wakeup((event_t)&kern_memorystatus_wakeup);
2276                 }
2277                 }
2278 #endif
2279         }
2280 }
2281
2282
2283 /*
2284  *      vm_page_wire:
2285  *
2286  *      Mark this page as wired down by yet
2287  *      another map, removing it from paging queues
2288  *      as necessary.
2289  *
2290  *      The page's object and the page queues must be locked.
2291  */
2292 void
2293 vm_page_wire(
2294         register vm_page_t      mem)
2295 {
2296
2297 //      dbgLog(current_thread(), mem->offset, mem->object, 1);  /* (TEST/DEBUG) */
2298
2299         VM_PAGE_CHECK(mem);
2300 #if DEBUG
2301         if (mem->object)
2302                 vm_object_lock_assert_exclusive(mem->object);
2303         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
2304 #endif
2305         if (mem->wire_count == 0) {
2306                 VM_PAGE_QUEUES_REMOVE(mem);
2307                 if (!mem->private && !mem->fictitious && !mem->gobbled)
2308                         vm_page_wire_count++;
2309                 if (mem->gobbled)
2310                         vm_page_gobble_count--;
2311                 mem->gobbled = FALSE;
2312                 if (mem->zero_fill == TRUE) {
2313                         mem->zero_fill = FALSE;
2314                         OSAddAtomic(-1, (SInt32 *)&vm_zf_count);
2315                 }
2316 #if CONFIG_EMBEDDED
2317                 {
2318                 int     percent_avail;
2319
2320                 /*
2321                  * Decide if we need to poke the memorystatus notification thread.
2322                  */
2323                 percent_avail =
2324                         (vm_page_active_count + vm_page_inactive_count +
2325                          vm_page_speculative_count + vm_page_free_count +
2326                          (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
2327                         atop_64(max_mem);
2328                 if (percent_avail <= (kern_memorystatus_level - 5)) {
2329                         kern_memorystatus_level = percent_avail;
2330                         thread_wakeup((event_t)&kern_memorystatus_wakeup);
2331                 }
2332                 }
2333 #endif
2334                 /*
2335                  * ENCRYPTED SWAP:
2336                  * The page could be encrypted, but
2337                  * We don't have to decrypt it here
2338                  * because we don't guarantee that the
2339                  * data is actually valid at this point.
2340                  * The page will get decrypted in
2341                  * vm_fault_wire() if needed.
2342                  */
2343         }
2344         assert(!mem->gobbled);
2345         mem->wire_count++;
2346 }
2347
2348 /*
2349  *      vm_page_gobble:
2350  *
2351  *      Mark this page as consumed by the vm/ipc/xmm subsystems.
2352  *
2353  *      Called only for freshly vm_page_grab()ed pages - w/ nothing locked.
2354  */
2355 void
2356 vm_page_gobble(
2357         register vm_page_t      mem)
2358 {
2359         vm_page_lockspin_queues();
2360         VM_PAGE_CHECK(mem);
2361
2362         assert(!mem->gobbled);
2363         assert(mem->wire_count == 0);
2364
2365         if (!mem->gobbled && mem->wire_count == 0) {
2366                 if (!mem->private && !mem->fictitious)
2367                         vm_page_wire_count++;
2368         }
2369         vm_page_gobble_count++;
2370         mem->gobbled = TRUE;
2371         vm_page_unlock_queues();
2372 }
2373
2374 /*
2375  *      vm_page_unwire:
2376  *
2377  *      Release one wiring of this page, potentially
2378  *      enabling it to be paged again.
2379  *
2380  *      The page's object and the page queues must be locked.
2381  */
2382 void
2383 vm_page_unwire(
2384         register vm_page_t      mem)
2385 {
2386
2387 //      dbgLog(current_thread(), mem->offset, mem->object, 0);  /* (TEST/DEBUG) */
2388
2389         VM_PAGE_CHECK(mem);
2390         assert(mem->wire_count > 0);
2391 #if DEBUG
2392         if (mem->object)
2393                 vm_object_lock_assert_exclusive(mem->object);
2394         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
2395 #endif
2396         if (--mem->wire_count == 0) {
2397                 assert(!mem->private && !mem->fictitious);
2398                 vm_page_wire_count--;
2399                 assert(!mem->laundry);
2400                 assert(mem->object != kernel_object);
2401                 assert(mem->pageq.next == NULL && mem->pageq.prev == NULL);
2402                 if (mem->object->purgable == VM_PURGABLE_EMPTY) {
2403                         vm_page_deactivate(mem);
2404                 } else {
2405                         vm_page_activate(mem);
2406                 }
2407 #if CONFIG_EMBEDDED
2408                 {
2409                 int     percent_avail;
2410
2411                 /*
2412                  * Decide if we need to poke the memorystatus notification thread.
2413                  */
2414                 percent_avail =
2415                         (vm_page_active_count + vm_page_inactive_count +
2416                          vm_page_speculative_count + vm_page_free_count +
2417                          (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
2418                         atop_64(max_mem);
2419                 if (percent_avail >= (kern_memorystatus_level + 5)) {
2420                         kern_memorystatus_level = percent_avail;
2421                         thread_wakeup((event_t)&kern_memorystatus_wakeup);
2422                 }
2423                 }
2424 #endif
2425         }
2426 }
2427
2428
2429 /*
2430  *      vm_page_deactivate:
2431  *
2432  *      Returns the given page to the inactive list,
2433  *      indicating that no physical maps have access
2434  *      to this page.  [Used by the physical mapping system.]
2435  *
2436  *      The page queues must be locked.
2437  */
2438 void
2439 vm_page_deactivate(
2440         register vm_page_t      m)
2441 {
2442         boolean_t rapid_age = FALSE;
2443
2444         VM_PAGE_CHECK(m);
2445         assert(m->object != kernel_object);
2446         assert(m->phys_page != vm_page_guard_addr);
2447
2448 //      dbgLog(m->phys_page, vm_page_free_count, vm_page_wire_count, 6);        /* (TEST/DEBUG) */
2449 #if DEBUG
2450         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
2451 #endif
2452         /*
2453          *      This page is no longer very interesting.  If it was
2454          *      interesting (active or inactive/referenced), then we
2455          *      clear the reference bit and (re)enter it in the
2456          *      inactive queue.  Note wired pages should not have
2457          *      their reference bit cleared.
2458          */
2459         if (m->gobbled) {               /* can this happen? */
2460                 assert(m->wire_count == 0);
2461
2462                 if (!m->private && !m->fictitious)
2463                         vm_page_wire_count--;
2464                 vm_page_gobble_count--;
2465                 m->gobbled = FALSE;
2466         }
2467         if (m->private || (m->wire_count != 0))
2468                 return;
2469
2470         if (m->active && m->deactivated == TRUE) {
2471                 if (!pmap_is_referenced(m->phys_page))
2472                         rapid_age = TRUE;
2473         }
2474         if (rapid_age == FALSE && !m->fictitious && !m->absent)
2475                 pmap_clear_reference(m->phys_page);
2476
2477         m->reference = FALSE;
2478         m->deactivated = FALSE;
2479         m->no_cache = FALSE;
2480
2481         if (!m->inactive) {
2482                 VM_PAGE_QUEUES_REMOVE(m);
2483
2484                 assert(!m->laundry);
2485                 assert(m->pageq.next == NULL && m->pageq.prev == NULL);
2486
2487                 if (!IP_VALID(memory_manager_default) &&
2488                         m->dirty && m->object->internal &&
2489                         (m->object->purgable == VM_PURGABLE_DENY ||
2490                          m->object->purgable == VM_PURGABLE_NONVOLATILE ||
2491                          m->object->purgable == VM_PURGABLE_VOLATILE )) {
2492                         queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
2493                         m->throttled = TRUE;
2494                         vm_page_throttled_count++;
2495                 } else {
2496                         if (rapid_age == TRUE ||
2497                             (!m->fictitious && m->object->named && m->object->ref_count == 1)) {
2498                                 vm_page_speculate(m, FALSE);
2499                                 vm_page_speculative_recreated++;
2500                                 return;
2501                         } else {
2502                                 if (m->zero_fill) {
2503                                         queue_enter(&vm_page_queue_zf, m, vm_page_t, pageq);
2504                                         vm_zf_queue_count++;
2505                                 } else {
2506                                         queue_enter(&vm_page_queue_inactive, m, vm_page_t, pageq);
2507                                 }
2508                         }
2509                         m->inactive = TRUE;
2510                         if (!m->fictitious) {
2511                                 vm_page_inactive_count++;
2512                                 token_new_pagecount++;
2513                         }
2514                 }
2515         }
2516 }
2517
2518 /*
2519  *      vm_page_activate:
2520  *
2521  *      Put the specified page on the active list (if appropriate).
2522  *
2523  *      The page queues must be locked.
2524  */
2525
2526 void
2527 vm_page_activate(
2528         register vm_page_t      m)
2529 {
2530         VM_PAGE_CHECK(m);
2531 #ifdef  FIXME_4778297
2532         assert(m->object != kernel_object);
2533 #endif
2534         assert(m->phys_page != vm_page_guard_addr);
2535 #if DEBUG
2536         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
2537 #endif
2538         if (m->gobbled) {
2539                 assert(m->wire_count == 0);
2540                 if (!m->private && !m->fictitious)
2541                         vm_page_wire_count--;
2542                 vm_page_gobble_count--;
2543                 m->gobbled = FALSE;
2544         }
2545         if (m->private)
2546                 return;
2547
2548 #if DEBUG
2549         if (m->active)
2550                 panic("vm_page_activate: already active");
2551 #endif
2552
2553         if (m->speculative) {
2554                 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
2555                 DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL);
2556         }
2557
2558         VM_PAGE_QUEUES_REMOVE(m);
2559
2560         if (m->wire_count == 0) {
2561                 assert(!m->laundry);
2562                 assert(m->pageq.next == NULL && m->pageq.prev == NULL);
2563                 if (!IP_VALID(memory_manager_default) &&
2564                         !m->fictitious && m->dirty && m->object->internal &&
2565                         (m->object->purgable == VM_PURGABLE_DENY ||
2566                          m->object->purgable == VM_PURGABLE_NONVOLATILE ||
2567                          m->object->purgable == VM_PURGABLE_VOLATILE )) {
2568                         queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
2569                         m->throttled = TRUE;
2570                         vm_page_throttled_count++;
2571                 } else {
2572                         queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
2573                         m->active = TRUE;
2574                         if (!m->fictitious)
2575                                 vm_page_active_count++;
2576                 }
2577                 m->reference = TRUE;
2578                 m->no_cache = FALSE;
2579         }
2580 }
2581
2582
2583 /*
2584  *      vm_page_speculate:
2585  *
2586  *      Put the specified page on the speculative list (if appropriate).
2587  *
2588  *      The page queues must be locked.
2589  */
2590 void
2591 vm_page_speculate(
2592         vm_page_t       m,
2593         boolean_t       new)
2594 {
2595         struct vm_speculative_age_q     *aq;
2596
2597         VM_PAGE_CHECK(m);
2598         assert(m->object != kernel_object);
2599         assert(!m->speculative && !m->active && !m->inactive && !m->throttled);
2600         assert(m->phys_page != vm_page_guard_addr);
2601         assert(m->pageq.next == NULL && m->pageq.prev == NULL);
2602 #if DEBUG
2603         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
2604 #endif
2605         if (m->wire_count == 0) {
2606                 mach_timespec_t         ts;
2607
2608                 clock_get_system_nanotime(&ts.tv_sec, (unsigned *)&ts.tv_nsec);
2609
2610                 if (vm_page_speculative_count == 0) {
2611
2612                         speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2613                         speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2614
2615                         aq = &vm_page_queue_speculative[speculative_age_index];
2616
2617                         /*
2618                          * set the timer to begin a new group
2619                          */
2620                         aq->age_ts.tv_sec = VM_PAGE_SPECULATIVE_Q_AGE_MS / 1000;
2621                         aq->age_ts.tv_nsec = (VM_PAGE_SPECULATIVE_Q_AGE_MS % 1000) * 1000 * NSEC_PER_USEC;
2622
2623                         ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
2624                 } else {
2625                         aq = &vm_page_queue_speculative[speculative_age_index];
2626
2627                         if (CMP_MACH_TIMESPEC(&ts, &aq->age_ts) >= 0) {
2628
2629                                 speculative_age_index++;
2630
2631                                 if (speculative_age_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
2632                                         speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2633                                 if (speculative_age_index == speculative_steal_index) {
2634                                         speculative_steal_index = speculative_age_index + 1;
2635
2636                                         if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
2637                                                 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2638                                 }
2639                                 aq = &vm_page_queue_speculative[speculative_age_index];
2640
2641                                 if (!queue_empty(&aq->age_q))
2642                                         vm_page_speculate_ageit(aq);
2643
2644                                 aq->age_ts.tv_sec = VM_PAGE_SPECULATIVE_Q_AGE_MS / 1000;
2645                                 aq->age_ts.tv_nsec = (VM_PAGE_SPECULATIVE_Q_AGE_MS % 1000) * 1000 * NSEC_PER_USEC;
2646
2647                                 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
2648                         }
2649                 }
2650                 enqueue_tail(&aq->age_q, &m->pageq);
2651                 m->speculative = TRUE;
2652                 vm_page_speculative_count++;
2653
2654                 if (new == TRUE) {
2655                         m->object->pages_created++;
2656                         vm_page_speculative_created++;
2657                 }
2658         }
2659 }
2660
2661
2662 /*
2663  * move pages from the specified aging bin to
2664  * the speculative bin that pageout_scan claims from
2665  *
2666  *      The page queues must be locked.
2667  */
2668 void
2669 vm_page_speculate_ageit(struct vm_speculative_age_q *aq)
2670 {
2671         struct vm_speculative_age_q     *sq;
2672         vm_page_t       t;
2673
2674         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2675
2676         if (queue_empty(&sq->age_q)) {
2677                 sq->age_q.next = aq->age_q.next;
2678                 sq->age_q.prev = aq->age_q.prev;
2679
2680                 t = (vm_page_t)sq->age_q.next;
2681                 t->pageq.prev = &sq->age_q;
2682
2683                 t = (vm_page_t)sq->age_q.prev;
2684                 t->pageq.next = &sq->age_q;
2685         } else {
2686                 t = (vm_page_t)sq->age_q.prev;
2687                 t->pageq.next = aq->age_q.next;
2688
2689                 t = (vm_page_t)aq->age_q.next;
2690                 t->pageq.prev = sq->age_q.prev;
2691
2692                 t = (vm_page_t)aq->age_q.prev;
2693                 t->pageq.next = &sq->age_q;
2694
2695                 sq->age_q.prev = aq->age_q.prev;
2696         }
2697         queue_init(&aq->age_q);
2698 }
2699
2700
2701 void
2702 vm_page_lru(
2703         vm_page_t       m)
2704 {
2705         VM_PAGE_CHECK(m);
2706         assert(m->object != kernel_object);
2707         assert(m->phys_page != vm_page_guard_addr);
2708
2709 #if DEBUG
2710         _mutex_assert(&vm_page_queue_lock, MA_OWNED);
2711 #endif
2712         if (m->active || m->reference)
2713                 return;
2714
2715         if (m->private || (m->wire_count != 0))
2716                 return;
2717
2718         m->no_cache = FALSE;
2719
2720         VM_PAGE_QUEUES_REMOVE(m);
2721
2722         assert(!m->laundry);
2723         assert(m->pageq.next == NULL && m->pageq.prev == NULL);
2724
2725         queue_enter(&vm_page_queue_inactive, m, vm_page_t, pageq);
2726         m->inactive = TRUE;
2727
2728         vm_page_inactive_count++;
2729         token_new_pagecount++;
2730 }
2731
2732
2733 /*
2734  *      vm_page_part_zero_fill:
2735  *
2736  *      Zero-fill a part of the page.
2737  */
2738 void
2739 vm_page_part_zero_fill(
2740         vm_page_t       m,
2741         vm_offset_t     m_pa,
2742         vm_size_t       len)
2743 {
2744         vm_page_t       tmp;
2745
2746         VM_PAGE_CHECK(m);
2747 #ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED
2748         pmap_zero_part_page(m->phys_page, m_pa, len);
2749 #else
2750         while (1) {
2751                 tmp = vm_page_grab();
2752                 if (tmp == VM_PAGE_NULL) {
2753                         vm_page_wait(THREAD_UNINT);
2754                         continue;
2755                 }
2756                 break;
2757         }
2758         vm_page_zero_fill(tmp);
2759         if(m_pa != 0) {
2760                 vm_page_part_copy(m, 0, tmp, 0, m_pa);
2761         }
2762         if((m_pa + len) <  PAGE_SIZE) {
2763                 vm_page_part_copy(m, m_pa + len, tmp,
2764                                 m_pa + len, PAGE_SIZE - (m_pa + len));
2765         }
2766         vm_page_copy(tmp,m);
2767         vm_page_lock_queues();
2768         vm_page_free(tmp);
2769         vm_page_unlock_queues();
2770 #endif
2771
2772 }
2773
2774 /*
2775  *      vm_page_zero_fill:
2776  *
2777  *      Zero-fill the specified page.
2778  */
2779 void
2780 vm_page_zero_fill(
2781         vm_page_t       m)
2782 {
2783         XPR(XPR_VM_PAGE,
2784                 "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n",
2785                 (integer_t)m->object, (integer_t)m->offset, (integer_t)m, 0,0);
2786
2787         VM_PAGE_CHECK(m);
2788
2789 //      dbgTrace(0xAEAEAEAE, m->phys_page, 0);          /* (BRINGUP) */
2790         pmap_zero_page(m->phys_page);
2791 }
2792
2793 /*
2794  *      vm_page_part_copy:
2795  *
2796  *      copy part of one page to another
2797  */
2798
2799 void
2800 vm_page_part_copy(
2801         vm_page_t       src_m,
2802         vm_offset_t     src_pa,
2803         vm_page_t       dst_m,
2804         vm_offset_t     dst_pa,
2805         vm_size_t       len)
2806 {
2807         VM_PAGE_CHECK(src_m);
2808         VM_PAGE_CHECK(dst_m);
2809
2810         pmap_copy_part_page(src_m->phys_page, src_pa,
2811                         dst_m->phys_page, dst_pa, len);
2812 }
2813
2814 /*
2815  *      vm_page_copy:
2816  *
2817  *      Copy one page to another
2818  *
2819  * ENCRYPTED SWAP:
2820  * The source page should not be encrypted.  The caller should
2821  * make sure the page is decrypted first, if necessary.
2822  */
2823
2824 int vm_page_copy_cs_validations = 0;
2825 int vm_page_copy_cs_tainted = 0;
2826
2827 void
2828 vm_page_copy(
2829         vm_page_t       src_m,
2830         vm_page_t       dest_m)
2831 {
2832         XPR(XPR_VM_PAGE,
2833         "vm_page_copy, object 0x%X offset 0x%X to object 0x%X offset 0x%X\n",
2834         (integer_t)src_m->object, src_m->offset,
2835         (integer_t)dest_m->object, dest_m->offset,
2836         0);
2837
2838         VM_PAGE_CHECK(src_m);
2839         VM_PAGE_CHECK(dest_m);
2840
2841         /*
2842          * ENCRYPTED SWAP:
2843          * The source page should not be encrypted at this point.
2844          * The destination page will therefore not contain encrypted
2845          * data after the copy.
2846          */
2847         if (src_m->encrypted) {
2848                 panic("vm_page_copy: source page %p is encrypted\n", src_m);
2849         }
2850         dest_m->encrypted = FALSE;
2851
2852         if (src_m->object != VM_OBJECT_NULL &&
2853             src_m->object->code_signed) {
2854                 /*
2855                  * We're copying a page from a code-signed object.
2856                  * Whoever ends up mapping the copy page might care about
2857                  * the original page's integrity, so let's validate the
2858                  * source page now.
2859                  */
2860                 vm_page_copy_cs_validations++;
2861                 vm_page_validate_cs(src_m);
2862         }
2863         /*
2864          * Propagate the code-signing bits to the copy page.
2865          */
2866         dest_m->cs_validated = src_m->cs_validated;
2867         dest_m->cs_tainted = src_m->cs_tainted;
2868         if (dest_m->cs_tainted) {
2869                 assert(dest_m->cs_validated);
2870                 vm_page_copy_cs_tainted++;
2871         }
2872
2873         pmap_copy_page(src_m->phys_page, dest_m->phys_page);
2874 }
2875
2876 #if MACH_ASSERT
2877 /*
2878  *      Check that the list of pages is ordered by
2879  *      ascending physical address and has no holes.
2880  */
2881 static int
2882 vm_page_verify_contiguous(
2883         vm_page_t       pages,
2884         unsigned int    npages)
2885 {
2886         register vm_page_t      m;
2887         unsigned int            page_count;
2888         vm_offset_t             prev_addr;
2889
2890         prev_addr = pages->phys_page;
2891         page_count = 1;
2892         for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) {
2893                 if (m->phys_page != prev_addr + 1) {
2894                         printf("m %p prev_addr 0x%x, current addr 0x%x\n",
2895                                m, prev_addr, m->phys_page);
2896                         printf("pages %p page_count %d\n", pages, page_count);
2897                         panic("vm_page_verify_contiguous:  not contiguous!");
2898                 }
2899                 prev_addr = m->phys_page;
2900                 ++page_count;
2901         }
2902         if (page_count != npages) {
2903                 printf("pages %p actual count 0x%x but requested 0x%x\n",
2904                        pages, page_count, npages);
2905                 panic("vm_page_verify_contiguous:  count error");
2906         }
2907         return 1;
2908 }
2909 #endif  /* MACH_ASSERT */
2910
2911
2912 #if MACH_ASSERT
2913 /*
2914  *      Check the free lists for proper length etc.
2915  */
2916 static void
2917 vm_page_verify_free_lists( void )
2918 {
2919         unsigned int    color, npages;
2920         vm_page_t       m;
2921         vm_page_t       prev_m;
2922
2923         npages = 0;
2924
2925         mutex_lock(&vm_page_queue_free_lock);
2926
2927         for( color = 0; color < vm_colors; color++ ) {
2928                 prev_m = (vm_page_t) &vm_page_queue_free[color];
2929                 queue_iterate(&vm_page_queue_free[color],
2930                               m,
2931                               vm_page_t,
2932                               pageq) {
2933                         if ((vm_page_t) m->pageq.prev != prev_m)
2934                                 panic("vm_page_verify_free_lists: corrupted prev ptr");
2935                         if ( ! m->free )
2936                                 panic("vm_page_verify_free_lists: not free");
2937                         if ( ! m->busy )
2938                                 panic("vm_page_verify_free_lists: not busy");
2939                         if ( (m->phys_page & vm_color_mask) != color)
2940                                 panic("vm_page_verify_free_lists: wrong color");
2941                         ++npages;
2942                         prev_m = m;
2943                 }
2944         }
2945         if (npages != vm_page_free_count)
2946                 panic("vm_page_verify_free_lists:  npages %u free_count %d",
2947                       npages, vm_page_free_count);
2948
2949         mutex_unlock(&vm_page_queue_free_lock);
2950 }
2951 #endif  /* MACH_ASSERT */
2952
2953
2954
2955 /*
2956  *      CONTIGUOUS PAGE ALLOCATION
2957  *      Additional levels of effort:
2958  *              + consider pages that are currently 'pmapped'
2959  *                  this could be expensive since we'd have
2960  *                  to ask the pmap layer about there state
2961  *              + consider dirty pages
2962  *                  either clean them or
2963  *                  copy them to other locations...
2964  *
2965  *      Find a region large enough to contain at least n pages
2966  *      of contiguous physical memory.
2967  *
2968  *      This is done by traversing the vm_page_t array in a linear fashion
2969  *      we assume that the vm_page_t array has the avaiable physical pages in an
2970  *      ordered, ascending list... this is currently true of all our implementations
2971  *      and must remain so... there can be 'holes' in the array...  we also can
2972  *      no longer tolerate the vm_page_t's in the list being 'freed' and reclaimed
2973  *      which use to happen via 'vm_page_convert'... that function was no longer
2974  *      being called and was removed...
2975  *
2976  *      The basic flow consists of stabilizing some of the interesting state of
2977  *      a vm_page_t behind the vm_page_queue and vm_page_free locks... we start our
2978  *      sweep at the beginning of the array looking for pages that meet our criterea
2979  *      for a 'stealable' page... currently we are pretty conservative... if the page
2980  *      meets this criterea and is physically contiguous to the previous page in the 'run'
2981  *      we keep developing it.  If we hit a page that doesn't fit, we reset our state
2982  *      and start to develop a new run... if at this point we've already considered
2983  *      at least MAX_CONSIDERED_BEFORE_YIELD pages, we'll drop the 2 locks we hold,
2984  *      and mutex_pause (which will yield the processor), to keep the latency low w/r
2985  *      to other threads trying to acquire free pages (or move pages from q to q),
2986  *      and then continue from the spot we left off... we only make 1 pass through the
2987  *      array.  Once we have a 'run' that is long enough, we'll go into the loop which
2988  *      which steals the pages from the queues they're currently on... pages on the free
2989  *      queue can be stolen directly... pages that are on any of the other queues
2990  *      must be removed from the object they are tabled on... this requires taking the
2991  *      object lock... we do this as a 'try' to prevent deadlocks... if the 'try' fails
2992  *      or if the state of the page behind the vm_object lock is no longer viable, we'll
2993  *      dump the pages we've currently stolen back to the free list, and pick up our
2994  *      scan from the point where we aborted the 'current' run.
2995  *
2996  *
2997  *      Requirements:
2998  *              - neither vm_page_queue nor vm_free_list lock can be held on entry
2999  *
3000  *      Returns a pointer to a list of gobbled/wired pages or VM_PAGE_NULL.
3001  *
3002  * Algorithm:
3003  */
3004
3005 #define MAX_CONSIDERED_BEFORE_YIELD     1000
3006
3007
3008 #define RESET_STATE_OF_RUN()    \
3009         MACRO_BEGIN             \
3010         prevcontaddr = -2;      \
3011         free_considered = 0;    \
3012         substitute_needed = 0;  \
3013         npages = 0;             \
3014         MACRO_END
3015
3016
3017 static vm_page_t
3018 vm_page_find_contiguous(
3019         unsigned int    contig_pages,
3020         ppnum_t         max_pnum,
3021         boolean_t       wire)
3022 {
3023         vm_page_t       m = NULL;
3024         ppnum_t         prevcontaddr;
3025         unsigned int    npages, considered;
3026         unsigned int    page_idx, start_idx;
3027         int             free_considered, free_available;
3028         int             substitute_needed;
3029 #if DEBUG
3030         uint32_t        tv_start_sec, tv_start_usec, tv_end_sec, tv_end_usec;
3031 #endif
3032 #if MACH_ASSERT
3033         int             yielded = 0;
3034         int             dumped_run = 0;
3035         int             stolen_pages = 0;
3036 #endif
3037
3038         if (contig_pages == 0)
3039                 return VM_PAGE_NULL;
3040
3041 #if MACH_ASSERT
3042         vm_page_verify_free_lists();
3043 #endif
3044 #if DEBUG
3045         clock_get_system_microtime(&tv_start_sec, &tv_start_usec);
3046 #endif
3047         vm_page_lock_queues();
3048         mutex_lock(&vm_page_queue_free_lock);
3049
3050         RESET_STATE_OF_RUN();
3051
3052         considered = 0;
3053         free_available = vm_page_free_count - vm_page_free_reserved;
3054
3055         for (page_idx = 0, start_idx = 0;
3056              npages < contig_pages && page_idx < vm_pages_count;
3057              page_idx++) {
3058 retry:
3059                 m = &vm_pages[page_idx];
3060
3061                 if (max_pnum && m->phys_page > max_pnum) {
3062                         /* no more low pages... */
3063                         break;
3064                 }
3065                 if (m->phys_page <= vm_lopage_poolend &&
3066                     m->phys_page >= vm_lopage_poolstart) {
3067                         /*
3068                          * don't want to take pages from our
3069                          * reserved pool of low memory
3070                          * so don't consider it which
3071                          * means starting a new run
3072                          */
3073                         RESET_STATE_OF_RUN();
3074
3075                 } else if (m->wire_count || m->gobbled ||
3076                            m->encrypted || m->encrypted_cleaning || m->cs_validated || m->cs_tainted ||
3077                            m->error || m->absent || m->pageout_queue || m->laundry || m->wanted || m->precious ||
3078                            m->cleaning || m->overwriting || m->restart || m->unusual || m->list_req_pending) {
3079                         /*
3080                          * page is in a transient state
3081                          * or a state we don't want to deal
3082                          * with, so don't consider it which
3083                          * means starting a new run
3084                          */
3085                         RESET_STATE_OF_RUN();
3086
3087                 } else if (!m->free && !m->active && !m->inactive && !m->speculative && !m->throttled) {
3088                         /*
3089                          * page needs to be on one of our queues
3090                          * in order for it to be stable behind the
3091                          * locks we hold at this point...
3092                          * if not, don't consider it which
3093                          * means starting a new run
3094                          */
3095                         RESET_STATE_OF_RUN();
3096
3097                 } else if (!m->free && (!m->tabled || m->busy)) {
3098                         /*
3099                          * pages on the free list are always 'busy'
3100                          * so we couldn't test for 'busy' in the check
3101                          * for the transient states... pages that are
3102                          * 'free' are never 'tabled', so we also couldn't
3103                          * test for 'tabled'.  So we check here to make
3104                          * sure that a non-free page is not busy and is
3105                          * tabled on an object...
3106                          * if not, don't consider it which
3107                          * means starting a new run
3108                          */
3109                         RESET_STATE_OF_RUN();
3110
3111                 } else {
3112                         if (m->phys_page != prevcontaddr + 1) {
3113                                 npages = 1;
3114                                 start_idx = page_idx;
3115                         } else {
3116                                 npages++;
3117                         }
3118                         prevcontaddr = m->phys_page;
3119
3120                         if (m->pmapped || m->dirty)
3121                                 substitute_needed++;
3122
3123                         if (m->free) {
3124                                 free_considered++;
3125                         }
3126                         if ((free_considered + substitute_needed) > free_available) {
3127                                 /*
3128                                  * if we let this run continue
3129                                  * we will end up dropping the vm_page_free_count
3130                                  * below the reserve limit... we need to abort
3131                                  * this run, but we can at least re-consider this
3132                                  * page... thus the jump back to 'retry'
3133                                  */
3134                                 RESET_STATE_OF_RUN();
3135
3136                                 if (free_available && considered <= MAX_CONSIDERED_BEFORE_YIELD) {
3137                                         considered++;
3138                                         goto retry;
3139                                 }
3140                                 /*
3141                                  * free_available == 0
3142                                  * so can't consider any free pages... if
3143                                  * we went to retry in this case, we'd
3144                                  * get stuck looking at the same page
3145                                  * w/o making any forward progress
3146                                  * we also want to take this path if we've already
3147                                  * reached our limit that controls the lock latency
3148                                  */
3149                         }
3150                 }
3151                 if (considered > MAX_CONSIDERED_BEFORE_YIELD && npages <= 1) {
3152
3153                         mutex_unlock(&vm_page_queue_free_lock);
3154                         vm_page_unlock_queues();
3155
3156                         mutex_pause(0);
3157
3158                         vm_page_lock_queues();
3159                         mutex_lock(&vm_page_queue_free_lock);
3160
3161                         RESET_STATE_OF_RUN();
3162                         /*
3163                          * reset our free page limit since we
3164                          * dropped the lock protecting the vm_page_free_queue
3165                          */
3166                         free_available = vm_page_free_count - vm_page_free_reserved;
3167                         considered = 0;
3168 #if MACH_ASSERT
3169                         yielded++;
3170 #endif
3171                         goto retry;
3172                 }
3173                 considered++;
3174         }
3175         m = VM_PAGE_NULL;
3176
3177         if (npages != contig_pages)
3178                 mutex_unlock(&vm_page_queue_free_lock);
3179         else {
3180                 vm_page_t       m1;
3181                 vm_page_t       m2;
3182                 unsigned int    cur_idx;
3183                 unsigned int    tmp_start_idx;
3184                 vm_object_t     locked_object = VM_OBJECT_NULL;
3185                 boolean_t       abort_run = FALSE;
3186
3187                 tmp_start_idx = start_idx;
3188
3189                 /*
3190                  * first pass through to pull the free pages
3191                  * off of the free queue so that in case we
3192                  * need substitute pages, we won't grab any
3193                  * of the free pages in the run... we'll clear
3194                  * the 'free' bit in the 2nd pass, and even in
3195                  * an abort_run case, we'll collect all of the
3196                  * free pages in this run and return them to the free list
3197                  */
3198                 while (start_idx < page_idx) {
3199
3200                         m1 = &vm_pages[start_idx++];
3201
3202                         if (m1->free) {
3203                                 unsigned int color;
3204
3205                                 color = m1->phys_page & vm_color_mask;
3206                                 queue_remove(&vm_page_queue_free[color],
3207                                              m1,
3208                                              vm_page_t,
3209                                              pageq);
3210
3211                                 vm_page_free_count--;
3212                         }
3213                 }
3214                 /*
3215                  * adjust global freelist counts
3216                  */
3217                 if (vm_page_free_count < vm_page_free_count_minimum)
3218                         vm_page_free_count_minimum = vm_page_free_count;
3219
3220                 /*
3221                  * we can drop the free queue lock at this point since
3222                  * we've pulled any 'free' candidates off of the list
3223                  * we need it dropped so that we can do a vm_page_grab
3224                  * when substituing for pmapped/dirty pages
3225                  */
3226                 mutex_unlock(&vm_page_queue_free_lock);
3227
3228                 start_idx = tmp_start_idx;
3229                 cur_idx = page_idx - 1;
3230
3231                 while (start_idx++ < page_idx) {
3232                         /*
3233                          * must go through the list from back to front
3234                          * so that the page list is created in the
3235                          * correct order - low -> high phys addresses
3236                          */
3237                         m1 = &vm_pages[cur_idx--];
3238
3239                         if (m1->free) {
3240                                 /*
3241                                  * pages have already been removed from
3242                                  * the free list in the 1st pass
3243                                  */
3244                                 assert(m1->free);
3245                                 assert(m1->busy);
3246                                 assert(!m1->wanted);
3247                                 assert(!m1->laundry);
3248                                 m1->free = FALSE;
3249
3250                         } else {
3251                                 vm_object_t object;
3252
3253                                 if (abort_run == TRUE)
3254                                         continue;
3255
3256                                 object = m1->object;
3257
3258                                 if (object != locked_object) {
3259                                         if (locked_object) {
3260                                                 vm_object_unlock(locked_object);
3261                                                 locked_object = VM_OBJECT_NULL;
3262                                         }
3263                                         if (vm_object_lock_try(object))
3264                                                 locked_object = object;
3265                                 }
3266                                 if (locked_object == VM_OBJECT_NULL ||
3267                                     (m1->wire_count || m1->gobbled ||
3268                                      m1->encrypted || m1->encrypted_cleaning || m1->cs_validated || m1->cs_tainted ||
3269                                      m1->error || m1->absent || m1->pageout_queue || m1->laundry || m1->wanted || m1->precious ||
3270                                      m1->cleaning || m1->overwriting || m1->restart || m1->unusual || m1->list_req_pending || m1->busy)) {
3271
3272                                         if (locked_object) {
3273                                                 vm_object_unlock(locked_object);
3274                                                 locked_object = VM_OBJECT_NULL;
3275                                         }
3276                                         tmp_start_idx = cur_idx;
3277                                         abort_run = TRUE;
3278                                         continue;
3279                                 }
3280                                 if (m1->pmapped || m1->dirty) {
3281                                         int refmod;
3282                                         vm_object_offset_t offset;
3283
3284                                         m2 = vm_page_grab();
3285
3286                                         if (m2 == VM_PAGE_NULL) {
3287                                                 if (locked_object) {
3288                                                         vm_object_unlock(locked_object);
3289                                                         locked_object = VM_OBJECT_NULL;
3290                                                 }
3291                                                 tmp_start_idx = cur_idx;
3292                                                 abort_run = TRUE;
3293                                                 continue;
3294                                         }
3295                                         if (m1->pmapped)
3296                                                 refmod = pmap_disconnect(m1->phys_page);
3297                                         else
3298                                                 refmod = 0;
3299                                         vm_page_copy(m1, m2);
3300
3301                                         m2->reference = m1->reference;
3302                                         m2->dirty     = m1->dirty;
3303
3304                                         if (refmod & VM_MEM_REFERENCED)
3305                                                 m2->reference = TRUE;
3306                                         if (refmod & VM_MEM_MODIFIED)
3307                                                 m2->dirty = TRUE;
3308                                         offset = m1->offset;
3309
3310                                         /*
3311                                          * completely cleans up the state
3312                                          * of the page so that it is ready
3313                                          * to be put onto the free list, or
3314                                          * for this purpose it looks like it
3315                                          * just came off of the free list
3316                                          */
3317                                         vm_page_free_prepare(m1);
3318
3319                                         /*
3320                                          * make sure we clear the ref/mod state
3321                                          * from the pmap layer... else we risk
3322                                          * inheriting state from the last time
3323                                          * this page was used...
3324                                          */
3325                                         pmap_clear_refmod(m2->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
3326                                         /*
3327                                          * now put the substitute page on the object
3328                                          */
3329                                         vm_page_insert_internal(m2, locked_object, offset, TRUE);
3330
3331                                         if (m2->reference)
3332                                                 vm_page_activate(m2);
3333                                         else
3334                                                 vm_page_deactivate(m2);
3335
3336                                         PAGE_WAKEUP_DONE(m2);
3337
3338                                 } else {
3339                                         /*
3340                                          * completely cleans up the state
3341                                          * of the page so that it is ready
3342                                          * to be put onto the free list, or
3343                                          * for this purpose it looks like it
3344                                          * just came off of the free list
3345                                          */
3346                                         vm_page_free_prepare(m1);
3347                                 }
3348 #if MACH_ASSERT
3349                                 stolen_pages++;
3350 #endif
3351                         }
3352                         m1->pageq.next = (queue_entry_t) m;
3353                         m1->pageq.prev = NULL;
3354                         m = m1;
3355                 }
3356                 if (locked_object) {
3357                         vm_object_unlock(locked_object);
3358                         locked_object = VM_OBJECT_NULL;
3359                 }
3360
3361                 if (abort_run == TRUE) {
3362                         if (m != VM_PAGE_NULL) {
3363                                 vm_page_free_list(m);
3364                         }
3365 #if MACH_ASSERT
3366                         dumped_run++;
3367 #endif
3368                         /*
3369                          * want the index of the last
3370                          * page in this run that was
3371                          * successfully 'stolen', so back
3372                          * it up 1 for the auto-decrement on use
3373                          * and 1 more to bump back over this page
3374                          */
3375                         page_idx = tmp_start_idx + 2;
3376
3377                         if (page_idx >= vm_pages_count)
3378                                 goto done_scanning;
3379
3380                         mutex_lock(&vm_page_queue_free_lock);
3381
3382                         RESET_STATE_OF_RUN();
3383
3384                         /*
3385                          * reset our free page limit since we
3386                          * dropped the lock protecting the vm_page_free_queue
3387                          */
3388                         free_available = vm_page_free_count - vm_page_free_reserved;
3389
3390                         goto retry;
3391                 }
3392
3393                 for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) {
3394
3395                         if (wire == TRUE)
3396                                 m1->wire_count++;
3397                         else
3398                                 m1->gobbled = TRUE;
3399                 }
3400                 if (wire == FALSE)
3401                         vm_page_gobble_count += npages;
3402
3403                 /*
3404                  * gobbled pages are also counted as wired pages
3405                  */
3406                 vm_page_wire_count += npages;
3407
3408                 assert(vm_page_verify_contiguous(m, npages));
3409         }
3410 done_scanning:
3411         vm_page_unlock_queues();
3412
3413 #if DEBUG
3414         clock_get_system_microtime(&tv_end_sec, &tv_end_usec);
3415
3416         tv_end_sec -= tv_start_sec;
3417         if (tv_end_usec < tv_start_usec) {
3418                 tv_end_sec--;
3419                 tv_end_usec += 1000000;
3420         }
3421         tv_end_usec -= tv_start_usec;
3422         if (tv_end_usec >= 1000000) {
3423                 tv_end_sec++;
3424                 tv_end_sec -= 1000000;
3425         }
3426         printf("vm_find_page_contiguous(num=%d,low=%d): found %d pages in %d.%06ds...  scanned %d pages...  yielded %d times...  dumped run %d times... stole %d pages\n",
3427                contig_pages, max_pnum, npages, tv_end_sec, tv_end_usec, page_idx, yielded, dumped_run, stolen_pages);
3428
3429 #endif
3430 #if MACH_ASSERT
3431         vm_page_verify_free_lists();
3432 #endif
3433         return m;
3434 }
3435
3436 /*
3437  *      Allocate a list of contiguous, wired pages.
3438  */
3439 kern_return_t
3440 cpm_allocate(
3441         vm_size_t       size,
3442         vm_page_t       *list,
3443         ppnum_t         max_pnum,
3444         boolean_t       wire)
3445 {
3446         vm_page_t               pages;
3447         unsigned int            npages;
3448
3449         if (size % page_size != 0)
3450                 return KERN_INVALID_ARGUMENT;
3451
3452         npages = size / page_size;
3453
3454         /*
3455          *      Obtain a pointer to a subset of the free
3456          *      list large enough to satisfy the request;
3457          *      the region will be physically contiguous.
3458          */
3459         pages = vm_page_find_contiguous(npages, max_pnum, wire);
3460
3461         if (pages == VM_PAGE_NULL)
3462                 return KERN_NO_SPACE;
3463         /*
3464          * determine need for wakeups
3465          */
3466         if ((vm_page_free_count < vm_page_free_min) ||
3467             ((vm_page_free_count < vm_page_free_target) &&
3468              ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min)))
3469                 thread_wakeup((event_t) &vm_page_free_wanted);
3470
3471 #if CONFIG_EMBEDDED
3472         {
3473         int                     percent_avail;
3474
3475         /*
3476          * Decide if we need to poke the memorystatus notification thread.
3477          */
3478         percent_avail =
3479                 (vm_page_active_count + vm_page_inactive_count +
3480                  vm_page_speculative_count + vm_page_free_count +
3481                  (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count)  ) * 100 /
3482                 atop_64(max_mem);
3483         if (percent_avail <= (kern_memorystatus_level - 5)) {
3484                 kern_memorystatus_level = percent_avail;
3485                 thread_wakeup((event_t)&kern_memorystatus_wakeup);
3486         }
3487         }
3488 #endif
3489         /*
3490          *      The CPM pages should now be available and
3491          *      ordered by ascending physical address.
3492          */
3493         assert(vm_page_verify_contiguous(pages, npages));
3494
3495         *list = pages;
3496         return KERN_SUCCESS;
3497 }
3498
3499
3500 #include <mach_vm_debug.h>
3501 #if     MACH_VM_DEBUG
3502
3503 #include <mach_debug/hash_info.h>
3504 #include <vm/vm_debug.h>
3505
3506 /*
3507  *      Routine:        vm_page_info
3508  *      Purpose:
3509  *              Return information about the global VP table.
3510  *              Fills the buffer with as much information as possible
3511  *              and returns the desired size of the buffer.
3512  *      Conditions:
3513  *              Nothing locked.  The caller should provide
3514  *              possibly-pageable memory.
3515  */
3516
3517 unsigned int
3518 vm_page_info(
3519         hash_info_bucket_t *info,
3520         unsigned int count)
3521 {
3522         unsigned int i;
3523
3524         if (vm_page_bucket_count < count)
3525                 count = vm_page_bucket_count;
3526
3527         for (i = 0; i < count; i++) {
3528                 vm_page_bucket_t *bucket = &vm_page_buckets[i];
3529                 unsigned int bucket_count = 0;
3530                 vm_page_t m;
3531
3532                 simple_lock(&vm_page_bucket_lock);
3533                 for (m = bucket->pages; m != VM_PAGE_NULL; m = m->next)
3534                         bucket_count++;
3535                 simple_unlock(&vm_page_bucket_lock);
3536
3537                 /* don't touch pageable memory while holding locks */
3538                 info[i].hib_count = bucket_count;
3539         }
3540
3541         return vm_page_bucket_count;
3542 }
3543 #endif  /* MACH_VM_DEBUG */
3544
3545 #include <mach_kdb.h>
3546 #if     MACH_KDB
3547
3548 #include <ddb/db_output.h>
3549 #include <vm/vm_print.h>
3550 #define printf  kdbprintf
3551
3552 /*
3553  *      Routine:        vm_page_print [exported]
3554  */
3555 void
3556 vm_page_print(
3557         db_addr_t       db_addr)
3558 {
3559         vm_page_t       p;
3560
3561         p = (vm_page_t) (long) db_addr;
3562
3563         iprintf("page 0x%x\n", p);
3564
3565         db_indent += 2;
3566
3567         iprintf("object=0x%x", p->object);
3568         printf(", offset=0x%x", p->offset);
3569         printf(", wire_count=%d", p->wire_count);
3570
3571         iprintf("%sinactive, %sactive, %sthrottled, %sgobbled, %slaundry, %sfree, %sref, %sencrypted\n",
3572                 (p->inactive ? "" : "!"),
3573                 (p->active ? "" : "!"),
3574                 (p->throttled ? "" : "!"),
3575                 (p->gobbled ? "" : "!"),
3576                 (p->laundry ? "" : "!"),
3577                 (p->free ? "" : "!"),
3578                 (p->reference ? "" : "!"),
3579                 (p->encrypted ? "" : "!"));
3580         iprintf("%sbusy, %swanted, %stabled, %sfictitious, %sprivate, %sprecious\n",
3581                 (p->busy ? "" : "!"),
3582                 (p->wanted ? "" : "!"),
3583                 (p->tabled ? "" : "!"),
3584                 (p->fictitious ? "" : "!"),
3585                 (p->private ? "" : "!"),
3586                 (p->precious ? "" : "!"));
3587         iprintf("%sabsent, %serror, %sdirty, %scleaning, %spageout, %sclustered\n",
3588                 (p->absent ? "" : "!"),
3589                 (p->error ? "" : "!"),
3590                 (p->dirty ? "" : "!"),
3591                 (p->cleaning ? "" : "!"),
3592                 (p->pageout ? "" : "!"),
3593                 (p->clustered ? "" : "!"));
3594         iprintf("%soverwriting, %srestart, %sunusual\n",
3595                 (p->overwriting ? "" : "!"),
3596                 (p->restart ? "" : "!"),
3597                 (p->unusual ? "" : "!"));
3598
3599         iprintf("phys_page=0x%x", p->phys_page);
3600
3601         db_indent -= 2;
3602 }
3603 #endif  /* MACH_KDB */