1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * @OSF_COPYRIGHT@
32 */
33 /*
34 * Mach Operating System
35 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
36 * All Rights Reserved.
37 *
38 * Permission to use, copy, modify and distribute this software and its
39 * documentation is hereby granted, provided that both the copyright
40 * notice and this permission notice appear in all copies of the
41 * software, derivative works or modified versions, and any portions
42 * thereof, and that both notices appear in supporting documentation.
43 *
44 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
46 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 *
48 * Carnegie Mellon requests users of this software to return to
49 *
50 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
51 * School of Computer Science
52 * Carnegie Mellon University
53 * Pittsburgh PA 15213-3890
54 *
55 * any improvements or extensions that they make and grant Carnegie Mellon
56 * the rights to redistribute these changes.
57 */
58
59 /*
60 * Default Pager.
61 * Paging File Management.
62 */
63
64 #include <mach/host_priv.h>
65 #include <mach/memory_object_control.h>
66 #include <mach/memory_object_server.h>
67 #include <mach/upl.h>
68 #include <default_pager/default_pager_internal.h>
69 #include <default_pager/default_pager_alerts.h>
70 #include <default_pager/default_pager_object_server.h>
71
72 #include <ipc/ipc_types.h>
73 #include <ipc/ipc_port.h>
74 #include <ipc/ipc_space.h>
75
76 #include <kern/kern_types.h>
77 #include <kern/host.h>
78 #include <kern/queue.h>
79 #include <kern/counters.h>
80 #include <kern/sched_prim.h>
81
82 #include <vm/vm_kern.h>
83 #include <vm/vm_pageout.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_protos.h>
87
88 /* LP64todo - need large internal object support */
89
90 /*
91 * ALLOC_STRIDE... the maximum number of bytes allocated from
92 * a swap file before moving on to the next swap file... if
93 * all swap files reside on a single disk, this value should
94 * be very large (this is the default assumption)... if the
95 * swap files are spread across multiple disks, then this value
96 * should be small (128 * 1024)...
97 *
98 * This should be determined dynamically in the future
99 */
100
101 #define ALLOC_STRIDE (1024 * 1024 * 1024)
102 int physical_transfer_cluster_count = 0;
103
104 #define VM_SUPER_CLUSTER 0x40000
105 #define VM_SUPER_PAGES 64
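/*
 * Illustrative relationship between the two constants above (a sketch,
 * assuming the common 4 KB page size): 0x40000 bytes / 4096 bytes per
 * page == 64, so one super-cluster transfer covers exactly
 * VM_SUPER_PAGES pages.
 */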
106
107 /*
108 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
109 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
110 */
111 #define VSTRUCT_DEF_CLSHIFT 2
112 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
113 int default_pager_clsize = 0;
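/*
 * Worked example (assuming 4 KB pages): with VSTRUCT_DEF_CLSHIFT == 2
 * each cluster spans 1 << 2 == 4 pages, i.e. 16 KB of backing store;
 * a shift of 0 would give one page per cluster and a shift of 3 would
 * give 32 KB clusters.
 */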
114
115 /* statistics */
116 unsigned int clustered_writes[VM_SUPER_PAGES+1];
117 unsigned int clustered_reads[VM_SUPER_PAGES+1];
118
119 /*
120 * Globals used for asynchronous paging operations:
121 * vs_async_list: head of list of to-be-completed I/O ops
122 * async_num_queued: number of pages completed, but not yet
123 * processed by async thread.
124 * async_requests_out: number of pages of requests not completed.
125 */
126
127 #if 0
128 struct vs_async *vs_async_list;
129 int async_num_queued;
130 int async_requests_out;
131 #endif
132
133
134 #define VS_ASYNC_REUSE 1
135 struct vs_async *vs_async_free_list;
136
137 mutex_t default_pager_async_lock; /* Protects globals above */
138
139
140 int vs_alloc_async_failed = 0; /* statistics */
141 int vs_alloc_async_count = 0; /* statistics */
142 struct vs_async *vs_alloc_async(void); /* forward */
143 void vs_free_async(struct vs_async *vsa); /* forward */
144
145
146 #define VS_ALLOC_ASYNC() vs_alloc_async()
147 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
148
149 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
150 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
151 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, 0)
152 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
153 /*
154 * Paging Space Hysteresis triggers and the target notification port
155 *
156 */
157
158 unsigned int minimum_pages_remaining = 0;
159 unsigned int maximum_pages_free = 0;
160 ipc_port_t min_pages_trigger_port = NULL;
161 ipc_port_t max_pages_trigger_port = NULL;
162
163 boolean_t bs_low = FALSE;
164 int backing_store_release_trigger_disable = 0;
165
166
167 /* Have we decided if swap needs to be encrypted yet ? */
168 boolean_t dp_encryption_inited = FALSE;
169 /* Should we encrypt swap ? */
170 boolean_t dp_encryption = FALSE;
171
172
173 /*
174 * Object sizes are rounded up to the next power of 2,
175 * unless they are bigger than a given maximum size.
176 */
177 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
178
179 /*
180 * List of all backing store and segments.
181 */
182 struct backing_store_list_head backing_store_list;
183 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
184 mutex_t paging_segments_lock;
185 int paging_segment_max = 0;
186 int paging_segment_count = 0;
187 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
188
189
190 /*
191 * Total pages free in system
192 * This differs from clusters committed/avail, which is a measure of the
193 * over-commitment of paging segments to backing store, an idea which is
194 * likely to be deprecated.
195 */
196 unsigned int dp_pages_free = 0;
197 unsigned int cluster_transfer_minimum = 100;
198
199 /* forward declarations */
200 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int); /* forward */
201 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
202 default_pager_thread_t *get_read_buffer( void );
203 kern_return_t ps_vstruct_transfer_from_segment(
204 vstruct_t vs,
205 paging_segment_t segment,
206 upl_t upl);
207 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
208 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
209 kern_return_t vs_cluster_transfer(
210 vstruct_t vs,
211 upl_offset_t offset,
212 upl_size_t cnt,
213 upl_t upl);
214 vs_map_t vs_get_map_entry(
215 vstruct_t vs,
216 vm_offset_t offset);
217
218
219 default_pager_thread_t *
220 get_read_buffer( void )
221 {
222 int i;
223
224 DPT_LOCK(dpt_lock);
225 while(TRUE) {
226 for (i=0; i<default_pager_internal_count; i++) {
227 if(dpt_array[i]->checked_out == FALSE) {
228 dpt_array[i]->checked_out = TRUE;
229 DPT_UNLOCK(dpt_lock);
230 return dpt_array[i];
231 }
232 }
233 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
234 }
235 }
236
237 void
238 bs_initialize(void)
239 {
240 int i;
241
242 /*
243 * List of all backing store.
244 */
245 BSL_LOCK_INIT();
246 queue_init(&backing_store_list.bsl_queue);
247 PSL_LOCK_INIT();
248
249 VS_ASYNC_LOCK_INIT();
250 #if VS_ASYNC_REUSE
251 vs_async_free_list = NULL;
252 #endif /* VS_ASYNC_REUSE */
253
254 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
255 clustered_writes[i] = 0;
256 clustered_reads[i] = 0;
257 }
258
259 }
260
261 /*
262 * When things do not quite work out...
263 */
264 void bs_no_paging_space(boolean_t); /* forward */
265
266 void
267 bs_no_paging_space(
268 boolean_t out_of_memory)
269 {
270
271 if (out_of_memory)
272 dprintf(("*** OUT OF MEMORY ***\n"));
273 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
274 }
275
276 void bs_more_space(int); /* forward */
277 void bs_commit(int); /* forward */
278
279 boolean_t user_warned = FALSE;
280 unsigned int clusters_committed = 0;
281 unsigned int clusters_available = 0;
282 unsigned int clusters_committed_peak = 0;
283
284 void
285 bs_more_space(
286 int nclusters)
287 {
288 BSL_LOCK();
289 /*
290 * Account for new paging space.
291 */
292 clusters_available += nclusters;
293
294 if (clusters_available >= clusters_committed) {
295 if (verbose && user_warned) {
296 printf("%s%s - %d excess clusters now.\n",
297 my_name,
298 "paging space is OK now",
299 clusters_available - clusters_committed);
300 user_warned = FALSE;
301 clusters_committed_peak = 0;
302 }
303 } else {
304 if (verbose && user_warned) {
305 printf("%s%s - still short of %d clusters.\n",
306 my_name,
307 "WARNING: paging space over-committed",
308 clusters_committed - clusters_available);
309 clusters_committed_peak -= nclusters;
310 }
311 }
312 BSL_UNLOCK();
313
314 return;
315 }
316
317 void
318 bs_commit(
319 int nclusters)
320 {
321 BSL_LOCK();
322 clusters_committed += nclusters;
323 if (clusters_committed > clusters_available) {
324 if (verbose && !user_warned) {
325 user_warned = TRUE;
326 printf("%s%s - short of %d clusters.\n",
327 my_name,
328 "WARNING: paging space over-committed",
329 clusters_committed - clusters_available);
330 }
331 if (clusters_committed > clusters_committed_peak) {
332 clusters_committed_peak = clusters_committed;
333 }
334 } else {
335 if (verbose && user_warned) {
336 printf("%s%s - was short of up to %d clusters.\n",
337 my_name,
338 "paging space is OK now",
339 clusters_committed_peak - clusters_available);
340 user_warned = FALSE;
341 clusters_committed_peak = 0;
342 }
343 }
344 BSL_UNLOCK();
345
346 return;
347 }
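/*
 * Worked example of the hysteresis above: with clusters_available == 100,
 * bs_commit(150) leaves clusters_committed == 150 > 100, so (when verbose)
 * the warning prints "short of 50 clusters" once and the peak is recorded;
 * a later bs_more_space(60) raises clusters_available to 160 >= 150, which
 * clears user_warned and resets clusters_committed_peak to 0.
 */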
348
349 int default_pager_info_verbose = 1;
350
351 void
352 bs_global_info(
353 vm_size_t *totalp,
354 vm_size_t *freep)
355 {
356 vm_size_t pages_total, pages_free;
357 paging_segment_t ps;
358 int i;
359
360 PSL_LOCK();
361 pages_total = pages_free = 0;
362 for (i = 0; i <= paging_segment_max; i++) {
363 ps = paging_segments[i];
364 if (ps == PAGING_SEGMENT_NULL)
365 continue;
366
367 /*
368 * no need to lock: by the time this data
369 * gets back to any remote requestor it
370 * will be obsolete anyways
371 */
372 pages_total += ps->ps_pgnum;
373 pages_free += ps->ps_clcount << ps->ps_clshift;
374 DP_DEBUG(DEBUG_BS_INTERNAL,
375 ("segment #%d: %d total, %d free\n",
376 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
377 }
378 *totalp = pages_total;
379 *freep = pages_free;
380 if (verbose && user_warned && default_pager_info_verbose) {
381 if (clusters_available < clusters_committed) {
382 printf("%s %d clusters committed, %d available.\n",
383 my_name,
384 clusters_committed,
385 clusters_available);
386 }
387 }
388 PSL_UNLOCK();
389 }
390
391 backing_store_t backing_store_alloc(void); /* forward */
392
393 backing_store_t
394 backing_store_alloc(void)
395 {
396 backing_store_t bs;
397
398 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
399 if (bs == BACKING_STORE_NULL)
400 panic("backing_store_alloc: no memory");
401
402 BS_LOCK_INIT(bs);
403 bs->bs_port = MACH_PORT_NULL;
404 bs->bs_priority = 0;
405 bs->bs_clsize = 0;
406 bs->bs_pages_total = 0;
407 bs->bs_pages_in = 0;
408 bs->bs_pages_in_fail = 0;
409 bs->bs_pages_out = 0;
410 bs->bs_pages_out_fail = 0;
411
412 return bs;
413 }
414
415 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
416
417 /* Even in both the component space and external versions of this pager, */
418 /* backing_store_lookup will be called from tasks in the application space */
419 backing_store_t
420 backing_store_lookup(
421 MACH_PORT_FACE port)
422 {
423 backing_store_t bs;
424
425 /*
426 port is currently backed with a vs structure in the alias field;
427 we could create an ISBS alias and a port_is_bs call, but frankly
428 I see no reason for the test: the bs->port == port check below
429 will work properly on junk entries.
430
431 if ((port == MACH_PORT_NULL) || port_is_vs(port))
432 */
433 if ((port == MACH_PORT_NULL))
434 return BACKING_STORE_NULL;
435
436 BSL_LOCK();
437 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
438 bs_links) {
439 BS_LOCK(bs);
440 if (bs->bs_port == port) {
441 BSL_UNLOCK();
442 /* Success, return it locked. */
443 return bs;
444 }
445 BS_UNLOCK(bs);
446 }
447 BSL_UNLOCK();
448 return BACKING_STORE_NULL;
449 }
450
451 void backing_store_add(backing_store_t); /* forward */
452
453 void
454 backing_store_add(
455 __unused backing_store_t bs)
456 {
457 // MACH_PORT_FACE port = bs->bs_port;
458 // MACH_PORT_FACE pset = default_pager_default_set;
459 kern_return_t kr = KERN_SUCCESS;
460
461 if (kr != KERN_SUCCESS)
462 panic("backing_store_add: add to set");
463
464 }
465
466 /*
467 * Set up default page shift, but only if not already
468 * set and argument is within range.
469 */
470 boolean_t
471 bs_set_default_clsize(unsigned int npages)
472 {
473 switch(npages){
474 case 1:
475 case 2:
476 case 4:
477 case 8:
478 if (default_pager_clsize == 0) /* if not yet set */
479 vstruct_def_clshift = local_log2(npages);
480 return(TRUE);
481 }
482 return(FALSE);
483 }
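/*
 * Usage sketch (hypothetical caller, not part of this file): a boot-time
 * argument handler could validate a requested pages-per-cluster value,
 * e.g.
 *
 *	if (!bs_set_default_clsize(requested_npages))	// only 1, 2, 4 or 8
 *		printf("%skeeping cluster shift %d\n",
 *		       my_name, vstruct_def_clshift);
 *
 * where "requested_npages" is a made-up name for the parsed boot value.
 */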
484
485 int bs_get_global_clsize(int clsize); /* forward */
486
487 int
488 bs_get_global_clsize(
489 int clsize)
490 {
491 int i;
492 memory_object_default_t dmm;
493 kern_return_t kr;
494
495 /*
496 * Only allow setting of cluster size once. If called
497 * with no cluster size (default), we use the compiled-in default
498 * for the duration. The same cluster size is used for all
499 * paging segments.
500 */
501 if (default_pager_clsize == 0) {
502 /*
503 * Keep cluster size in bit shift because it's quicker
504 * arithmetic, and easier to keep at a power of 2.
505 */
506 if (clsize != NO_CLSIZE) {
507 for (i = 0; (1 << i) < clsize; i++);
508 if (i > MAX_CLUSTER_SHIFT)
509 i = MAX_CLUSTER_SHIFT;
510 vstruct_def_clshift = i;
511 }
512 default_pager_clsize = (1 << vstruct_def_clshift);
513
514 /*
515 * Let the user know the new (and definitive) cluster size.
516 */
517 if (verbose)
518 printf("%scluster size = %d page%s\n",
519 my_name, default_pager_clsize,
520 (default_pager_clsize == 1) ? "" : "s");
521
522 /*
523 * Let the kernel know too, in case it hasn't used the
524 * default value provided in main() yet.
525 */
526 dmm = default_pager_object;
527 clsize = default_pager_clsize * vm_page_size; /* in bytes */
528 kr = host_default_memory_manager(host_priv_self(),
529 &dmm,
530 clsize);
531 memory_object_default_deallocate(dmm);
532
533 if (kr != KERN_SUCCESS) {
534 panic("bs_get_global_cl_size:host_default_memory_manager");
535 }
536 if (dmm != default_pager_object) {
537 panic("bs_get_global_cl_size:there is another default pager");
538 }
539 }
540 ASSERT(default_pager_clsize > 0 &&
541 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
542
543 return default_pager_clsize;
544 }
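/*
 * Worked example of the shift computation in bs_get_global_clsize: a
 * requested clsize of 4 pages ends the "(1 << i) < clsize" loop at
 * i == 2, so default_pager_clsize becomes 1 << 2 == 4 pages; a
 * non-power-of-two request such as 6 rounds up to i == 3 (8 pages),
 * capped at MAX_CLUSTER_SHIFT.
 */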
545
546 kern_return_t
547 default_pager_backing_store_create(
548 memory_object_default_t pager,
549 int priority,
550 int clsize, /* in bytes */
551 MACH_PORT_FACE *backing_store)
552 {
553 backing_store_t bs;
554 MACH_PORT_FACE port;
555 // kern_return_t kr;
556 struct vstruct_alias *alias_struct;
557
558 if (pager != default_pager_object)
559 return KERN_INVALID_ARGUMENT;
560
561 bs = backing_store_alloc();
562 port = ipc_port_alloc_kernel();
563 ipc_port_make_send(port);
564 assert (port != IP_NULL);
565
566 DP_DEBUG(DEBUG_BS_EXTERNAL,
567 ("priority=%d clsize=%d bs_port=0x%x\n",
568 priority, clsize, (int) backing_store));
569
570 alias_struct = (struct vstruct_alias *)
571 kalloc(sizeof (struct vstruct_alias));
572 if(alias_struct != NULL) {
573 alias_struct->vs = (struct vstruct *)bs;
574 alias_struct->name = ISVS;
575 port->alias = (int) alias_struct;
576 }
577 else {
578 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
579 kfree(bs, sizeof (struct backing_store));
580 return KERN_RESOURCE_SHORTAGE;
581 }
582
583 bs->bs_port = port;
584 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
585 priority = BS_MAXPRI;
586 else if (priority == BS_NOPRI)
587 priority = BS_MAXPRI;
588 else
589 priority = BS_MINPRI;
590 bs->bs_priority = priority;
591
592 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
593
594 BSL_LOCK();
595 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
596 bs_links);
597 BSL_UNLOCK();
598
599 backing_store_add(bs);
600
601 *backing_store = port;
602 return KERN_SUCCESS;
603 }
604
605 kern_return_t
606 default_pager_backing_store_info(
607 MACH_PORT_FACE backing_store,
608 backing_store_flavor_t flavour,
609 backing_store_info_t info,
610 mach_msg_type_number_t *size)
611 {
612 backing_store_t bs;
613 backing_store_basic_info_t basic;
614 int i;
615 paging_segment_t ps;
616
617 if (flavour != BACKING_STORE_BASIC_INFO ||
618 *size < BACKING_STORE_BASIC_INFO_COUNT)
619 return KERN_INVALID_ARGUMENT;
620
621 basic = (backing_store_basic_info_t)info;
622 *size = BACKING_STORE_BASIC_INFO_COUNT;
623
624 VSTATS_LOCK(&global_stats.gs_lock);
625 basic->pageout_calls = global_stats.gs_pageout_calls;
626 basic->pagein_calls = global_stats.gs_pagein_calls;
627 basic->pages_in = global_stats.gs_pages_in;
628 basic->pages_out = global_stats.gs_pages_out;
629 basic->pages_unavail = global_stats.gs_pages_unavail;
630 basic->pages_init = global_stats.gs_pages_init;
631 basic->pages_init_writes= global_stats.gs_pages_init_writes;
632 VSTATS_UNLOCK(&global_stats.gs_lock);
633
634 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
635 return KERN_INVALID_ARGUMENT;
636
637 basic->bs_pages_total = bs->bs_pages_total;
638 PSL_LOCK();
639 bs->bs_pages_free = 0;
640 for (i = 0; i <= paging_segment_max; i++) {
641 ps = paging_segments[i];
642 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
643 PS_LOCK(ps);
644 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
645 PS_UNLOCK(ps);
646 }
647 }
648 PSL_UNLOCK();
649 basic->bs_pages_free = bs->bs_pages_free;
650 basic->bs_pages_in = bs->bs_pages_in;
651 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
652 basic->bs_pages_out = bs->bs_pages_out;
653 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
654
655 basic->bs_priority = bs->bs_priority;
656 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
657
658 BS_UNLOCK(bs);
659
660 return KERN_SUCCESS;
661 }
662
663 int ps_delete(paging_segment_t); /* forward */
664
665 int
666 ps_delete(
667 paging_segment_t ps)
668 {
669 vstruct_t vs;
670 kern_return_t error = KERN_SUCCESS;
671 int vs_count;
672
673 VSL_LOCK(); /* get the lock on the list of vs's */
674
675 /* The lock relationship and sequence are fairly complicated. */
676 /* this code looks at a live list, locking and unlocking the list */
677 /* as it traverses it. It depends on the locking behavior of */
678 /* default_pager_no_senders. no_senders always locks the vstruct */
679 /* targeted for removal before locking the vstruct list. However */
680 /* it will remove that member of the list without locking its */
681 /* neighbors. We can be sure when we hold a lock on a vstruct */
682 /* it cannot be removed from the list but we must hold the list */
683 /* lock to be sure that its pointers to its neighbors are valid. */
684 /* Also, we can hold off destruction of a vstruct when the list */
685 /* lock and the vs locks are not being held by bumping the */
686 /* vs_async_pending count. */
687
688
689 while(backing_store_release_trigger_disable != 0) {
690 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
691 }
692
693 /* we will choose instead to hold a send right */
694 vs_count = vstruct_list.vsl_count;
695 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
696 if(vs == (vstruct_t)&vstruct_list) {
697 VSL_UNLOCK();
698 return KERN_SUCCESS;
699 }
700 VS_LOCK(vs);
701 vs_async_wait(vs); /* wait for any pending async writes */
702 if ((vs_count != 0) && (vs != NULL))
703 vs->vs_async_pending += 1; /* hold parties calling */
704 /* vs_async_wait */
705 VS_UNLOCK(vs);
706 VSL_UNLOCK();
707 while((vs_count != 0) && (vs != NULL)) {
708 /* We take the count of AMO's before beginning the */
709 /* transfer of the target segment. */
710 /* We are guaranteed that the target segment cannot get */
711 /* more users. We also know that queue entries are */
712 /* made at the back of the list. If some of the entries */
713 /* we would check disappear while we are traversing the */
714 /* list then we will either check new entries which */
715 /* do not have any backing store in the target segment */
716 /* or re-check old entries. This might not be optimal */
717 /* but it will always be correct. The alternative is to */
718 /* take a snapshot of the list. */
719 vstruct_t next_vs;
720
721 if(dp_pages_free < cluster_transfer_minimum)
722 error = KERN_FAILURE;
723 else {
724 vm_object_t transfer_object;
725 int count;
726 upl_t upl;
727
728 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
729 count = 0;
730 error = vm_object_upl_request(transfer_object,
731 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
732 &upl, NULL, &count,
733 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
734 | UPL_SET_INTERNAL);
735 if(error == KERN_SUCCESS) {
736 error = ps_vstruct_transfer_from_segment(
737 vs, ps, upl);
738 upl_commit(upl, NULL, 0);
739 upl_deallocate(upl);
740 } else {
741 error = KERN_FAILURE;
742 }
743 vm_object_deallocate(transfer_object);
744 }
745 if(error) {
746 VS_LOCK(vs);
747 vs->vs_async_pending -= 1; /* release vs_async_wait */
748 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
749 vs->vs_waiting_async = FALSE;
750 VS_UNLOCK(vs);
751 thread_wakeup(&vs->vs_async_pending);
752 } else {
753 VS_UNLOCK(vs);
754 }
755 return KERN_FAILURE;
756 }
757
758 VSL_LOCK();
759
760 while(backing_store_release_trigger_disable != 0) {
761 VSL_SLEEP(&backing_store_release_trigger_disable,
762 THREAD_UNINT);
763 }
764
765 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
766 if((next_vs != (vstruct_t)&vstruct_list) &&
767 (vs != next_vs) && (vs_count != 1)) {
768 VS_LOCK(next_vs);
769 vs_async_wait(next_vs); /* wait for any */
770 /* pending async writes */
771 next_vs->vs_async_pending += 1; /* hold parties */
772 /* calling vs_async_wait */
773 VS_UNLOCK(next_vs);
774 }
775 VSL_UNLOCK();
776 VS_LOCK(vs);
777 vs->vs_async_pending -= 1;
778 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
779 vs->vs_waiting_async = FALSE;
780 VS_UNLOCK(vs);
781 thread_wakeup(&vs->vs_async_pending);
782 } else {
783 VS_UNLOCK(vs);
784 }
785 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
786 vs = NULL;
787 else
788 vs = next_vs;
789 vs_count--;
790 }
791 return KERN_SUCCESS;
792 }
793
794
795 kern_return_t
796 default_pager_backing_store_delete(
797 MACH_PORT_FACE backing_store)
798 {
799 backing_store_t bs;
800 int i;
801 paging_segment_t ps;
802 int error;
803 int interim_pages_removed = 0;
804 // kern_return_t kr;
805
806 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
807 return KERN_INVALID_ARGUMENT;
808
809 #if 0
810 /* not implemented */
811 BS_UNLOCK(bs);
812 return KERN_FAILURE;
813 #endif
814
815 restart:
816 PSL_LOCK();
817 error = KERN_SUCCESS;
818 for (i = 0; i <= paging_segment_max; i++) {
819 ps = paging_segments[i];
820 if (ps != PAGING_SEGMENT_NULL &&
821 ps->ps_bs == bs &&
822 ! ps->ps_going_away) {
823 PS_LOCK(ps);
824 /* disable access to this segment */
825 ps->ps_going_away = TRUE;
826 PS_UNLOCK(ps);
827 /*
828 * The "ps" segment is "off-line" now,
829 * we can try and delete it...
830 */
831 if(dp_pages_free < (cluster_transfer_minimum
832 + ps->ps_pgcount)) {
833 error = KERN_FAILURE;
834 PSL_UNLOCK();
835 }
836 else {
837 /* remove all pages associated with the */
838 /* segment from the list of free pages */
839 /* when transfer is through, all target */
840 /* segment pages will appear to be free */
841
842 dp_pages_free -= ps->ps_pgcount;
843 interim_pages_removed += ps->ps_pgcount;
844 PSL_UNLOCK();
845 error = ps_delete(ps);
846 }
847 if (error != KERN_SUCCESS) {
848 /*
849 * We couldn't delete the segment,
850 * probably because there's not enough
851 * virtual memory left.
852 * Re-enable all the segments.
853 */
854 PSL_LOCK();
855 break;
856 }
857 goto restart;
858 }
859 }
860
861 if (error != KERN_SUCCESS) {
862 for (i = 0; i <= paging_segment_max; i++) {
863 ps = paging_segments[i];
864 if (ps != PAGING_SEGMENT_NULL &&
865 ps->ps_bs == bs &&
866 ps->ps_going_away) {
867 PS_LOCK(ps);
868 /* re-enable access to this segment */
869 ps->ps_going_away = FALSE;
870 PS_UNLOCK(ps);
871 }
872 }
873 dp_pages_free += interim_pages_removed;
874 PSL_UNLOCK();
875 BS_UNLOCK(bs);
876 return error;
877 }
878
879 for (i = 0; i <= paging_segment_max; i++) {
880 ps = paging_segments[i];
881 if (ps != PAGING_SEGMENT_NULL &&
882 ps->ps_bs == bs) {
883 if(ps->ps_going_away) {
884 paging_segments[i] = PAGING_SEGMENT_NULL;
885 paging_segment_count--;
886 PS_LOCK(ps);
887 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
888 kfree(ps, sizeof *ps);
889 }
890 }
891 }
892
893 /* Scan the entire ps array separately to make certain we find the */
894 /* proper paging_segment_max */
895 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
896 if(paging_segments[i] != PAGING_SEGMENT_NULL)
897 paging_segment_max = i;
898 }
899
900 PSL_UNLOCK();
901
902 /*
903 * All the segments have been deleted.
904 * We can remove the backing store.
905 */
906
907 /*
908 * Disable lookups of this backing store.
909 */
910 if((void *)bs->bs_port->alias != NULL)
911 kfree((void *) bs->bs_port->alias,
912 sizeof (struct vstruct_alias));
913 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
914 bs->bs_port = MACH_PORT_NULL;
915 BS_UNLOCK(bs);
916
917 /*
918 * Remove backing store from backing_store list.
919 */
920 BSL_LOCK();
921 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
922 bs_links);
923 BSL_UNLOCK();
924
925 /*
926 * Free the backing store structure.
927 */
928 kfree(bs, sizeof *bs);
929
930 return KERN_SUCCESS;
931 }
932
933 int ps_enter(paging_segment_t); /* forward */
934
935 int
936 ps_enter(
937 paging_segment_t ps)
938 {
939 int i;
940
941 PSL_LOCK();
942
943 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
944 if (paging_segments[i] == PAGING_SEGMENT_NULL)
945 break;
946 }
947
948 if (i < MAX_NUM_PAGING_SEGMENTS) {
949 paging_segments[i] = ps;
950 if (i > paging_segment_max)
951 paging_segment_max = i;
952 paging_segment_count++;
953 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
954 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
955 ps_select_array[ps->ps_bs->bs_priority] = 0;
956 i = 0;
957 } else {
958 PSL_UNLOCK();
959 return KERN_RESOURCE_SHORTAGE;
960 }
961
962 PSL_UNLOCK();
963 return i;
964 }
965
966 #ifdef DEVICE_PAGING
967 kern_return_t
968 default_pager_add_segment(
969 MACH_PORT_FACE backing_store,
970 MACH_PORT_FACE device,
971 recnum_t offset,
972 recnum_t count,
973 int record_size)
974 {
975 backing_store_t bs;
976 paging_segment_t ps;
977 int i;
978 int error;
979
980 if ((bs = backing_store_lookup(backing_store))
981 == BACKING_STORE_NULL)
982 return KERN_INVALID_ARGUMENT;
983
984 PSL_LOCK();
985 for (i = 0; i <= paging_segment_max; i++) {
986 ps = paging_segments[i];
987 if (ps == PAGING_SEGMENT_NULL)
988 continue;
989
990 /*
991 * Check for overlap on same device.
992 */
993 if (!(ps->ps_device != device
994 || offset >= ps->ps_offset + ps->ps_recnum
995 || offset + count <= ps->ps_offset)) {
996 PSL_UNLOCK();
997 BS_UNLOCK(bs);
998 return KERN_INVALID_ARGUMENT;
999 }
1000 }
1001 PSL_UNLOCK();
1002
1003 /*
1004 * Set up the paging segment
1005 */
1006 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1007 if (ps == PAGING_SEGMENT_NULL) {
1008 BS_UNLOCK(bs);
1009 return KERN_RESOURCE_SHORTAGE;
1010 }
1011
1012 ps->ps_segtype = PS_PARTITION;
1013 ps->ps_device = device;
1014 ps->ps_offset = offset;
1015 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1016 ps->ps_recnum = count;
1017 ps->ps_pgnum = count >> ps->ps_record_shift;
1018
1019 ps->ps_pgcount = ps->ps_pgnum;
1020 ps->ps_clshift = local_log2(bs->bs_clsize);
1021 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1022 ps->ps_hint = 0;
1023
1024 PS_LOCK_INIT(ps);
1025 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1026 if (!ps->ps_bmap) {
1027 kfree(ps, sizeof *ps);
1028 BS_UNLOCK(bs);
1029 return KERN_RESOURCE_SHORTAGE;
1030 }
1031 for (i = 0; i < ps->ps_ncls; i++) {
1032 clrbit(ps->ps_bmap, i);
1033 }
1034
1035 ps->ps_going_away = FALSE;
1036 ps->ps_bs = bs;
1037
1038 if ((error = ps_enter(ps)) != 0) {
1039 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1040 kfree(ps, sizeof *ps);
1041 BS_UNLOCK(bs);
1042 return KERN_RESOURCE_SHORTAGE;
1043 }
1044
1045 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1046 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1047 BS_UNLOCK(bs);
1048
1049 PSL_LOCK();
1050 dp_pages_free += ps->ps_pgcount;
1051 PSL_UNLOCK();
1052
1053 bs_more_space(ps->ps_clcount);
1054
1055 DP_DEBUG(DEBUG_BS_INTERNAL,
1056 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1057 device, offset, count, record_size,
1058 ps->ps_record_shift, ps->ps_pgnum));
1059
1060 return KERN_SUCCESS;
1061 }
1062
1063 boolean_t
1064 bs_add_device(
1065 char *dev_name,
1066 MACH_PORT_FACE master)
1067 {
1068 security_token_t null_security_token = {
1069 { 0, 0 }
1070 };
1071 MACH_PORT_FACE device;
1072 int info[DEV_GET_SIZE_COUNT];
1073 mach_msg_type_number_t info_count;
1074 MACH_PORT_FACE bs = MACH_PORT_NULL;
1075 unsigned int rec_size;
1076 recnum_t count;
1077 int clsize;
1078 MACH_PORT_FACE reply_port;
1079
1080 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1081 null_security_token, dev_name, &device))
1082 return FALSE;
1083
1084 info_count = DEV_GET_SIZE_COUNT;
1085 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1086 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1087 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1088 clsize = bs_get_global_clsize(0);
1089 if (!default_pager_backing_store_create(
1090 default_pager_object,
1091 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1092 (clsize * vm_page_size),
1093 &bs)) {
1094 if (!default_pager_add_segment(bs, device,
1095 0, count, rec_size)) {
1096 return TRUE;
1097 }
1098 ipc_port_release_receive(bs);
1099 }
1100 }
1101
1102 ipc_port_release_send(device);
1103 return FALSE;
1104 }
1105 #endif /* DEVICE_PAGING */
1106
1107 #if VS_ASYNC_REUSE
1108
1109 struct vs_async *
1110 vs_alloc_async(void)
1111 {
1112 struct vs_async *vsa;
1113 MACH_PORT_FACE reply_port;
1114 // kern_return_t kr;
1115
1116 VS_ASYNC_LOCK();
1117 if (vs_async_free_list == NULL) {
1118 VS_ASYNC_UNLOCK();
1119 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1120 if (vsa != NULL) {
1121 /*
1122 * Try allocating a reply port named after the
1123 * address of the vs_async structure.
1124 */
1125 struct vstruct_alias *alias_struct;
1126
1127 reply_port = ipc_port_alloc_kernel();
1128 alias_struct = (struct vstruct_alias *)
1129 kalloc(sizeof (struct vstruct_alias));
1130 if(alias_struct != NULL) {
1131 alias_struct->vs = (struct vstruct *)vsa;
1132 alias_struct->name = ISVS;
1133 reply_port->alias = (int) alias_struct;
1134 vsa->reply_port = reply_port;
1135 vs_alloc_async_count++;
1136 }
1137 else {
1138 vs_alloc_async_failed++;
1139 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1140 (reply_port));
1141 kfree(vsa, sizeof (struct vs_async));
1142 vsa = NULL;
1143 }
1144 }
1145 } else {
1146 vsa = vs_async_free_list;
1147 vs_async_free_list = vs_async_free_list->vsa_next;
1148 VS_ASYNC_UNLOCK();
1149 }
1150
1151 return vsa;
1152 }
1153
1154 void
1155 vs_free_async(
1156 struct vs_async *vsa)
1157 {
1158 VS_ASYNC_LOCK();
1159 vsa->vsa_next = vs_async_free_list;
1160 vs_async_free_list = vsa;
1161 VS_ASYNC_UNLOCK();
1162 }
1163
1164 #else /* VS_ASYNC_REUSE */
1165
1166 struct vs_async *
1167 vs_alloc_async(void)
1168 {
1169 struct vs_async *vsa;
1170 MACH_PORT_FACE reply_port;
1171 kern_return_t kr;
struct vstruct_alias *alias_struct;
1172
1173 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1174 if (vsa != NULL) {
1175 /*
1176 * Try allocating a reply port named after the
1177 * address of the vs_async structure.
1178 */
1179 reply_port = ipc_port_alloc_kernel();
1180 alias_struct = (struct vstruct_alias *)
1181 kalloc(sizeof (struct vstruct_alias));
1182 if(alias_struct != NULL) {
1183 alias_struct->vs = (struct vstruct *)vsa;
1184 alias_struct->name = ISVS;
1185 reply_port->alias = (int) alias_struct;
1186 vsa->reply_port = reply_port;
1187 vs_alloc_async_count++;
1188 }
1189 else {
1190 vs_alloc_async_failed++;
1191 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1192 (reply_port));
1193 kfree(vsa, sizeof (struct vs_async));
1194 vsa = NULL;
1195 }
1196 }
1197
1198 return vsa;
1199 }
1200
1201 void
1202 vs_free_async(
1203 struct vs_async *vsa)
1204 {
1205 MACH_PORT_FACE reply_port;
1206 kern_return_t kr;
1207
1208 reply_port = vsa->reply_port;
1209 kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1210 kfree(vsa, sizeof (struct vs_async));
1211 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1212 #if 0
1213 VS_ASYNC_LOCK();
1214 vs_alloc_async_count--;
1215 VS_ASYNC_UNLOCK();
1216 #endif
1217 }
1218
1219 #endif /* VS_ASYNC_REUSE */
1220
1221 zone_t vstruct_zone;
1222
1223 vstruct_t
1224 ps_vstruct_create(
1225 vm_size_t size)
1226 {
1227 vstruct_t vs;
1228 unsigned int i;
1229
1230 vs = (vstruct_t) zalloc(vstruct_zone);
1231 if (vs == VSTRUCT_NULL) {
1232 return VSTRUCT_NULL;
1233 }
1234
1235 VS_LOCK_INIT(vs);
1236
1237 /*
1238 * The following fields will be provided later.
1239 */
1240 vs->vs_mem_obj = NULL;
1241 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1242 vs->vs_references = 1;
1243 vs->vs_seqno = 0;
1244
1245 #ifdef MACH_KERNEL
1246 vs->vs_waiting_seqno = FALSE;
1247 vs->vs_waiting_read = FALSE;
1248 vs->vs_waiting_write = FALSE;
1249 vs->vs_waiting_async = FALSE;
1250 #else
1251 mutex_init(&vs->vs_waiting_seqno, 0);
1252 mutex_init(&vs->vs_waiting_read, 0);
1253 mutex_init(&vs->vs_waiting_write, 0);
1254 mutex_init(&vs->vs_waiting_refs, 0);
1255 mutex_init(&vs->vs_waiting_async, 0);
1256 #endif
1257
1258 vs->vs_readers = 0;
1259 vs->vs_writers = 0;
1260
1261 vs->vs_errors = 0;
1262
1263 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1264 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1265 vs->vs_async_pending = 0;
1266
1267 /*
1268 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1269 * depending on the size of the memory object.
1270 */
1271 if (INDIRECT_CLMAP(vs->vs_size)) {
1272 vs->vs_imap = (struct vs_map **)
1273 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1274 vs->vs_indirect = TRUE;
1275 } else {
1276 vs->vs_dmap = (struct vs_map *)
1277 kalloc(CLMAP_SIZE(vs->vs_size));
1278 vs->vs_indirect = FALSE;
1279 }
1280 vs->vs_xfer_pending = FALSE;
1281 DP_DEBUG(DEBUG_VS_INTERNAL,
1282 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1283
1284 /*
1285 * Check to see that we got the space.
1286 */
1287 if (!vs->vs_dmap) {
1288 kfree(vs, sizeof *vs);
1289 return VSTRUCT_NULL;
1290 }
1291
1292 /*
1293 * Zero the indirect pointers, or clear the direct pointers.
1294 */
1295 if (vs->vs_indirect)
1296 memset(vs->vs_imap, 0,
1297 INDIRECT_CLMAP_SIZE(vs->vs_size));
1298 else
1299 for (i = 0; i < vs->vs_size; i++)
1300 VSM_CLR(vs->vs_dmap[i]);
1301
1302 VS_MAP_LOCK_INIT(vs);
1303
1304 bs_commit(vs->vs_size);
1305
1306 return vs;
1307 }
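/*
 * Worked example of the vs_size computation above (assuming 4 KB pages
 * and a cluster shift of 2): a 100 KB object rounds to 25 pages, so
 * vs_size == ((25 - 1) >> 2) + 1 == 7 clusters of 4 pages each, enough
 * to cover the object with the last cluster only partly used.
 */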
1308
1309 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1310
1311 paging_segment_t
1312 ps_select_segment(
1313 unsigned int shift,
1314 int *psindex)
1315 {
1316 paging_segment_t ps;
1317 int i;
1318 int j;
1319
1320 /*
1321 * Optimize case where there's only one segment.
1322 * paging_segment_max will index the one and only segment.
1323 */
1324
1325 PSL_LOCK();
1326 if (paging_segment_count == 1) {
1327 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1328 ipc_port_t trigger = IP_NULL;
1329
1330 ps = paging_segments[paging_segment_max];
1331 *psindex = paging_segment_max;
1332 PS_LOCK(ps);
1333 if (ps->ps_going_away) {
1334 /* this segment is being turned off */
1335 lps = PAGING_SEGMENT_NULL;
1336 } else {
1337 ASSERT(ps->ps_clshift >= shift);
1338 if (ps->ps_clcount) {
1339 ps->ps_clcount--;
1340 dp_pages_free -= 1 << ps->ps_clshift;
1341 if(min_pages_trigger_port &&
1342 (dp_pages_free < minimum_pages_remaining)) {
1343 trigger = min_pages_trigger_port;
1344 min_pages_trigger_port = NULL;
1345 bs_low = TRUE;
1346 }
1347 lps = ps;
1348 } else
1349 lps = PAGING_SEGMENT_NULL;
1350 }
1351 PS_UNLOCK(ps);
1352 PSL_UNLOCK();
1353
1354 if (trigger != IP_NULL) {
1355 default_pager_space_alert(trigger, HI_WAT_ALERT);
1356 ipc_port_release_send(trigger);
1357 }
1358 return lps;
1359 }
1360
1361 if (paging_segment_count == 0) {
1362 PSL_UNLOCK();
1363 return PAGING_SEGMENT_NULL;
1364 }
1365
1366 for (i = BS_MAXPRI;
1367 i >= BS_MINPRI; i--) {
1368 int start_index;
1369
1370 if ((ps_select_array[i] == BS_NOPRI) ||
1371 (ps_select_array[i] == BS_FULLPRI))
1372 continue;
1373 start_index = ps_select_array[i];
1374
1375 if(!(paging_segments[start_index])) {
1376 j = start_index+1;
1377 physical_transfer_cluster_count = 0;
1378 }
1379 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1380 (((paging_segments[start_index])->ps_clshift)
1381 + vm_page_shift))) {
1382 physical_transfer_cluster_count = 0;
1383 j = start_index + 1;
1384 } else {
1385 physical_transfer_cluster_count+=1;
1386 j = start_index;
1387 if(start_index == 0)
1388 start_index = paging_segment_max;
1389 else
1390 start_index = start_index - 1;
1391 }
1392
1393 while (1) {
1394 if (j > paging_segment_max)
1395 j = 0;
1396 if ((ps = paging_segments[j]) &&
1397 (ps->ps_bs->bs_priority == i)) {
1398 /*
1399 * Force the ps cluster size to be
1400 * >= that of the vstruct.
1401 */
1402 PS_LOCK(ps);
1403 if (ps->ps_going_away) {
1404 /* this segment is being turned off */
1405 } else if ((ps->ps_clcount) &&
1406 (ps->ps_clshift >= shift)) {
1407 ipc_port_t trigger = IP_NULL;
1408
1409 ps->ps_clcount--;
1410 dp_pages_free -= 1 << ps->ps_clshift;
1411 if(min_pages_trigger_port &&
1412 (dp_pages_free <
1413 minimum_pages_remaining)) {
1414 trigger = min_pages_trigger_port;
1415 min_pages_trigger_port = NULL;
1416 }
1417 PS_UNLOCK(ps);
1418 /*
1419 * found one, quit looking.
1420 */
1421 ps_select_array[i] = j;
1422 PSL_UNLOCK();
1423
1424 if (trigger != IP_NULL) {
1425 default_pager_space_alert(
1426 trigger,
1427 HI_WAT_ALERT);
1428 ipc_port_release_send(trigger);
1429 }
1430 *psindex = j;
1431 return ps;
1432 }
1433 PS_UNLOCK(ps);
1434 }
1435 if (j == start_index) {
1436 /*
1437 * none at this priority -- mark it full
1438 */
1439 ps_select_array[i] = BS_FULLPRI;
1440 break;
1441 }
1442 j++;
1443 }
1444 }
1445 PSL_UNLOCK();
1446 return PAGING_SEGMENT_NULL;
1447 }
1448
1449 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1450
1451 vm_offset_t
1452 ps_allocate_cluster(
1453 vstruct_t vs,
1454 int *psindex,
1455 paging_segment_t use_ps)
1456 {
1457 unsigned int byte_num;
1458 int bit_num = 0;
1459 paging_segment_t ps;
1460 vm_offset_t cluster;
1461 ipc_port_t trigger = IP_NULL;
1462
1463 /*
1464 * Find best paging segment.
1465 * ps_select_segment will decrement cluster count on ps.
1466 * Must pass cluster shift to find the most appropriate segment.
1467 */
1468 /* NOTE: The addition of paging segment delete capability threatened
1469 * to seriously complicate the treatment of paging segments in this
1470 * module and the ones that call it (notably ps_clmap), because of the
1471 * difficulty in assuring that the paging segment would continue to
1472 * exist between being unlocked and locked. This was
1473 * avoided because all calls to this module are based either in
1474 * dp_memory_object calls, which rely on the vs lock, or in
1475 * the transfer function that is part of the segment delete path.
1476 * The transfer function which is part of paging segment delete is
1477 * protected from multiple callers by the backing store lock.
1478 * The paging segment delete function treats mappings to a paging
1479 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1480 * while data is transferred to the remaining segments. This is in
1481 * line with the view that incomplete or in-transition mappings between
1482 * data, a vstruct, and backing store are protected by the vs lock.
1483 * This and the ordering of the paging segment "going_away" bit setting
1484 * protects us.
1485 */
1486 if (use_ps != PAGING_SEGMENT_NULL) {
1487 ps = use_ps;
1488 PSL_LOCK();
1489 PS_LOCK(ps);
1490
1491 ASSERT(ps->ps_clcount != 0);
1492
1493 ps->ps_clcount--;
1494 dp_pages_free -= 1 << ps->ps_clshift;
1495 if(min_pages_trigger_port &&
1496 (dp_pages_free < minimum_pages_remaining)) {
1497 trigger = min_pages_trigger_port;
1498 min_pages_trigger_port = NULL;
1499 }
1500 PSL_UNLOCK();
1501 PS_UNLOCK(ps);
1502 if (trigger != IP_NULL) {
1503 default_pager_space_alert(trigger, HI_WAT_ALERT);
1504 ipc_port_release_send(trigger);
1505 }
1506
1507 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1508 PAGING_SEGMENT_NULL) {
1509 static uint32_t lastnotify = 0;
1510 uint32_t now, nanoseconds_dummy;
1511
1512 /*
1513 * Emit a notification of the low-paging resource condition
1514 * but don't issue it more than once every five seconds. This
1515 * prevents us from overflowing logs with thousands of
1516 * repetitions of the message.
1517 */
1518 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1519 if (now > lastnotify + 5) {
1520 dprintf(("no space in available paging segments\n"));
1521 lastnotify = now;
1522 }
1523
1524 /* the count may have drifted; reset it to zero */
1525 PSL_LOCK();
1526 dp_pages_free = 0;
1527 if(min_pages_trigger_port) {
1528 trigger = min_pages_trigger_port;
1529 min_pages_trigger_port = NULL;
1530 bs_low = TRUE;
1531 }
1532 PSL_UNLOCK();
1533 if (trigger != IP_NULL) {
1534 default_pager_space_alert(trigger, HI_WAT_ALERT);
1535 ipc_port_release_send(trigger);
1536 }
1537 return (vm_offset_t) -1;
1538 }
1539
1540 /*
1541 * Look for an available cluster. At the end of the loop,
1542 * byte_num is the byte offset and bit_num is the bit offset of the
1543 * first zero bit in the paging segment bitmap.
1544 */
1545 PS_LOCK(ps);
1546 byte_num = ps->ps_hint;
1547 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1548 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1549 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1550 if (isclr((ps->ps_bmap + byte_num), bit_num))
1551 break;
1552 }
1553 ASSERT(bit_num != NBBY);
1554 break;
1555 }
1556 }
1557 ps->ps_hint = byte_num;
1558 cluster = (byte_num*NBBY) + bit_num;
1559
1560 /* Space was reserved, so this must be true */
1561 ASSERT(cluster < ps->ps_ncls);
1562
1563 setbit(ps->ps_bmap, cluster);
1564 PS_UNLOCK(ps);
1565
1566 return cluster;
1567 }
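/*
 * Worked example of the bitmap scan above (assuming NBBY == 8): if
 * ps_hint is 3 and the first byte containing a clear bit is
 * ps_bmap[5] == 0xef (bit 4 clear), the scan ends with byte_num == 5
 * and bit_num == 4, so the cluster allocated is 5 * 8 + 4 == 44 and
 * ps_hint is updated to 5.
 */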
1568
1569 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1570
1571 void
1572 ps_deallocate_cluster(
1573 paging_segment_t ps,
1574 vm_offset_t cluster)
1575 {
1576
1577 if (cluster >= (vm_offset_t) ps->ps_ncls)
1578 panic("ps_deallocate_cluster: Invalid cluster number");
1579
1580 /*
1581 * Lock the paging segment, clear the cluster's bitmap and increment the
1582 * number of free clusters.
1583 */
1584 PSL_LOCK();
1585 PS_LOCK(ps);
1586 clrbit(ps->ps_bmap, cluster);
1587 ++ps->ps_clcount;
1588 dp_pages_free += 1 << ps->ps_clshift;
1589 PSL_UNLOCK();
1590
1591 /*
1592 * Move the hint down to the freed cluster if it is
1593 * less than the current hint.
1594 */
1595 if ((cluster/NBBY) < ps->ps_hint) {
1596 ps->ps_hint = (cluster/NBBY);
1597 }
1598
1599 PS_UNLOCK(ps);
1600
1601 /*
1602 * If we're freeing space on a full priority, reset the array.
1603 */
1604 PSL_LOCK();
1605 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1606 ps_select_array[ps->ps_bs->bs_priority] = 0;
1607 PSL_UNLOCK();
1608
1609 return;
1610 }
1611
1612 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1613
1614 void
1615 ps_dealloc_vsmap(
1616 struct vs_map *vsmap,
1617 vm_size_t size)
1618 {
1619 unsigned int i;
1620 for (i = 0; i < size; i++)
1621 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1622 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1623 VSM_CLOFF(vsmap[i]));
1624 }
1625
1626 void
1627 ps_vstruct_dealloc(
1628 vstruct_t vs)
1629 {
1630 unsigned int i;
1631 // spl_t s;
1632
1633 VS_MAP_LOCK(vs);
1634
1635 /*
1636 * If this is an indirect structure, then we walk through the valid
1637 * (non-zero) indirect pointers and deallocate the clusters
1638 * associated with each used map entry (via ps_dealloc_vsmap).
1639 * When all of the clusters in an indirect block have been
1640 * freed, we deallocate the block. When all of the indirect
1641 * blocks have been deallocated we deallocate the memory
1642 * holding the indirect pointers.
1643 */
1644 if (vs->vs_indirect) {
1645 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1646 if (vs->vs_imap[i] != NULL) {
1647 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1648 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1649 }
1650 }
1651 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1652 } else {
1653 /*
1654 * Direct map. Free used clusters, then memory.
1655 */
1656 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1657 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1658 }
1659 VS_MAP_UNLOCK(vs);
1660
1661 bs_commit(- vs->vs_size);
1662
1663 zfree(vstruct_zone, vs);
1664 }
1665
1666 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1667
1668 int ps_map_extend(
1669 vstruct_t vs,
1670 unsigned int new_size)
1671 {
1672 struct vs_map **new_imap;
1673 struct vs_map *new_dmap = NULL;
1674 int newdsize;
1675 int i;
1676 void *old_map = NULL;
1677 int old_map_size = 0;
1678
1679 if (vs->vs_size >= new_size) {
1680 /*
1681 * Someone has already done the work.
1682 */
1683 return 0;
1684 }
1685
1686 /*
1687 * If the new size extends into the indirect range, then we have one
1688 * of two cases: we are going from indirect to indirect, or we are
1689 * going from direct to indirect. If we are going from indirect to
1690 * indirect, then it is possible that the new size will fit in the old
1691 * indirect map. If this is the case, then just reset the size of the
1692 * vstruct map and we are done. If the new size will not
1693 * fit into the old indirect map, then we have to allocate a new
1694 * indirect map and copy the old map pointers into this new map.
1695 *
1696 * If we are going from direct to indirect, then we have to allocate a
1697 * new indirect map and copy the old direct pages into the first
1698 * indirect page of the new map.
1699 * NOTE: allocating memory here is dangerous, as we're in the
1700 * pageout path.
1701 */
1702 if (INDIRECT_CLMAP(new_size)) {
1703 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1704
1705 /*
1706 * Get a new indirect map and zero it.
1707 */
1708 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1709 if (vs->vs_indirect &&
1710 (new_map_size == old_map_size)) {
1711 bs_commit(new_size - vs->vs_size);
1712 vs->vs_size = new_size;
1713 return 0;
1714 }
1715
1716 new_imap = (struct vs_map **)kalloc(new_map_size);
1717 if (new_imap == NULL) {
1718 return -1;
1719 }
1720 memset(new_imap, 0, new_map_size);
1721
1722 if (vs->vs_indirect) {
1723 /* Copy old entries into new map */
1724 memcpy(new_imap, vs->vs_imap, old_map_size);
1725 /* Arrange to free the old map */
1726 old_map = (void *) vs->vs_imap;
1727 newdsize = 0;
1728 } else { /* Old map was a direct map */
1729 /* Allocate an indirect page */
1730 if ((new_imap[0] = (struct vs_map *)
1731 kalloc(CLMAP_THRESHOLD)) == NULL) {
1732 kfree(new_imap, new_map_size);
1733 return -1;
1734 }
1735 new_dmap = new_imap[0];
1736 newdsize = CLMAP_ENTRIES;
1737 }
1738 } else {
1739 new_imap = NULL;
1740 newdsize = new_size;
1741 /*
1742 * If the new map is a direct map, then the old map must
1743 * also have been a direct map. All we have to do is
1744 * to allocate a new direct map, copy the old entries
1745 * into it and free the old map.
1746 */
1747 if ((new_dmap = (struct vs_map *)
1748 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1749 return -1;
1750 }
1751 }
1752 if (newdsize) {
1753
1754 /* Free the old map */
1755 old_map = (void *) vs->vs_dmap;
1756 old_map_size = CLMAP_SIZE(vs->vs_size);
1757
1758 /* Copy info from the old map into the new map */
1759 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1760
1761 /* Initialize the rest of the new map */
1762 for (i = vs->vs_size; i < newdsize; i++)
1763 VSM_CLR(new_dmap[i]);
1764 }
1765 if (new_imap) {
1766 vs->vs_imap = new_imap;
1767 vs->vs_indirect = TRUE;
1768 } else
1769 vs->vs_dmap = new_dmap;
1770 bs_commit(new_size - vs->vs_size);
1771 vs->vs_size = new_size;
1772 if (old_map)
1773 kfree(old_map, old_map_size);
1774 return 0;
1775 }
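/*
 * Growth sketch for ps_map_extend: while INDIRECT_CLMAP(new_size) is
 * false the map remains a single direct array of vs_map entries; once
 * the object crosses that threshold the old direct array is copied into
 * the first indirect page and vs_imap takes over, after which lookups
 * use the two-level walk performed by ps_clmap below.
 */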
1776
1777 vm_offset_t
1778 ps_clmap(
1779 vstruct_t vs,
1780 vm_offset_t offset,
1781 struct clmap *clmap,
1782 int flag,
1783 vm_size_t size,
1784 int error)
1785 {
1786 vm_offset_t cluster; /* The cluster of offset. */
1787 vm_offset_t newcl; /* The new cluster allocated. */
1788 vm_offset_t newoff;
1789 unsigned int i;
1790 struct vs_map *vsmap;
1791
1792 VS_MAP_LOCK(vs);
1793
1794 ASSERT(vs->vs_dmap);
1795 cluster = atop_32(offset) >> vs->vs_clshift;
1796
1797 /*
1798 * Initialize cluster error value
1799 */
1800 clmap->cl_error = 0;
1801
1802 /*
1803 * If the object has grown, extend the page map.
1804 */
1805 if (cluster >= vs->vs_size) {
1806 if (flag == CL_FIND) {
1807 /* Do not allocate if just doing a lookup */
1808 VS_MAP_UNLOCK(vs);
1809 return (vm_offset_t) -1;
1810 }
1811 if (ps_map_extend(vs, cluster + 1)) {
1812 VS_MAP_UNLOCK(vs);
1813 return (vm_offset_t) -1;
1814 }
1815 }
1816
1817 /*
1818 * Look for the desired cluster. If the map is indirect, then we
1819 * have a two level lookup. First find the indirect block, then
1820 * find the actual cluster. If the indirect block has not yet
1821 * been allocated, then do so. If the cluster has not yet been
1822 * allocated, then do so.
1823 *
1824 * If any of the allocations fail, then return an error.
1825 * Don't allocate if just doing a lookup.
1826 */
1827 if (vs->vs_indirect) {
1828 long ind_block = cluster/CLMAP_ENTRIES;
1829
1830 /* Is the indirect block allocated? */
1831 vsmap = vs->vs_imap[ind_block];
1832 if (vsmap == NULL) {
1833 if (flag == CL_FIND) {
1834 VS_MAP_UNLOCK(vs);
1835 return (vm_offset_t) -1;
1836 }
1837
1838 /* Allocate the indirect block */
1839 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1840 if (vsmap == NULL) {
1841 VS_MAP_UNLOCK(vs);
1842 return (vm_offset_t) -1;
1843 }
1844 /* Initialize the cluster offsets */
1845 for (i = 0; i < CLMAP_ENTRIES; i++)
1846 VSM_CLR(vsmap[i]);
1847 vs->vs_imap[ind_block] = vsmap;
1848 }
1849 } else
1850 vsmap = vs->vs_dmap;
1851
1852 ASSERT(vsmap);
1853 vsmap += cluster%CLMAP_ENTRIES;
1854
1855 /*
1856 * At this point, vsmap points to the struct vs_map desired.
1857 *
1858 * Look in the map for the cluster, if there was an error on a
1859 * previous write, flag it and return. If it is not yet
1860 * allocated, then allocate it, if we're writing; if we're
1861 * doing a lookup and the cluster's not allocated, return error.
1862 */
1863 if (VSM_ISERR(*vsmap)) {
1864 clmap->cl_error = VSM_GETERR(*vsmap);
1865 VS_MAP_UNLOCK(vs);
1866 return (vm_offset_t) -1;
1867 } else if (VSM_ISCLR(*vsmap)) {
1868 int psindex;
1869
1870 if (flag == CL_FIND) {
1871 /*
1872 * If there's an error and the entry is clear, then
1873 * we've run out of swap space. Record the error
1874 * here and return.
1875 */
1876 if (error) {
1877 VSM_SETERR(*vsmap, error);
1878 }
1879 VS_MAP_UNLOCK(vs);
1880 return (vm_offset_t) -1;
1881 } else {
1882 /*
1883 * Attempt to allocate a cluster from the paging segment
1884 */
1885 newcl = ps_allocate_cluster(vs, &psindex,
1886 PAGING_SEGMENT_NULL);
1887 if (newcl == (vm_offset_t) -1) {
1888 VS_MAP_UNLOCK(vs);
1889 return (vm_offset_t) -1;
1890 }
1891 VSM_CLR(*vsmap);
1892 VSM_SETCLOFF(*vsmap, newcl);
1893 VSM_SETPS(*vsmap, psindex);
1894 }
1895 } else
1896 newcl = VSM_CLOFF(*vsmap);
1897
1898 /*
1899 * Fill in pertinent fields of the clmap
1900 */
1901 clmap->cl_ps = VSM_PS(*vsmap);
1902 clmap->cl_numpages = VSCLSIZE(vs);
1903 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1904
1905 /*
1906 * Byte offset in paging segment is byte offset to cluster plus
1907 * byte offset within cluster. It looks ugly, but should be
1908 * relatively quick.
1909 */
1910 ASSERT(trunc_page(offset) == offset);
1911 newcl = ptoa_32(newcl) << vs->vs_clshift;
1912 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1913 if (flag == CL_ALLOC) {
1914 /*
1915 * set bits in the allocation bitmap according to which
1916 * pages were requested. size is in bytes.
1917 */
1918 i = atop_32(newoff);
1919 while ((size > 0) && (i < VSCLSIZE(vs))) {
1920 VSM_SETALLOC(*vsmap, i);
1921 i++;
1922 size -= vm_page_size;
1923 }
1924 }
1925 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1926 if (newoff) {
1927 /*
1928 * Offset is not cluster aligned, so number of pages
1929 * and bitmaps must be adjusted
1930 */
1931 clmap->cl_numpages -= atop_32(newoff);
1932 CLMAP_SHIFT(clmap, vs);
1933 CLMAP_SHIFTALLOC(clmap, vs);
1934 }
1935
1936 /*
1937 *
1938 * The setting of valid bits and handling of write errors
1939 * must be done here, while we hold the lock on the map.
1940 * It logically should be done in ps_vs_write_complete().
1941 * The size and error information has been passed from
1942 * ps_vs_write_complete(). If the size parameter is non-zero,
1943 * then there is work to be done. If error is also non-zero,
1944 * then the error number is recorded in the cluster and the
1945 * entire cluster is in error.
1946 */
1947 if (size && flag == CL_FIND) {
1948 vm_offset_t off = (vm_offset_t) 0;
1949
1950 if (!error) {
1951 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1952 i++) {
1953 VSM_SETPG(*vsmap, i);
1954 size -= vm_page_size;
1955 }
1956 ASSERT(i <= VSCLSIZE(vs));
1957 } else {
1958 BS_STAT(clmap->cl_ps->ps_bs,
1959 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1960 atop_32(size));
1961 off = VSM_CLOFF(*vsmap);
1962 VSM_SETERR(*vsmap, error);
1963 }
1964 /*
1965 * Deallocate cluster if error, and no valid pages
1966 * already present.
1967 */
1968 if (off != (vm_offset_t) 0)
1969 ps_deallocate_cluster(clmap->cl_ps, off);
1970 VS_MAP_UNLOCK(vs);
1971 return (vm_offset_t) 0;
1972 } else
1973 VS_MAP_UNLOCK(vs);
1974
1975 DP_DEBUG(DEBUG_VS_INTERNAL,
1976 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1977 newcl+newoff, (int) vs, (int) vsmap, flag));
1978 DP_DEBUG(DEBUG_VS_INTERNAL,
1979 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1980 (int) clmap->cl_ps, clmap->cl_numpages,
1981 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1982
1983 return (newcl + newoff);
1984 }
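/*
 * Worked example of the offset math in ps_clmap (assuming 4 KB pages,
 * vm_page_shift == 12, and vs_clshift == 2): offset 0x15000 is page
 * 0x15, so cluster == 0x15 >> 2 == 5; newoff == 0x15000 & ((1 << 14) - 1)
 * == 0x1000, i.e. the second page within that 16 KB cluster.
 */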
1985
1986 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1987
1988 void
1989 ps_clunmap(
1990 vstruct_t vs,
1991 vm_offset_t offset,
1992 vm_size_t length)
1993 {
1994 vm_offset_t cluster; /* The cluster number of offset */
1995 struct vs_map *vsmap;
1996
1997 VS_MAP_LOCK(vs);
1998
1999 /*
2000 * Loop through all clusters in this range, freeing paging segment
2001 * clusters and map entries as encountered.
2002 */
2003 while (length > 0) {
2004 vm_offset_t newoff;
2005 unsigned int i;
2006
2007 cluster = atop_32(offset) >> vs->vs_clshift;
2008 if (vs->vs_indirect) /* indirect map */
2009 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2010 else
2011 vsmap = vs->vs_dmap;
2012 if (vsmap == NULL) {
2013 VS_MAP_UNLOCK(vs);
2014 return;
2015 }
2016 vsmap += cluster%CLMAP_ENTRIES;
2017 if (VSM_ISCLR(*vsmap)) {
2018 length -= vm_page_size;
2019 offset += vm_page_size;
2020 continue;
2021 }
2022 /*
2023 * We've got a valid mapping. Clear it and deallocate
2024 * paging segment cluster pages.
2025 * Optimize for clearing an entire cluster.
2026 */
2027 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2028 /*
2029 * Not cluster aligned.
2030 */
2031 ASSERT(trunc_page(newoff) == newoff);
2032 i = atop_32(newoff);
2033 } else
2034 i = 0;
2035 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2036 VSM_CLRPG(*vsmap, i);
2037 VSM_CLRALLOC(*vsmap, i);
2038 length -= vm_page_size;
2039 offset += vm_page_size;
2040 i++;
2041 }
2042
2043 /*
2044 * If map entry is empty, clear and deallocate cluster.
2045 */
2046 if (!VSM_ALLOC(*vsmap)) {
2047 ps_deallocate_cluster(VSM_PS(*vsmap),
2048 VSM_CLOFF(*vsmap));
2049 VSM_CLR(*vsmap);
2050 }
2051 }
2052
2053 VS_MAP_UNLOCK(vs);
2054 }
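#if 0
/*
 * Illustrative sketch only (not part of the original source): how the
 * cluster read/write paths typically pair ps_clmap() with ps_clunmap().
 * The vstruct and the offsets used here are hypothetical.
 */
static void
ps_clmap_usage_sketch(
	vstruct_t vs)
{
	struct clmap clmap;
	vm_offset_t pseg_offset;

	/* look up (CL_FIND, no allocation) the paging segment offset
	   backing the first cluster of this object */
	pseg_offset = ps_clmap(vs, (vm_offset_t) 0, &clmap, CL_FIND, 0, 0);
	if (pseg_offset != (vm_offset_t) -1) {
		/* ... issue the read against CLMAP_PS(clmap) at pseg_offset ... */
	}

	/* release one cluster's worth of backing store when done with it */
	ps_clunmap(vs, (vm_offset_t) 0, vm_page_size << vs->vs_clshift);
}
#endif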
2055
2056 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2057
2058 void
2059 ps_vs_write_complete(
2060 vstruct_t vs,
2061 vm_offset_t offset,
2062 vm_size_t size,
2063 int error)
2064 {
2065 struct clmap clmap;
2066
2067 /*
2068 * Get the struct vsmap for this cluster.
2069 * Use CL_FIND (the read/lookup mode), even though the cluster
2070 * was just written, because the cluster MUST already be present,
2071 * unless there was an error in the original ps_clmap (e.g. no
2072 * space), in which case nothing happens.
2073 *
2074 * Must pass enough information to ps_clmap to allow it
2075 * to set the vs_map structure bitmap under lock.
2076 */
2077 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2078 }
2079
2080 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2081
2082 void
2083 vs_cl_write_complete(
2084 vstruct_t vs,
2085 __unused paging_segment_t ps,
2086 vm_offset_t offset,
2087 __unused vm_offset_t addr,
2088 vm_size_t size,
2089 boolean_t async,
2090 int error)
2091 {
2092 // kern_return_t kr;
2093
2094 if (error) {
2095 /*
2096 * For internal objects, the error is recorded on a
2097 * per-cluster basis by ps_clmap() which is called
2098 * by ps_vs_write_complete() below.
2099 */
2100 dprintf(("write failed error = 0x%x\n", error));
2101 /* add upl_abort code here */
2102 } else
2103 GSTAT(global_stats.gs_pages_out += atop_32(size));
2104 /*
2105 * Notify the vstruct mapping code, so it can do its accounting.
2106 */
2107 ps_vs_write_complete(vs, offset, size, error);
2108
2109 if (async) {
2110 VS_LOCK(vs);
2111 ASSERT(vs->vs_async_pending > 0);
2112 vs->vs_async_pending -= size;
2113 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2114 vs->vs_waiting_async = FALSE;
2115 VS_UNLOCK(vs);
2116 /* mutex_unlock(&vs->vs_waiting_async); */
2117 thread_wakeup(&vs->vs_async_pending);
2118 } else {
2119 VS_UNLOCK(vs);
2120 }
2121 }
2122 }
2123
2124 #ifdef DEVICE_PAGING
2125 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2126
2127 kern_return_t
2128 device_write_reply(
2129 MACH_PORT_FACE reply_port,
2130 kern_return_t device_code,
2131 io_buf_len_t bytes_written)
2132 {
2133 struct vs_async *vsa;
2134
2135 vsa = (struct vs_async *)
2136 ((struct vstruct_alias *)(reply_port->alias))->vs;
2137
2138 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2139 device_code = KERN_FAILURE;
2140 }
2141
2142 vsa->vsa_error = device_code;
2143
2144
2145 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2146 if(vsa->vsa_flags & VSA_TRANSFER) {
2147 /* revisit when async disk segments redone */
2148 if(vsa->vsa_error) {
2149 /* need to consider error condition. re-write data or */
2150 /* throw it away here. */
2151 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2152 }
2153 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2154 vsa->vsa_size, vsa->vsa_error);
2155 } else {
2156 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2157 vsa->vsa_addr, vsa->vsa_size, TRUE,
2158 vsa->vsa_error);
2159 }
2160 VS_FREE_ASYNC(vsa);
2161
2162 return KERN_SUCCESS;
2163 }
2164
2165 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2166 kern_return_t
2167 device_write_reply_inband(
2168 MACH_PORT_FACE reply_port,
2169 kern_return_t return_code,
2170 io_buf_len_t bytes_written)
2171 {
2172 panic("device_write_reply_inband: illegal");
2173 return KERN_SUCCESS;
2174 }
2175
2176 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2177 kern_return_t
2178 device_read_reply(
2179 MACH_PORT_FACE reply_port,
2180 kern_return_t return_code,
2181 io_buf_ptr_t data,
2182 mach_msg_type_number_t dataCnt)
2183 {
2184 struct vs_async *vsa;
2185 vsa = (struct vs_async *)
2186 ((struct vstruct_alias *)(reply_port->alias))->vs;
2187 vsa->vsa_addr = (vm_offset_t)data;
2188 vsa->vsa_size = (vm_size_t)dataCnt;
2189 vsa->vsa_error = return_code;
2190 thread_wakeup(&vsa->vsa_lock);
2191 return KERN_SUCCESS;
2192 }
2193
2194 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2195 kern_return_t
2196 device_read_reply_inband(
2197 MACH_PORT_FACE reply_port,
2198 kern_return_t return_code,
2199 io_buf_ptr_inband_t data,
2200 mach_msg_type_number_t dataCnt)
2201 {
2202 panic("device_read_reply_inband: illegal");
2203 return KERN_SUCCESS;
2204 }
2205
2206 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2207 kern_return_t
2208 device_read_reply_overwrite(
2209 MACH_PORT_FACE reply_port,
2210 kern_return_t return_code,
2211 io_buf_len_t bytes_read)
2212 {
2213 panic("device_read_reply_overwrite: illegal\n");
2214 return KERN_SUCCESS;
2215 }
2216
2217 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2218 kern_return_t
2219 device_open_reply(
2220 MACH_PORT_FACE reply_port,
2221 kern_return_t return_code,
2222 MACH_PORT_FACE device_port)
2223 {
2224 panic("device_open_reply: illegal\n");
2225 return KERN_SUCCESS;
2226 }
2227
2228 kern_return_t
2229 ps_read_device(
2230 paging_segment_t ps,
2231 vm_offset_t offset,
2232 vm_offset_t *bufferp,
2233 unsigned int size,
2234 unsigned int *residualp,
2235 int flags)
2236 {
2237 kern_return_t kr;
2238 recnum_t dev_offset;
2239 unsigned int bytes_wanted;
2240 unsigned int bytes_read;
2241 unsigned int total_read;
2242 vm_offset_t dev_buffer;
2243 vm_offset_t buf_ptr;
2244 unsigned int records_read;
2245 struct vs_async *vsa;
2246 mutex_t vs_waiting_read_reply;
2247
2248 device_t device;
2249 vm_map_copy_t device_data = NULL;
2250 default_pager_thread_t *dpt = NULL;
2251
2252 device = dev_port_lookup(ps->ps_device);
2253 clustered_reads[atop_32(size)]++;
2254
2255 dev_offset = (ps->ps_offset +
2256 (offset >> (vm_page_shift - ps->ps_record_shift)));
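/*
 * Illustrative arithmetic (hypothetical values, not from the original
 * source): with 4K pages (vm_page_shift == 12) and 512-byte device
 * records (ps_record_shift == 3), the shift above divides the byte
 * offset by 512, so a byte offset of 0x6000 (24KB) into the segment
 * becomes record 48 and dev_offset is ps_offset + 48.
 */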
2257 bytes_wanted = size;
2258 total_read = 0;
2259 *bufferp = (vm_offset_t)NULL;
2260
2261 do {
2262 vsa = VS_ALLOC_ASYNC();
2263 if (vsa) {
2264 vsa->vsa_vs = NULL;
2265 vsa->vsa_addr = 0;
2266 vsa->vsa_offset = 0;
2267 vsa->vsa_size = 0;
2268 vsa->vsa_ps = NULL;
2269 }
2270 mutex_init(&vsa->vsa_lock, 0);
2271 ip_lock(vsa->reply_port);
2272 vsa->reply_port->ip_sorights++;
2273 ip_reference(vsa->reply_port);
2274 ip_unlock(vsa->reply_port);
2275 kr = ds_device_read_common(device,
2276 vsa->reply_port,
2277 (mach_msg_type_name_t)
2278 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2279 (dev_mode_t) 0,
2280 dev_offset,
2281 bytes_wanted,
2282 (IO_READ | IO_CALL),
2283 (io_buf_ptr_t *) &dev_buffer,
2284 (mach_msg_type_number_t *) &bytes_read);
2285 if(kr == MIG_NO_REPLY) {
2286 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2287 thread_block(THREAD_CONTINUE_NULL);
2288
2289 dev_buffer = vsa->vsa_addr;
2290 bytes_read = (unsigned int)vsa->vsa_size;
2291 kr = vsa->vsa_error;
2292 }
2293 VS_FREE_ASYNC(vsa);
2294 if (kr != KERN_SUCCESS || bytes_read == 0) {
2295 break;
2296 }
2297 total_read += bytes_read;
2298
2299 /*
2300 * If we got the entire range, use the returned dev_buffer.
2301 */
2302 if (bytes_read == size) {
2303 *bufferp = (vm_offset_t)dev_buffer;
2304 break;
2305 }
2306
2307 #if 1
2308 dprintf(("read only %d bytes out of %d\n",
2309 bytes_read, bytes_wanted));
2310 #endif
2311 if(dpt == NULL) {
2312 dpt = get_read_buffer();
2313 buf_ptr = dpt->dpt_buffer;
2314 *bufferp = (vm_offset_t)buf_ptr;
2315 }
2316 /*
2317 * Otherwise, copy the data into the provided buffer (*bufferp)
2318 * and append the rest of the range as it comes in.
2319 */
2320 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2321 buf_ptr += bytes_read;
2322 bytes_wanted -= bytes_read;
2323 records_read = (bytes_read >>
2324 (vm_page_shift - ps->ps_record_shift));
2325 dev_offset += records_read;
2326 DP_DEBUG(DEBUG_VS_INTERNAL,
2327 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2328 dev_buffer, bytes_read));
2329 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2330 != KERN_SUCCESS)
2331 Panic("dealloc buf");
2332 } while (bytes_wanted);
2333
2334 *residualp = size - total_read;
2335 if((dev_buffer != *bufferp) && (total_read != 0)) {
2336 vm_offset_t temp_buffer;
2337 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2338 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2339 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2340 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2341 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2342 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2343 (vm_map_copy_t *)&device_data, FALSE))
2344 panic("ps_read_device: cannot copyin locally provided buffer\n");
2345 }
2346 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2347 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2348 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2349 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2350 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2351 (vm_map_copy_t *)&device_data, FALSE))
2352 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2353 }
2354 else {
2355 device_data = NULL;
2356 }
2357 *bufferp = (vm_offset_t)device_data;
2358
2359 if(dpt != NULL) {
2360 /* Free the receive buffer */
2361 dpt->checked_out = 0;
2362 thread_wakeup(&dpt_array);
2363 }
2364 return KERN_SUCCESS;
2365 }
2366
2367 kern_return_t
2368 ps_write_device(
2369 paging_segment_t ps,
2370 vm_offset_t offset,
2371 vm_offset_t addr,
2372 unsigned int size,
2373 struct vs_async *vsa)
2374 {
2375 recnum_t dev_offset;
2376 io_buf_len_t bytes_to_write, bytes_written;
2377 recnum_t records_written;
2378 kern_return_t kr;
2379 MACH_PORT_FACE reply_port;
2380
2381
2382
2383 clustered_writes[atop_32(size)]++;
2384
2385 dev_offset = (ps->ps_offset +
2386 (offset >> (vm_page_shift - ps->ps_record_shift)));
2387 bytes_to_write = size;
2388
2389 if (vsa) {
2390 /*
2391 * Asynchronous write.
2392 */
2393 reply_port = vsa->reply_port;
2394 ip_lock(reply_port);
2395 reply_port->ip_sorights++;
2396 ip_reference(reply_port);
2397 ip_unlock(reply_port);
2398 {
2399 device_t device;
2400 device = dev_port_lookup(ps->ps_device);
2401
2402 vsa->vsa_addr = addr;
2403 kr=ds_device_write_common(device,
2404 reply_port,
2405 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2406 (dev_mode_t) 0,
2407 dev_offset,
2408 (io_buf_ptr_t) addr,
2409 size,
2410 (IO_WRITE | IO_CALL),
2411 &bytes_written);
2412 }
2413 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2414 if (verbose)
2415 dprintf(("%s0x%x, addr=0x%x,"
2416 "size=0x%x,offset=0x%x\n",
2417 "device_write_request returned ",
2418 kr, addr, size, offset));
2419 BS_STAT(ps->ps_bs,
2420 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2421 /* do the completion notification to free resources */
2422 device_write_reply(reply_port, kr, 0);
2423 return PAGER_ERROR;
2424 }
2425 } else do {
2426 /*
2427 * Synchronous write.
2428 */
2429 {
2430 device_t device;
2431 device = dev_port_lookup(ps->ps_device);
2432 kr=ds_device_write_common(device,
2433 IP_NULL, 0,
2434 (dev_mode_t) 0,
2435 dev_offset,
2436 (io_buf_ptr_t) addr,
2437 size,
2438 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2439 &bytes_written);
2440 }
2441 if (kr != KERN_SUCCESS) {
2442 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2443 "device_write returned ",
2444 kr, addr, size, offset));
2445 BS_STAT(ps->ps_bs,
2446 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2447 return PAGER_ERROR;
2448 }
2449 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2450 Panic("fragmented write");
2451 records_written = (bytes_written >>
2452 (vm_page_shift - ps->ps_record_shift));
2453 dev_offset += records_written;
2454 #if 1
2455 if (bytes_written != bytes_to_write) {
2456 dprintf(("wrote only %d bytes out of %d\n",
2457 bytes_written, bytes_to_write));
2458 }
2459 #endif
2460 bytes_to_write -= bytes_written;
2461 addr += bytes_written;
2462 } while (bytes_to_write > 0);
2463
2464 return PAGER_SUCCESS;
2465 }
2466
2467
2468 #else /* !DEVICE_PAGING */
2469
2470 kern_return_t
2471 ps_read_device(
2472 __unused paging_segment_t ps,
2473 __unused vm_offset_t offset,
2474 __unused vm_offset_t *bufferp,
2475 __unused unsigned int size,
2476 __unused unsigned int *residualp,
2477 __unused int flags)
2478 {
2479 panic("ps_read_device not supported");
2480 }
2481
2482 kern_return_t
2483 ps_write_device(
2484 __unused paging_segment_t ps,
2485 __unused vm_offset_t offset,
2486 __unused vm_offset_t addr,
2487 __unused unsigned int size,
2488 __unused struct vs_async *vsa)
2489 {
2490 panic("ps_write_device not supported");
2491 }
2492
2493 #endif /* DEVICE_PAGING */
2494 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2495
2496 void
2497 pvs_object_data_provided(
2498 __unused vstruct_t vs,
2499 __unused upl_t upl,
2500 __unused upl_offset_t offset,
2501 upl_size_t size)
2502 {
2503
2504 DP_DEBUG(DEBUG_VS_INTERNAL,
2505 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2506 upl, offset, size));
2507
2508 ASSERT(size > 0);
2509 GSTAT(global_stats.gs_pages_in += atop_32(size));
2510
2511
2512 #if USE_PRECIOUS
2513 ps_clunmap(vs, offset, size);
2514 #endif /* USE_PRECIOUS */
2515
2516 }
2517
2518 kern_return_t
2519 pvs_cluster_read(
2520 vstruct_t vs,
2521 vm_offset_t vs_offset,
2522 vm_size_t cnt)
2523 {
2524 upl_t upl;
2525 kern_return_t error = KERN_SUCCESS;
2526 int size;
2527 int residual;
2528 unsigned int request_flags;
2529 int seg_index;
2530 int pages_in_cl;
2531 int cl_size;
2532 int cl_mask;
2533 int cl_index;
2534 int xfer_size;
2535 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2536 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2537 struct clmap clmap;
2538
2539 pages_in_cl = 1 << vs->vs_clshift;
2540 cl_size = pages_in_cl * vm_page_size;
2541 cl_mask = cl_size - 1;
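/*
 * For illustration (hypothetical values): with 4K pages and
 * vs_clshift == 2, pages_in_cl == 4, cl_size == 0x4000 (16K) and
 * cl_mask == 0x3FFF, so "vs_offset & cl_mask" below is the byte
 * offset within the current 16K cluster.
 */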
2542
2543 /*
2544 * This loop will be executed multiple times until the entire
2545 * request has been satisfied... if the request spans cluster
2546 * boundaries, the clusters will be checked for logical continuity;
2547 * if contiguous, the I/O request will span multiple clusters, otherwise
2548 * it will be broken up into the minimal set of I/Os.
2549 *
2550 * If there are holes in a request (either unallocated pages in a paging
2551 * segment or an unallocated paging segment), we stop
2552 * reading at the hole, inform the VM of any data read, inform
2553 * the VM of an unavailable range, then loop again, hoping to
2554 * find valid pages later in the requested range. This continues until
2555 * the entire range has been examined, and read, if present.
2556 */
2557
2558 #if USE_PRECIOUS
2559 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2560 #else
2561 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2562 #endif
2563
2564 assert(dp_encryption_inited);
2565 if (dp_encryption) {
2566 /*
2567 * ENCRYPTED SWAP:
2568 * request that the UPL be prepared for
2569 * decryption.
2570 */
2571 request_flags |= UPL_ENCRYPT;
2572 }
2573
2574 while (cnt && (error == KERN_SUCCESS)) {
2575 int ps_info_valid;
2576 int page_list_count;
2577
2578 if((vs_offset & cl_mask) &&
2579 (cnt > (VM_SUPER_CLUSTER -
2580 (vs_offset & cl_mask)))) {
2581 size = VM_SUPER_CLUSTER;
2582 size -= vs_offset & cl_mask;
2583 } else if (cnt > VM_SUPER_CLUSTER) {
2584 size = VM_SUPER_CLUSTER;
2585 } else {
2586 size = cnt;
2587 }
2588 cnt -= size;
2589
2590 ps_info_valid = 0;
2591 seg_index = 0;
2592
2593 while (size > 0 && error == KERN_SUCCESS) {
2594 int abort_size;
2595 int failed_size;
2596 int beg_pseg;
2597 int beg_indx;
2598 vm_offset_t cur_offset;
2599
2600
2601 if ( !ps_info_valid) {
2602 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2603 psp[seg_index] = CLMAP_PS(clmap);
2604 ps_info_valid = 1;
2605 }
2606 /*
2607 * skip over unallocated physical segments
2608 */
2609 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2610 abort_size = cl_size - (vs_offset & cl_mask);
2611 abort_size = MIN(abort_size, size);
2612
2613 page_list_count = 0;
2614 memory_object_super_upl_request(
2615 vs->vs_control,
2616 (memory_object_offset_t)vs_offset,
2617 abort_size, abort_size,
2618 &upl, NULL, &page_list_count,
2619 request_flags);
2620
2621 if (clmap.cl_error) {
2622 upl_abort(upl, UPL_ABORT_ERROR);
2623 } else {
2624 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2625 }
2626 upl_deallocate(upl);
2627
2628 size -= abort_size;
2629 vs_offset += abort_size;
2630
2631 seg_index++;
2632 ps_info_valid = 0;
2633 continue;
2634 }
2635 cl_index = (vs_offset & cl_mask) / vm_page_size;
2636
2637 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2638 /*
2639 * skip over unallocated pages
2640 */
2641 if (CLMAP_ISSET(clmap, cl_index))
2642 break;
2643 abort_size += vm_page_size;
2644 }
2645 if (abort_size) {
2646 /*
2647 * Let VM system know about holes in clusters.
2648 */
2649 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2650
2651 page_list_count = 0;
2652 memory_object_super_upl_request(
2653 vs->vs_control,
2654 (memory_object_offset_t)vs_offset,
2655 abort_size, abort_size,
2656 &upl, NULL, &page_list_count,
2657 request_flags);
2658
2659 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2660 upl_deallocate(upl);
2661
2662 size -= abort_size;
2663 vs_offset += abort_size;
2664
2665 if (cl_index == pages_in_cl) {
2666 /*
2667 * if we're at the end of this physical cluster
2668 * then bump to the next one and continue looking
2669 */
2670 seg_index++;
2671 ps_info_valid = 0;
2672 continue;
2673 }
2674 if (size == 0)
2675 break;
2676 }
2677 /*
2678 * remember the starting point of the first allocated page
2679 * for the I/O we're about to issue
2680 */
2681 beg_pseg = seg_index;
2682 beg_indx = cl_index;
2683 cur_offset = vs_offset;
2684
2685 /*
2686 * calculate the size of the I/O that we can do...
2687 * this may span multiple physical segments if
2688 * they are contiguous
2689 */
2690 for (xfer_size = 0; xfer_size < size; ) {
2691
2692 while (cl_index < pages_in_cl
2693 && xfer_size < size) {
2694 /*
2695 * accumulate allocated pages within
2696 * a physical segment
2697 */
2698 if (CLMAP_ISSET(clmap, cl_index)) {
2699 xfer_size += vm_page_size;
2700 cur_offset += vm_page_size;
2701 cl_index++;
2702
2703 BS_STAT(psp[seg_index]->ps_bs,
2704 psp[seg_index]->ps_bs->bs_pages_in++);
2705 } else
2706 break;
2707 }
2708 if (cl_index < pages_in_cl
2709 || xfer_size >= size) {
2710 /*
2711 * we've hit an unallocated page or
2712 * the end of this request... go fire
2713 * the I/O
2714 */
2715 break;
2716 }
2717 /*
2718 * we've hit the end of the current physical
2719 * segment and there's more to do, so try
2720 * moving to the next one
2721 */
2722 seg_index++;
2723
2724 ps_offset[seg_index] =
2725 ps_clmap(vs,
2726 cur_offset & ~cl_mask,
2727 &clmap, CL_FIND, 0, 0);
2728 psp[seg_index] = CLMAP_PS(clmap);
2729 ps_info_valid = 1;
2730
2731 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2732 /*
2733 * if the physical segment we're about
2734 * to step into is not contiguous to
2735 * the one we're currently in, or it's
2736 * in a different paging file, or
2737 * it hasn't been allocated....
2738 * we stop here and generate the I/O
2739 */
2740 break;
2741 }
2742 /*
2743 * start with first page of the next physical
2744 * segment
2745 */
2746 cl_index = 0;
2747 }
2748 if (xfer_size) {
2749 /*
2750 * we have a contiguous range of allocated pages
2751 * to read from
2752 */
2753 page_list_count = 0;
2754 memory_object_super_upl_request(vs->vs_control,
2755 (memory_object_offset_t)vs_offset,
2756 xfer_size, xfer_size,
2757 &upl, NULL, &page_list_count,
2758 request_flags | UPL_SET_INTERNAL);
2759
2760 error = ps_read_file(psp[beg_pseg],
2761 upl, (upl_offset_t) 0,
2762 ps_offset[beg_pseg] +
2763 (beg_indx * vm_page_size),
2764 xfer_size, &residual, 0);
2765 } else
2766 continue;
2767
2768 failed_size = 0;
2769
2770 /*
2771 * Adjust counts and send response to VM. Optimize
2772 * for the common case, i.e. no error and no partial
2773 * data. If there was an error, then we need to error
2774 * the entire range, even if some data was successfully
2775 * read. If there was a partial read we may supply some
2776 * data and may error some as well. In all cases the
2777 * VM must receive some notification for every page
2778 * in the range.
2779 */
2780 if ((error == KERN_SUCCESS) && (residual == 0)) {
2781 /*
2782 * Got everything we asked for, supply the data
2783 * to the VM. Note that as a side effect of
2784 * supplying the data, the buffer holding the
2785 * supplied data is deallocated from the pager's
2786 * address space.
2787 */
2788 pvs_object_data_provided(
2789 vs, upl, vs_offset, xfer_size);
2790 } else {
2791 failed_size = xfer_size;
2792
2793 if (error == KERN_SUCCESS) {
2794 if (residual == xfer_size) {
2795 /*
2796 * If a read operation returns no error
2797 * and no data moved, we turn it into
2798 * an error, assuming we're reading at
2799 * or beyond EOF.
2800 * Fall through and error the entire
2801 * range.
2802 */
2803 error = KERN_FAILURE;
2804 } else {
2805 /*
2806 * Otherwise, we have a partial read. If
2807 * the part read is an integral number
2808 * of pages, supply it. Otherwise round
2809 * it up to a page boundary, zero fill
2810 * the unread part, and supply it.
2811 * Fall through and error the remainder
2812 * of the range, if any.
2813 */
2814 int fill, lsize;
2815
2816 fill = residual
2817 & ~vm_page_size;
2818 lsize = (xfer_size - residual)
2819 + fill;
2820 pvs_object_data_provided(
2821 vs, upl,
2822 vs_offset, lsize);
2823
2824 if (lsize < xfer_size) {
2825 failed_size =
2826 xfer_size - lsize;
2827 error = KERN_FAILURE;
2828 }
2829 }
2830 }
2831 }
2832 /*
2833 * If there was an error in any part of the range, tell
2834 * the VM. Note that error is explicitly checked again
2835 * since it can be modified above.
2836 */
2837 if (error != KERN_SUCCESS) {
2838 BS_STAT(psp[beg_pseg]->ps_bs,
2839 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2840 += atop_32(failed_size));
2841 }
2842 size -= xfer_size;
2843 vs_offset += xfer_size;
2844 }
2845
2846 } /* END while (cnt && (error == 0)) */
2847 return error;
2848 }
2849
2850 int vs_do_async_write = 1;
2851
2852 kern_return_t
2853 vs_cluster_write(
2854 vstruct_t vs,
2855 upl_t internal_upl,
2856 upl_offset_t offset,
2857 upl_size_t cnt,
2858 boolean_t dp_internal,
2859 int flags)
2860 {
2861 upl_size_t transfer_size;
2862 int error = 0;
2863 struct clmap clmap;
2864
2865 vm_offset_t actual_offset; /* Offset within paging segment */
2866 paging_segment_t ps;
2867 vm_offset_t mobj_base_addr;
2868 vm_offset_t mobj_target_addr;
2869
2870 upl_t upl;
2871 upl_page_info_t *pl;
2872 int page_index;
2873 int list_size;
2874 int pages_in_cl;
2875 unsigned int cl_size;
2876 int base_index;
2877 unsigned int seg_size;
2878
2879 pages_in_cl = 1 << vs->vs_clshift;
2880 cl_size = pages_in_cl * vm_page_size;
2881
2882 if (!dp_internal) {
2883 int page_list_count;
2884 int request_flags;
2885 unsigned int super_size;
2886 int first_dirty;
2887 int num_dirty;
2888 int num_of_pages;
2889 int seg_index;
2890 upl_offset_t upl_offset;
2891 vm_offset_t seg_offset;
2892 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2893 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2894
2895
2896 if (bs_low) {
2897 super_size = cl_size;
2898
2899 request_flags = UPL_NOBLOCK |
2900 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2901 UPL_NO_SYNC | UPL_SET_INTERNAL;
2902 } else {
2903 super_size = VM_SUPER_CLUSTER;
2904
2905 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2906 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2907 UPL_NO_SYNC | UPL_SET_INTERNAL;
2908 }
2909
2910 if (!dp_encryption_inited) {
2911 /*
2912 * ENCRYPTED SWAP:
2913 * Once we've started using swap, we
2914 * can't change our mind on whether
2915 * it needs to be encrypted or
2916 * not.
2917 */
2918 dp_encryption_inited = TRUE;
2919 }
2920 if (dp_encryption) {
2921 /*
2922 * ENCRYPTED SWAP:
2923 * request that the UPL be prepared for
2924 * encryption.
2925 */
2926 request_flags |= UPL_ENCRYPT;
2927 flags |= UPL_PAGING_ENCRYPTED;
2928 }
2929
2930 page_list_count = 0;
2931 memory_object_super_upl_request(vs->vs_control,
2932 (memory_object_offset_t)offset,
2933 cnt, super_size,
2934 &upl, NULL, &page_list_count,
2935 request_flags | UPL_FOR_PAGEOUT);
2936
2937 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2938
2939 seg_size = cl_size - (upl->offset % cl_size);
2940 upl_offset = upl->offset & ~(cl_size - 1);
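/*
 * Worked example with hypothetical values (assuming 16K clusters,
 * i.e. cl_size == 0x4000): if upl->offset == 0x1A000, then
 * upl->offset % cl_size == 0x2000, so seg_size == 0x2000 (the bytes
 * remaining in the first cluster) and upl_offset is rounded down to
 * the cluster boundary 0x18000 for the ps_clmap() lookups below.
 */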
2941
2942 for (seg_index = 0, transfer_size = upl->size;
2943 transfer_size > 0; ) {
2944 ps_offset[seg_index] =
2945 ps_clmap(vs,
2946 upl_offset,
2947 &clmap, CL_ALLOC,
2948 cl_size, 0);
2949
2950 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2951 upl_abort(upl, 0);
2952 upl_deallocate(upl);
2953
2954 return KERN_FAILURE;
2955
2956 }
2957 psp[seg_index] = CLMAP_PS(clmap);
2958
2959 if (transfer_size > seg_size) {
2960 transfer_size -= seg_size;
2961 upl_offset += cl_size;
2962 seg_size = cl_size;
2963 seg_index++;
2964 } else
2965 transfer_size = 0;
2966 }
2967 /*
2968 * Ignore any non-present pages at the end of the
2969 * UPL.
2970 */
2971 for (page_index = upl->size / vm_page_size; page_index > 0;)
2972 if (UPL_PAGE_PRESENT(pl, --page_index))
2973 break;
2974 num_of_pages = page_index + 1;
2975
2976 base_index = (upl->offset % cl_size) / PAGE_SIZE;
2977
2978 for (page_index = 0; page_index < num_of_pages; ) {
2979 /*
2980 * skip over non-dirty pages
2981 */
2982 for ( ; page_index < num_of_pages; page_index++) {
2983 if (UPL_DIRTY_PAGE(pl, page_index)
2984 || UPL_PRECIOUS_PAGE(pl, page_index))
2985 /*
2986 * this is a page we need to write
2987 * go see if we can buddy it up with
2988 * others that are contiguous to it
2989 */
2990 break;
2991 /*
2992 * if the page is not dirty, but present, we
2993 * need to commit it... This is an unusual
2994 * case since we only asked for dirty pages
2995 */
2996 if (UPL_PAGE_PRESENT(pl, page_index)) {
2997 boolean_t empty = FALSE;
2998 upl_commit_range(upl,
2999 page_index * vm_page_size,
3000 vm_page_size,
3001 UPL_COMMIT_NOTIFY_EMPTY,
3002 pl,
3003 page_list_count,
3004 &empty);
3005 if (empty) {
3006 assert(page_index ==
3007 num_of_pages - 1);
3008 upl_deallocate(upl);
3009 }
3010 }
3011 }
3012 if (page_index == num_of_pages)
3013 /*
3014 * no more pages to look at, we're out of here
3015 */
3016 break;
3017
3018 /*
3019 * gather up contiguous dirty pages... we have at
3020 * least 1, otherwise we would have bailed above.
3021 * make sure that each physical segment that we step
3022 * into is contiguous to the one we're currently in;
3023 * if it's not, we have to stop and write what we have
3024 */
3025 for (first_dirty = page_index;
3026 page_index < num_of_pages; ) {
3027 if ( !UPL_DIRTY_PAGE(pl, page_index)
3028 && !UPL_PRECIOUS_PAGE(pl, page_index))
3029 break;
3030 page_index++;
3031 /*
3032 * if we just looked at the last page in the UPL
3033 * we don't need to check for physical segment
3034 * continuity
3035 */
3036 if (page_index < num_of_pages) {
3037 int cur_seg;
3038 int nxt_seg;
3039
3040 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3041 nxt_seg = (base_index + page_index)/pages_in_cl;
3042
3043 if (cur_seg != nxt_seg) {
3044 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3045 /*
3046 * if the segment we're about
3047 * to step into is not
3048 * contiguous to the one we're
3049 * currently in, or it's in a
3050 * different paging file....
3051 * we stop here and generate
3052 * the I/O
3053 */
3054 break;
3055 }
3056 }
3057 }
3058 num_dirty = page_index - first_dirty;
3059
3060 if (num_dirty) {
3061 upl_offset = first_dirty * vm_page_size;
3062 transfer_size = num_dirty * vm_page_size;
3063
3064 while (transfer_size) {
3065
3066 if ((seg_size = cl_size -
3067 ((upl->offset + upl_offset) % cl_size))
3068 > transfer_size)
3069 seg_size = transfer_size;
3070
3071 ps_vs_write_complete(vs,
3072 upl->offset + upl_offset,
3073 seg_size, error);
3074
3075 transfer_size -= seg_size;
3076 upl_offset += seg_size;
3077 }
3078 upl_offset = first_dirty * vm_page_size;
3079 transfer_size = num_dirty * vm_page_size;
3080
3081 seg_index = (base_index + first_dirty) / pages_in_cl;
3082 seg_offset = (upl->offset + upl_offset) % cl_size;
3083
3084 error = ps_write_file(psp[seg_index],
3085 upl, upl_offset,
3086 ps_offset[seg_index]
3087 + seg_offset,
3088 transfer_size, flags);
3089 } else {
3090 boolean_t empty = FALSE;
3091 upl_abort_range(upl,
3092 first_dirty * vm_page_size,
3093 num_dirty * vm_page_size,
3094 UPL_ABORT_NOTIFY_EMPTY,
3095 &empty);
3096 if (empty) {
3097 assert(page_index == num_of_pages);
3098 upl_deallocate(upl);
3099 }
3100 }
3101 }
3102
3103 } else {
3104 assert(cnt <= (vm_page_size << vs->vs_clshift));
3105 list_size = cnt;
3106
3107 page_index = 0;
3108 /* The caller provides a mapped_data which is derived */
3109 /* from a temporary object. The targeted pages are */
3110 /* guaranteed to be set at offset 0 in the mapped_data. */
3111 /* The actual offset, however, must still be derived */
3112 /* from the offset in the vs in question. */
3113 mobj_base_addr = offset;
3114 mobj_target_addr = mobj_base_addr;
3115
3116 for (transfer_size = list_size; transfer_size != 0;) {
3117 actual_offset = ps_clmap(vs, mobj_target_addr,
3118 &clmap, CL_ALLOC,
3119 transfer_size < cl_size ?
3120 transfer_size : cl_size, 0);
3121 if(actual_offset == (vm_offset_t) -1) {
3122 error = 1;
3123 break;
3124 }
3125 cnt = MIN(transfer_size,
3126 CLMAP_NPGS(clmap) * vm_page_size);
3127 ps = CLMAP_PS(clmap);
3128 /* Assume that the caller has given us contiguous */
3129 /* pages */
3130 if(cnt) {
3131 ps_vs_write_complete(vs, mobj_target_addr,
3132 cnt, error);
3133 error = ps_write_file(ps, internal_upl,
3134 0, actual_offset,
3135 cnt, flags);
3136 if (error)
3137 break;
3138 }
3139 if (error)
3140 break;
3141 actual_offset += cnt;
3142 mobj_target_addr += cnt;
3143 transfer_size -= cnt;
3144 cnt = 0;
3145
3146 if (error)
3147 break;
3148 }
3149 }
3150 if(error)
3151 return KERN_FAILURE;
3152 else
3153 return KERN_SUCCESS;
3154 }
3155
3156 vm_size_t
3157 ps_vstruct_allocated_size(
3158 vstruct_t vs)
3159 {
3160 int num_pages;
3161 struct vs_map *vsmap;
3162 unsigned int i, j, k;
3163
3164 num_pages = 0;
3165 if (vs->vs_indirect) {
3166 /* loop on indirect maps */
3167 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3168 vsmap = vs->vs_imap[i];
3169 if (vsmap == NULL)
3170 continue;
3171 /* loop on clusters in this indirect map */
3172 for (j = 0; j < CLMAP_ENTRIES; j++) {
3173 if (VSM_ISCLR(vsmap[j]) ||
3174 VSM_ISERR(vsmap[j]))
3175 continue;
3176 /* loop on pages in this cluster */
3177 for (k = 0; k < VSCLSIZE(vs); k++) {
3178 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3179 num_pages++;
3180 }
3181 }
3182 }
3183 } else {
3184 vsmap = vs->vs_dmap;
3185 if (vsmap == NULL)
3186 return 0;
3187 /* loop on clusters in the direct map */
3188 for (j = 0; j < CLMAP_ENTRIES; j++) {
3189 if (VSM_ISCLR(vsmap[j]) ||
3190 VSM_ISERR(vsmap[j]))
3191 continue;
3192 /* loop on pages in this cluster */
3193 for (k = 0; k < VSCLSIZE(vs); k++) {
3194 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3195 num_pages++;
3196 }
3197 }
3198 }
3199
3200 return ptoa_32(num_pages);
3201 }
3202
3203 size_t
3204 ps_vstruct_allocated_pages(
3205 vstruct_t vs,
3206 default_pager_page_t *pages,
3207 size_t pages_size)
3208 {
3209 unsigned int num_pages;
3210 struct vs_map *vsmap;
3211 vm_offset_t offset;
3212 unsigned int i, j, k;
3213
3214 num_pages = 0;
3215 offset = 0;
3216 if (vs->vs_indirect) {
3217 /* loop on indirect maps */
3218 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3219 vsmap = vs->vs_imap[i];
3220 if (vsmap == NULL) {
3221 offset += (vm_page_size * CLMAP_ENTRIES *
3222 VSCLSIZE(vs));
3223 continue;
3224 }
3225 /* loop on clusters in this indirect map */
3226 for (j = 0; j < CLMAP_ENTRIES; j++) {
3227 if (VSM_ISCLR(vsmap[j]) ||
3228 VSM_ISERR(vsmap[j])) {
3229 offset += vm_page_size * VSCLSIZE(vs);
3230 continue;
3231 }
3232 /* loop on pages in this cluster */
3233 for (k = 0; k < VSCLSIZE(vs); k++) {
3234 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3235 num_pages++;
3236 if (num_pages < pages_size)
3237 pages++->dpp_offset =
3238 offset;
3239 }
3240 offset += vm_page_size;
3241 }
3242 }
3243 }
3244 } else {
3245 vsmap = vs->vs_dmap;
3246 if (vsmap == NULL)
3247 return 0;
3248 /* loop on clusters in the direct map */
3249 for (j = 0; j < CLMAP_ENTRIES; j++) {
3250 if (VSM_ISCLR(vsmap[j]) ||
3251 VSM_ISERR(vsmap[j])) {
3252 offset += vm_page_size * VSCLSIZE(vs);
3253 continue;
3254 }
3255 /* loop on pages in this cluster */
3256 for (k = 0; k < VSCLSIZE(vs); k++) {
3257 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3258 num_pages++;
3259 if (num_pages < pages_size)
3260 pages++->dpp_offset = offset;
3261 }
3262 offset += vm_page_size;
3263 }
3264 }
3265 }
3266
3267 return num_pages;
3268 }
3269
3270
3271 kern_return_t
3272 ps_vstruct_transfer_from_segment(
3273 vstruct_t vs,
3274 paging_segment_t segment,
3275 upl_t upl)
3276 {
3277 struct vs_map *vsmap;
3278 // struct vs_map old_vsmap;
3279 // struct vs_map new_vsmap;
3280 unsigned int i, j;
3281
3282 VS_LOCK(vs); /* block all work on this vstruct */
3283 /* can't allow the normal multiple write */
3284 /* semantic because writes may conflict */
3285 vs->vs_xfer_pending = TRUE;
3286 vs_wait_for_sync_writers(vs);
3287 vs_start_write(vs);
3288 vs_wait_for_readers(vs);
3289 /* we will unlock the vs to allow other writes while transferring */
3290 /* and will be guaranteed of the persistence of the vs struct */
3291 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3292 /* vs_async_pending */
3293 /* OK we now have guaranteed no other parties are accessing this */
3294 /* vs. Now that we are also supporting simple lock versions of */
3295 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3296 /* our purpose in holding it before was the multiple write case */
3297 /* we now use the boolean xfer_pending to do that. We can use */
3298 /* a boolean instead of a count because we have guaranteed single */
3299 /* file access to this code in its caller */
3300 VS_UNLOCK(vs);
3301 vs_changed:
3302 if (vs->vs_indirect) {
3303 unsigned int vsmap_size;
3304 int clmap_off;
3305 /* loop on indirect maps */
3306 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3307 vsmap = vs->vs_imap[i];
3308 if (vsmap == NULL)
3309 continue;
3310 /* loop on clusters in this indirect map */
3311 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3312 VSCLSIZE(vs) * i);
3313 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3314 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3315 else
3316 vsmap_size = CLMAP_ENTRIES;
3317 for (j = 0; j < vsmap_size; j++) {
3318 if (VSM_ISCLR(vsmap[j]) ||
3319 VSM_ISERR(vsmap[j]) ||
3320 (VSM_PS(vsmap[j]) != segment))
3321 continue;
3322 if(vs_cluster_transfer(vs,
3323 (vm_page_size * (j << vs->vs_clshift))
3324 + clmap_off,
3325 vm_page_size << vs->vs_clshift,
3326 upl)
3327 != KERN_SUCCESS) {
3328 VS_LOCK(vs);
3329 vs->vs_xfer_pending = FALSE;
3330 VS_UNLOCK(vs);
3331 vs_finish_write(vs);
3332 return KERN_FAILURE;
3333 }
3334 /* allow other readers/writers during transfer*/
3335 VS_LOCK(vs);
3336 vs->vs_xfer_pending = FALSE;
3337 VS_UNLOCK(vs);
3338 vs_finish_write(vs);
3339 VS_LOCK(vs);
3340 vs->vs_xfer_pending = TRUE;
3341 vs_wait_for_sync_writers(vs);
3342 vs_start_write(vs);
3343 vs_wait_for_readers(vs);
3344 VS_UNLOCK(vs);
3345 if (!(vs->vs_indirect)) {
3346 goto vs_changed;
3347 }
3348 }
3349 }
3350 } else {
3351 vsmap = vs->vs_dmap;
3352 if (vsmap == NULL) {
3353 VS_LOCK(vs);
3354 vs->vs_xfer_pending = FALSE;
3355 VS_UNLOCK(vs);
3356 vs_finish_write(vs);
3357 return KERN_SUCCESS;
3358 }
3359 /* loop on clusters in the direct map */
3360 for (j = 0; j < vs->vs_size; j++) {
3361 if (VSM_ISCLR(vsmap[j]) ||
3362 VSM_ISERR(vsmap[j]) ||
3363 (VSM_PS(vsmap[j]) != segment))
3364 continue;
3365 if(vs_cluster_transfer(vs,
3366 vm_page_size * (j << vs->vs_clshift),
3367 vm_page_size << vs->vs_clshift,
3368 upl) != KERN_SUCCESS) {
3369 VS_LOCK(vs);
3370 vs->vs_xfer_pending = FALSE;
3371 VS_UNLOCK(vs);
3372 vs_finish_write(vs);
3373 return KERN_FAILURE;
3374 }
3375 /* allow other readers/writers during transfer*/
3376 VS_LOCK(vs);
3377 vs->vs_xfer_pending = FALSE;
3378 VS_UNLOCK(vs);
3379 vs_finish_write(vs);
3380 VS_LOCK(vs);
3381 vs->vs_xfer_pending = TRUE;
3382 VS_UNLOCK(vs);
3383 vs_wait_for_sync_writers(vs);
3384 vs_start_write(vs);
3385 vs_wait_for_readers(vs);
3386 if (vs->vs_indirect) {
3387 goto vs_changed;
3388 }
3389 }
3390 }
3391
3392 VS_LOCK(vs);
3393 vs->vs_xfer_pending = FALSE;
3394 VS_UNLOCK(vs);
3395 vs_finish_write(vs);
3396 return KERN_SUCCESS;
3397 }
3398
3399
3400
3401 vs_map_t
3402 vs_get_map_entry(
3403 vstruct_t vs,
3404 vm_offset_t offset)
3405 {
3406 struct vs_map *vsmap;
3407 vm_offset_t cluster;
3408
3409 cluster = atop_32(offset) >> vs->vs_clshift;
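/*
 * Example with hypothetical values: with 4K pages and vs_clshift == 2
 * (16K clusters), offset 0x28000 is page 40, which falls in cluster 10;
 * the entry is then vs_dmap[10] for a direct map, or entry
 * (10 % CLMAP_ENTRIES) of indirect block (10 / CLMAP_ENTRIES).
 */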
3410 if (vs->vs_indirect) {
3411 long ind_block = cluster/CLMAP_ENTRIES;
3412
3413 /* Is the indirect block allocated? */
3414 vsmap = vs->vs_imap[ind_block];
3415 if(vsmap == (vs_map_t) NULL)
3416 return vsmap;
3417 } else
3418 vsmap = vs->vs_dmap;
3419 vsmap += cluster%CLMAP_ENTRIES;
3420 return vsmap;
3421 }
3422
3423 kern_return_t
3424 vs_cluster_transfer(
3425 vstruct_t vs,
3426 vm_offset_t offset,
3427 vm_size_t cnt,
3428 upl_t upl)
3429 {
3430 vm_offset_t actual_offset;
3431 paging_segment_t ps;
3432 struct clmap clmap;
3433 kern_return_t error = KERN_SUCCESS;
3434 unsigned int size, size_wanted;
3435 int i;
3436 unsigned int residual;
3437 unsigned int unavail_size;
3438 // default_pager_thread_t *dpt;
3439 // boolean_t dealloc;
3440 struct vs_map *vsmap_ptr = NULL;
3441 struct vs_map read_vsmap;
3442 struct vs_map original_read_vsmap;
3443 struct vs_map write_vsmap;
3444 // upl_t sync_upl;
3445 // vm_offset_t ioaddr;
3446
3447 /* vs_cluster_transfer reads in the pages of a cluster and
3448 * then writes these pages back to new backing store. The
3449 * segment the pages are being read from is assumed to have
3450 * been taken off-line and is no longer considered for new
3451 * space requests.
3452 */
3453
3454 /*
3455 * This loop will be executed once per cluster referenced.
3456 * Typically this means once, since it's unlikely that the
3457 * VM system will ask for anything spanning cluster boundaries.
3458 *
3459 * If there are holes in a cluster (in a paging segment), we stop
3460 * reading at the hole, then loop again, hoping to
3461 * find valid pages later in the cluster. This continues until
3462 * the entire range has been examined, and read, if present. The
3463 * pages are written as they are read. If a failure occurs after
3464 * some pages are written the unmap call at the bottom of the loop
3465 * recovers the backing store and the old backing store remains
3466 * in effect.
3467 */
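/*
 * Rough sketch of one iteration, using hypothetical values: for a
 * 4-page (16K) cluster whose bitmap has only pages 0 and 1 present,
 * the first pass reads those two pages and rewrites them to the new
 * backing store; the next pass finds pages 2 and 3 unavailable, skips
 * past them, and, having reached the cluster boundary, installs the
 * accumulated write_vsmap entry before moving on to the next cluster.
 */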
3468
3469 VSM_CLR(write_vsmap);
3470 VSM_CLR(original_read_vsmap);
3471 /* grab the actual object's pages to sync with I/O */
3472 while (cnt && (error == KERN_SUCCESS)) {
3473 vsmap_ptr = vs_get_map_entry(vs, offset);
3474 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3475
3476 if (actual_offset == (vm_offset_t) -1) {
3477
3478 /*
3479 * Nothing left to write in this cluster. At least
3480 * set the write cluster information for any previous
3481 * write, and clear it for the next cluster, if there is one.
3482 */
3483 unsigned int local_size, clmask, clsize;
3484
3485 clsize = vm_page_size << vs->vs_clshift;
3486 clmask = clsize - 1;
3487 local_size = clsize - (offset & clmask);
3488 ASSERT(local_size);
3489 local_size = MIN(local_size, cnt);
3490
3491 /* This cluster has no data in it beyond what may */
3492 /* have been found on a previous iteration through */
3493 /* the loop "write_vsmap" */
3494 *vsmap_ptr = write_vsmap;
3495 VSM_CLR(write_vsmap);
3496 VSM_CLR(original_read_vsmap);
3497
3498 cnt -= local_size;
3499 offset += local_size;
3500 continue;
3501 }
3502
3503 /*
3504 * Count up contiguous available or unavailable
3505 * pages.
3506 */
3507 ps = CLMAP_PS(clmap);
3508 ASSERT(ps);
3509 size = 0;
3510 unavail_size = 0;
3511 for (i = 0;
3512 (size < cnt) && (unavail_size < cnt) &&
3513 (i < CLMAP_NPGS(clmap)); i++) {
3514 if (CLMAP_ISSET(clmap, i)) {
3515 if (unavail_size != 0)
3516 break;
3517 size += vm_page_size;
3518 BS_STAT(ps->ps_bs,
3519 ps->ps_bs->bs_pages_in++);
3520 } else {
3521 if (size != 0)
3522 break;
3523 unavail_size += vm_page_size;
3524 }
3525 }
3526
3527 if (size == 0) {
3528 ASSERT(unavail_size);
3529 cnt -= unavail_size;
3530 offset += unavail_size;
3531 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3532 == 0) {
3533 /* There is no more to transfer in this
3534 cluster
3535 */
3536 *vsmap_ptr = write_vsmap;
3537 VSM_CLR(write_vsmap);
3538 VSM_CLR(original_read_vsmap);
3539 }
3540 continue;
3541 }
3542
3543 if(VSM_ISCLR(original_read_vsmap))
3544 original_read_vsmap = *vsmap_ptr;
3545
3546 if(ps->ps_segtype == PS_PARTITION) {
3547 /*
3548 NEED TO ISSUE WITH SYNC & NO COMMIT
3549 error = ps_read_device(ps, actual_offset, &buffer,
3550 size, &residual, flags);
3551 */
3552 } else {
3553 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3554 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3555 size, &residual,
3556 (UPL_IOSYNC | UPL_NOCOMMIT));
3557 }
3558
3559 read_vsmap = *vsmap_ptr;
3560
3561
3562 /*
3563 * Adjust counts and put data in new BS. Optimize for the
3564 * common case, i.e. no error and no partial data.
3565 * If there was an error, then we need to error the entire
3566 * range, even if some data was successfully read.
3567 *
3568 */
3569 if ((error == KERN_SUCCESS) && (residual == 0)) {
3570
3571 /*
3572 * Got everything we asked for, supply the data to
3573 * the new BS. Note that as a side effect of supplying
3574 * the data, the buffer holding the supplied data is
3575 * deallocated from the pager's address space unless
3576 * the write is unsuccessful.
3577 */
3578
3579 /* note: the buffer will be cleaned up in all cases, either by */
3580 /* internal_cluster_write or, if an error occurs on the write, */
3581 /* by the vm_map_copy_page_discard call */
3582 *vsmap_ptr = write_vsmap;
3583
3584 if(vs_cluster_write(vs, upl, offset,
3585 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3586 error = KERN_FAILURE;
3587 if(!(VSM_ISCLR(*vsmap_ptr))) {
3588 /* unmap the new backing store object */
3589 ps_clunmap(vs, offset, size);
3590 }
3591 /* original vsmap */
3592 *vsmap_ptr = original_read_vsmap;
3593 VSM_CLR(write_vsmap);
3594 } else {
3595 if((offset + size) &
3596 ((vm_page_size << vs->vs_clshift)
3597 - 1)) {
3598 /* There is more to transfer in this
3599 cluster
3600 */
3601 write_vsmap = *vsmap_ptr;
3602 *vsmap_ptr = read_vsmap;
3603 } else {
3604 /* discard the old backing object */
3605 write_vsmap = *vsmap_ptr;
3606 *vsmap_ptr = read_vsmap;
3607 ps_clunmap(vs, offset, size);
3608 *vsmap_ptr = write_vsmap;
3609 VSM_CLR(write_vsmap);
3610 VSM_CLR(original_read_vsmap);
3611 }
3612 }
3613 } else {
3614 size_wanted = size;
3615 if (error == KERN_SUCCESS) {
3616 if (residual == size) {
3617 /*
3618 * If a read operation returns no error
3619 * and no data moved, we turn it into
3620 * an error, assuming we're reading at
3621 * or beyond EOF.
3622 * Fall through and error the entire
3623 * range.
3624 */
3625 error = KERN_FAILURE;
3626 *vsmap_ptr = write_vsmap;
3627 if(!(VSM_ISCLR(*vsmap_ptr))) {
3628 /* unmap the new backing store object */
3629 ps_clunmap(vs, offset, size);
3630 }
3631 *vsmap_ptr = original_read_vsmap;
3632 VSM_CLR(write_vsmap);
3633 continue;
3634 } else {
3635 /*
3636 * Otherwise, we have a partial read.
3637 * This is also considered an error
3638 * for the purposes of cluster transfer
3639 */
3640 error = KERN_FAILURE;
3641 *vsmap_ptr = write_vsmap;
3642 if(!(VSM_ISCLR(*vsmap_ptr))) {
3643 /* unmap the new backing store object */
3644 ps_clunmap(vs, offset, size);
3645 }
3646 *vsmap_ptr = original_read_vsmap;
3647 VSM_CLR(write_vsmap);
3648 continue;
3649 }
3650 }
3651
3652 }
3653 cnt -= size;
3654 offset += size;
3655
3656 } /* END while (cnt && (error == 0)) */
3657 if(!VSM_ISCLR(write_vsmap))
3658 *vsmap_ptr = write_vsmap;
3659
3660 return error;
3661 }
3662
3663 kern_return_t
3664 default_pager_add_file(
3665 MACH_PORT_FACE backing_store,
3666 vnode_ptr_t vp,
3667 int record_size,
3668 vm_size_t size)
3669 {
3670 backing_store_t bs;
3671 paging_segment_t ps;
3672 int i;
3673 unsigned int j;
3674 int error;
3675
3676 if ((bs = backing_store_lookup(backing_store))
3677 == BACKING_STORE_NULL)
3678 return KERN_INVALID_ARGUMENT;
3679
3680 PSL_LOCK();
3681 for (i = 0; i <= paging_segment_max; i++) {
3682 ps = paging_segments[i];
3683 if (ps == PAGING_SEGMENT_NULL)
3684 continue;
3685 if (ps->ps_segtype != PS_FILE)
3686 continue;
3687
3688 /*
3689 * Check for overlap on same device.
3690 */
3691 if (ps->ps_vnode == (struct vnode *)vp) {
3692 PSL_UNLOCK();
3693 BS_UNLOCK(bs);
3694 return KERN_INVALID_ARGUMENT;
3695 }
3696 }
3697 PSL_UNLOCK();
3698
3699 /*
3700 * Set up the paging segment
3701 */
3702 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3703 if (ps == PAGING_SEGMENT_NULL) {
3704 BS_UNLOCK(bs);
3705 return KERN_RESOURCE_SHORTAGE;
3706 }
3707
3708 ps->ps_segtype = PS_FILE;
3709 ps->ps_vnode = (struct vnode *)vp;
3710 ps->ps_offset = 0;
3711 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3712 ps->ps_recnum = size;
3713 ps->ps_pgnum = size >> ps->ps_record_shift;
3714
3715 ps->ps_pgcount = ps->ps_pgnum;
3716 ps->ps_clshift = local_log2(bs->bs_clsize);
3717 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3718 ps->ps_hint = 0;
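/*
 * Worked example with hypothetical values: for a 128MB file presented
 * as 512-byte records (record_size == 512, size == 262144 records),
 * 4K pages and an 8-page backing store cluster size (bs_clsize == 8):
 * ps_record_shift == local_log2(4096/512) == 3, ps_pgnum ==
 * 262144 >> 3 == 32768 pages, ps_clshift == 3, and ps_clcount ==
 * 32768 >> 3 == 4096 clusters of 32KB each.
 */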
3719
3720 PS_LOCK_INIT(ps);
3721 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3722 if (!ps->ps_bmap) {
3723 kfree(ps, sizeof *ps);
3724 BS_UNLOCK(bs);
3725 return KERN_RESOURCE_SHORTAGE;
3726 }
3727 for (j = 0; j < ps->ps_ncls; j++) {
3728 clrbit(ps->ps_bmap, j);
3729 }
3730
3731 ps->ps_going_away = FALSE;
3732 ps->ps_bs = bs;
3733
3734 if ((error = ps_enter(ps)) != 0) {
3735 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3736 kfree(ps, sizeof *ps);
3737 BS_UNLOCK(bs);
3738 return KERN_RESOURCE_SHORTAGE;
3739 }
3740
3741 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3742 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3743 PSL_LOCK();
3744 dp_pages_free += ps->ps_pgcount;
3745 PSL_UNLOCK();
3746
3747 BS_UNLOCK(bs);
3748
3749 bs_more_space(ps->ps_clcount);
3750
3751 DP_DEBUG(DEBUG_BS_INTERNAL,
3752 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3753 device, offset, size, record_size,
3754 ps->ps_record_shift, ps->ps_pgnum));
3755
3756 return KERN_SUCCESS;
3757 }
3758
3759
3760
3761 kern_return_t
3762 ps_read_file(
3763 paging_segment_t ps,
3764 upl_t upl,
3765 upl_offset_t upl_offset,
3766 vm_offset_t offset,
3767 upl_size_t size,
3768 unsigned int *residualp,
3769 int flags)
3770 {
3771 vm_object_offset_t f_offset;
3772 int error = 0;
3773 int result;
3774
3775 assert(dp_encryption_inited);
3776
3777 clustered_reads[atop_32(size)]++;
3778
3779 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3780
3781 /* for transfer case we need to pass uploffset and flags */
3782 error = vnode_pagein(ps->ps_vnode,
3783 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3784
3785 /* The vnode_pagein semantic is somewhat at odds with the existing */
3786 /* device_read semantic. Partial reads are not experienced at this */
3787 /* level. It is up to the bit map code and cluster read code to */
3788 /* check that requested data locations are actually backed, and the */
3789 /* pagein code to either read all of the requested data or return an */
3790 /* error. */
3791
3792 if (error)
3793 result = KERN_FAILURE;
3794 else {
3795 *residualp = 0;
3796 result = KERN_SUCCESS;
3797 }
3798 return result;
3799 }
3800
3801 kern_return_t
3802 ps_write_file(
3803 paging_segment_t ps,
3804 upl_t upl,
3805 upl_offset_t upl_offset,
3806 vm_offset_t offset,
3807 unsigned int size,
3808 int flags)
3809 {
3810 vm_object_offset_t f_offset;
3811 kern_return_t result;
3812
3813 assert(dp_encryption_inited);
3814
3815 clustered_writes[atop_32(size)]++;
3816 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3817
3818 if (flags & UPL_PAGING_ENCRYPTED) {
3819 /*
3820 * ENCRYPTED SWAP:
3821 * encrypt all the pages that we're going
3822 * to pageout.
3823 */
3824 upl_encrypt(upl, upl_offset, size);
3825 }
3826
3827 if (vnode_pageout(ps->ps_vnode,
3828 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3829 result = KERN_FAILURE;
3830 else
3831 result = KERN_SUCCESS;
3832
3833 return result;
3834 }
3835
3836 kern_return_t
3837 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
3838 int hi_wat,
3839 int lo_wat,
3840 int flags,
3841 MACH_PORT_FACE trigger_port)
3842 {
3843 MACH_PORT_FACE release;
3844 kern_return_t kr;
3845
3846 PSL_LOCK();
3847 if (flags == SWAP_ENCRYPT_ON) {
3848 /* ENCRYPTED SWAP: turn encryption on */
3849 release = trigger_port;
3850 if (!dp_encryption_inited) {
3851 dp_encryption_inited = TRUE;
3852 dp_encryption = TRUE;
3853 kr = KERN_SUCCESS;
3854 } else {
3855 kr = KERN_FAILURE;
3856 }
3857 } else if (flags == SWAP_ENCRYPT_OFF) {
3858 /* ENCRYPTED SWAP: turn encryption off */
3859 release = trigger_port;
3860 if (!dp_encryption_inited) {
3861 dp_encryption_inited = TRUE;
3862 dp_encryption = FALSE;
3863 kr = KERN_SUCCESS;
3864 } else {
3865 kr = KERN_FAILURE;
3866 }
3867 } else if (flags == HI_WAT_ALERT) {
3868 release = min_pages_trigger_port;
3869 min_pages_trigger_port = trigger_port;
3870 minimum_pages_remaining = hi_wat/vm_page_size;
3871 bs_low = FALSE;
3872 kr = KERN_SUCCESS;
3873 } else if (flags == LO_WAT_ALERT) {
3874 release = max_pages_trigger_port;
3875 max_pages_trigger_port = trigger_port;
3876 maximum_pages_free = lo_wat/vm_page_size;
3877 kr = KERN_SUCCESS;
3878 } else {
3879 release = trigger_port;
3880 kr = KERN_INVALID_ARGUMENT;
3881 }
3882 PSL_UNLOCK();
3883
3884 if (IP_VALID(release))
3885 ipc_port_release_send(release);
3886
3887 return kr;
3888 }
3889
3890 /*
3891 * Monitor the amount of available backing store vs. the amount of
3892 * required backing store, notify a listener (if present) when
3893 * backing store may safely be removed.
3894 *
3895 * We attempt to avoid the situation where backing store is
3896 * discarded en masse, as this can lead to thrashing as the
3897 * backing store is compacted.
3898 */
3899
3900 #define PF_INTERVAL 3 /* time between free level checks */
3901 #define PF_LATENCY 10 /* number of intervals before release */
3902
3903 static int dp_pages_free_low_count = 0;
3904 thread_call_t default_pager_backing_store_monitor_callout;
3905
3906 void
3907 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
3908 __unused thread_call_param_t p2)
3909 {
3910 // unsigned long long average;
3911 ipc_port_t trigger;
3912 uint64_t deadline;
3913
3914 /*
3915 * We determine whether it will be safe to release some
3916 * backing store by watching the free page level. If
3917 * it remains above the maximum_pages_free threshold for
3918 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3919 * then we deem it safe.
3920 *
3921 * Note that this establishes a maximum rate at which backing
3922 * store will be released, as each notification (currently)
3923 * only results in a single backing store object being
3924 * released.
3925 */
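/*
 * For example, with the constants defined above (PF_INTERVAL == 3,
 * PF_LATENCY == 10): dp_pages_free must stay above maximum_pages_free
 * for more than 10 consecutive checks taken 3 seconds apart (a little
 * over half a minute) before the LO_WAT_ALERT notification is sent.
 */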
3926 if (dp_pages_free > maximum_pages_free) {
3927 dp_pages_free_low_count++;
3928 } else {
3929 dp_pages_free_low_count = 0;
3930 }
3931
3932 /* decide whether to send notification */
3933 trigger = IP_NULL;
3934 if (max_pages_trigger_port &&
3935 (backing_store_release_trigger_disable == 0) &&
3936 (dp_pages_free_low_count > PF_LATENCY)) {
3937 trigger = max_pages_trigger_port;
3938 max_pages_trigger_port = NULL;
3939 }
3940
3941 /* send notification */
3942 if (trigger != IP_NULL) {
3943 VSL_LOCK();
3944 if(backing_store_release_trigger_disable != 0) {
3945 assert_wait((event_t)
3946 &backing_store_release_trigger_disable,
3947 THREAD_UNINT);
3948 VSL_UNLOCK();
3949 thread_block(THREAD_CONTINUE_NULL);
3950 } else {
3951 VSL_UNLOCK();
3952 }
3953 default_pager_space_alert(trigger, LO_WAT_ALERT);
3954 ipc_port_release_send(trigger);
3955 dp_pages_free_low_count = 0;
3956 }
3957
3958 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
3959 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
3960 }