osfmk/default_pager/dp_backing_store.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23  * @OSF_COPYRIGHT@
  24  */
  25 /*
  26  * Mach Operating System
  27  * Copyright (c) 1991,1990,1989 Carnegie Mellon University
  28  * All Rights Reserved.
  29  *
  30  * Permission to use, copy, modify and distribute this software and its
  31  * documentation is hereby granted, provided that both the copyright
  32  * notice and this permission notice appear in all copies of the
  33  * software, derivative works or modified versions, and any portions
  34  * thereof, and that both notices appear in supporting documentation.
  35  *
  36  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  37  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  38  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  39  *
  40  * Carnegie Mellon requests users of this software to return to
  41  *
  42  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  43  *  School of Computer Science
  44  *  Carnegie Mellon University
  45  *  Pittsburgh PA 15213-3890
  46  *
  47  * any improvements or extensions that they make and grant Carnegie Mellon
  48  * the rights to redistribute these changes.
  49  */
  50
  51 /*
  52  *      Default Pager.
  53  *              Paging File Management.
  54  */
  55
  56 #include <mach/host_priv.h>
  57 #include <mach/memory_object_control.h>
  58 #include <mach/memory_object_server.h>
  59 #include <mach/upl.h>
  60 #include <default_pager/default_pager_internal.h>
  61 #include <default_pager/default_pager_alerts.h>
  62 #include <default_pager/default_pager_object_server.h>
  63
  64 #include <ipc/ipc_types.h>
  65 #include <ipc/ipc_port.h>
  66 #include <ipc/ipc_space.h>
  67
  68 #include <kern/kern_types.h>
  69 #include <kern/host.h>
  70 #include <kern/queue.h>
  71 #include <kern/counters.h>
  72 #include <kern/sched_prim.h>
  73
  74 #include <vm/vm_kern.h>
  75 #include <vm/vm_pageout.h>
  76 #include <vm/vm_map.h>
  77 #include <vm/vm_object.h>
  78 #include <vm/vm_protos.h>
  79
  80 /* LP64todo - need large internal object support */
  81
/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, then this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future
 */

#define ALLOC_STRIDE  (1024 * 1024 * 1024)	/* 1 GB */
/* presumably counts clusters moved between paging segments -- TODO confirm */
int physical_transfer_cluster_count = 0;

/*
 * VM_SUPER_CLUSTER is the size, in bytes (0x40000 = 256 KB), of the
 * temporary transfer object and UPL that ps_delete() requests.
 * VM_SUPER_PAGES sizes the clustered_* statistics arrays below;
 * presumably it is the page count of one super cluster
 * (0x40000 / 64 = 4096 bytes per page) -- TODO confirm.
 */
#define VM_SUPER_CLUSTER        0x40000
#define VM_SUPER_PAGES          64

/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 *
 * vstruct_def_clshift may still be changed (bs_set_default_clsize(),
 * bs_get_global_clsize()) for as long as default_pager_clsize is 0.
 * bs_get_global_clsize() then fixes default_pager_clsize to
 * (1 << vstruct_def_clshift) pages.
 */
#define VSTRUCT_DEF_CLSHIFT     2
int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
int default_pager_clsize = 0;		/* in pages; 0 == not decided yet */

/*
 * statistics
 *
 * Both arrays have VM_SUPER_PAGES + 1 slots and are zeroed by
 * bs_initialize(); presumably slot N counts clustered I/Os of N pages
 * -- TODO confirm against the code that updates them.
 */
unsigned int clustered_writes[VM_SUPER_PAGES+1];
unsigned int clustered_reads[VM_SUPER_PAGES+1];
 110
/*
 * Globals used for asynchronous paging operations:
 *      vs_async_list:  head of list of to-be-completed I/O ops
 *      async_num_queued: number of pages completed, but not yet
 *              processed by async thread.
 *      async_requests_out: number of pages of requests not completed.
 *
 * These three globals are currently compiled out (#if 0).
 */

#if 0
struct vs_async *vs_async_list;
int     async_num_queued;
int     async_requests_out;
#endif


/*
 * When VS_ASYNC_REUSE is non-zero, bs_initialize() resets
 * vs_async_free_list to NULL; presumably released vs_async structures
 * are cached on that list for reuse -- TODO confirm.
 */
#define VS_ASYNC_REUSE 1
struct vs_async *vs_async_free_list;

mutex_t default_pager_async_lock;       /* Protects globals above */


int vs_alloc_async_failed = 0;                  /* statistics */
int vs_alloc_async_count = 0;                   /* statistics */
struct vs_async *vs_alloc_async(void);          /* forward */
void vs_free_async(struct vs_async *vsa);       /* forward */


/* Thin wrappers around the vs_async allocator declared just above. */
#define VS_ALLOC_ASYNC()        vs_alloc_async()
#define VS_FREE_ASYNC(vsa)      vs_free_async(vsa)

/*
 * Wrappers for default_pager_async_lock.  VS_ASYNC_LOCK_INIT() is
 * invoked once, from bs_initialize().
 */
#define VS_ASYNC_LOCK()         mutex_lock(&default_pager_async_lock)
#define VS_ASYNC_UNLOCK()       mutex_unlock(&default_pager_async_lock)
#define VS_ASYNC_LOCK_INIT()    mutex_init(&default_pager_async_lock, 0)
#define VS_ASYNC_LOCK_ADDR()    (&default_pager_async_lock)
/*
 *  Paging Space Hysteresis triggers and the target notification port
 *
 *  Everything here starts out cleared (0 / NULL / FALSE).
 *  Presumably a notification goes to min_pages_trigger_port when free
 *  paging space drops below minimum_pages_remaining, and to
 *  max_pages_trigger_port when it rises above maximum_pages_free
 *  -- TODO confirm against the code that uses them.
 */

unsigned int    minimum_pages_remaining = 0;
unsigned int    maximum_pages_free = 0;
ipc_port_t      min_pages_trigger_port = NULL;
ipc_port_t      max_pages_trigger_port = NULL;

boolean_t       bs_low = FALSE;
/* ps_delete() sleeps (VSL_SLEEP) for as long as this is non-zero */
int             backing_store_release_trigger_disable = 0;


/* Have we decided if swap needs to be encrypted yet ? */
boolean_t       dp_encryption_inited = FALSE;
/* Should we encrypt swap ? */
boolean_t       dp_encryption = FALSE;
 163
 164
/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t       max_doubled_size = 4 * 1024 * 1024;     /* 4 meg */

/*
 * List of all backing store and segments.
 *
 * backing_store_list is walked under BSL_LOCK() (see
 * backing_store_lookup()).  paging_segments[] is scanned under
 * PSL_LOCK() over indices 0..paging_segment_max inclusive, skipping
 * PAGING_SEGMENT_NULL slots, so paging_segment_max is the highest index
 * to examine rather than a count.
 */
struct backing_store_list_head backing_store_list;
paging_segment_t        paging_segments[MAX_NUM_PAGING_SEGMENTS];
mutex_t                 paging_segments_lock;
int                     paging_segment_max = 0;
int                     paging_segment_count = 0;
/*
 * NOTE(review): the initializer supplies exactly five -1 entries; if
 * BS_MAXPRI + 1 is larger than 5 the remaining slots are zero, not -1
 * -- confirm BS_MAXPRI.
 */
int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };


/*
 * Total pages free in system
 * This differs from clusters committed/avail which is a measure of the
 * over commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 *
 * ps_delete() refuses to start a transfer while dp_pages_free is below
 * cluster_transfer_minimum.
 */
unsigned  int   dp_pages_free = 0;
unsigned  int   cluster_transfer_minimum = 100;
 190
/*
 * forward declarations
 *
 * Prototypes for routines that are referenced before their definitions.
 * get_read_buffer() is defined immediately below;
 * ps_vstruct_transfer_from_segment() is called from ps_delete().
 */
kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int);     /* forward */
kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int);     /* forward */
default_pager_thread_t *get_read_buffer( void );
kern_return_t ps_vstruct_transfer_from_segment(
        vstruct_t        vs,
        paging_segment_t segment,
        upl_t            upl);
kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);  /* forward */
kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *);     /* forward */
kern_return_t vs_cluster_transfer(
        vstruct_t       vs,
        upl_offset_t    offset,
        upl_size_t      cnt,
        upl_t           upl);
vs_map_t vs_get_map_entry(
        vstruct_t       vs,
        vm_offset_t     offset);
 209
 210
 211 default_pager_thread_t *
 212 get_read_buffer( void )
 213 {
 214         int     i;
 215
 216         DPT_LOCK(dpt_lock);
 217         while(TRUE) {
 218                 for (i=0; i<default_pager_internal_count; i++) {
 219                         if(dpt_array[i]->checked_out == FALSE) {
 220                           dpt_array[i]->checked_out = TRUE;
 221                           DPT_UNLOCK(dpt_lock);
 222                           return  dpt_array[i];
 223                         }
 224                 }
 225                 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
 226         }
 227 }
 228
 229 void
 230 bs_initialize(void)
 231 {
 232         int i;
 233
 234         /*
 235          * List of all backing store.
 236          */
 237         BSL_LOCK_INIT();
 238         queue_init(&backing_store_list.bsl_queue);
 239         PSL_LOCK_INIT();
 240
 241         VS_ASYNC_LOCK_INIT();
 242 #if     VS_ASYNC_REUSE
 243         vs_async_free_list = NULL;
 244 #endif  /* VS_ASYNC_REUSE */
 245
 246         for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
 247                 clustered_writes[i] = 0;
 248                 clustered_reads[i] = 0;
 249         }
 250
 251 }
 252
 253 /*
 254  * When things do not quite workout...
 255  */
 256 void bs_no_paging_space(boolean_t);     /* forward */
 257
 258 void
 259 bs_no_paging_space(
 260         boolean_t out_of_memory)
 261 {
 262
 263         if (out_of_memory)
 264                 dprintf(("*** OUT OF MEMORY ***\n"));
 265         panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
 266 }
 267
void bs_more_space(int);        /* forward */
void bs_commit(int);            /* forward */

/*
 * Cluster over-commitment accounting, updated by bs_more_space() and
 * bs_commit() while holding BSL_LOCK():
 *	user_warned		 an over-commit warning has been printed
 *				 and not yet cleared
 *	clusters_committed	 running total added to by bs_commit()
 *	clusters_available	 running total added to by bs_more_space()
 *	clusters_committed_peak	 highest clusters_committed seen while
 *				 over-committed; reset to 0 when the
 *				 "OK now" message is printed
 */
boolean_t       user_warned = FALSE;
unsigned int    clusters_committed = 0;
unsigned int    clusters_available = 0;
unsigned int    clusters_committed_peak = 0;
 275
 276 void
 277 bs_more_space(
 278         int     nclusters)
 279 {
 280         BSL_LOCK();
 281         /*
 282          * Account for new paging space.
 283          */
 284         clusters_available += nclusters;
 285
 286         if (clusters_available >= clusters_committed) {
 287                 if (verbose && user_warned) {
 288                         printf("%s%s - %d excess clusters now.\n",
 289                                my_name,
 290                                "paging space is OK now",
 291                                clusters_available - clusters_committed);
 292                         user_warned = FALSE;
 293                         clusters_committed_peak = 0;
 294                 }
 295         } else {
 296                 if (verbose && user_warned) {
 297                         printf("%s%s - still short of %d clusters.\n",
 298                                my_name,
 299                                "WARNING: paging space over-committed",
 300                                clusters_committed - clusters_available);
 301                         clusters_committed_peak -= nclusters;
 302                 }
 303         }
 304         BSL_UNLOCK();
 305
 306         return;
 307 }
 308
 309 void
 310 bs_commit(
 311         int     nclusters)
 312 {
 313         BSL_LOCK();
 314         clusters_committed += nclusters;
 315         if (clusters_committed > clusters_available) {
 316                 if (verbose && !user_warned) {
 317                         user_warned = TRUE;
 318                         printf("%s%s - short of %d clusters.\n",
 319                                my_name,
 320                                "WARNING: paging space over-committed",
 321                                clusters_committed - clusters_available);
 322                 }
 323                 if (clusters_committed > clusters_committed_peak) {
 324                         clusters_committed_peak = clusters_committed;
 325                 }
 326         } else {
 327                 if (verbose && user_warned) {
 328                         printf("%s%s - was short of up to %d clusters.\n",
 329                                my_name,
 330                                "paging space is OK now",
 331                                clusters_committed_peak - clusters_available);
 332                         user_warned = FALSE;
 333                         clusters_committed_peak = 0;
 334                 }
 335         }
 336         BSL_UNLOCK();
 337
 338         return;
 339 }
 340
 341 int default_pager_info_verbose = 1;
 342
 343 void
 344 bs_global_info(
 345         vm_size_t       *totalp,
 346         vm_size_t       *freep)
 347 {
 348         vm_size_t               pages_total, pages_free;
 349         paging_segment_t        ps;
 350         int                     i;
 351
 352         PSL_LOCK();
 353         pages_total = pages_free = 0;
 354         for (i = 0; i <= paging_segment_max; i++) {
 355                 ps = paging_segments[i];
 356                 if (ps == PAGING_SEGMENT_NULL)
 357                         continue;
 358
 359                 /*
 360                  * no need to lock: by the time this data
 361                  * gets back to any remote requestor it
 362                  * will be obsolete anyways
 363                  */
 364                 pages_total += ps->ps_pgnum;
 365                 pages_free += ps->ps_clcount << ps->ps_clshift;
 366                 DP_DEBUG(DEBUG_BS_INTERNAL,
 367                          ("segment #%d: %d total, %d free\n",
 368                           i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
 369         }
 370         *totalp = pages_total;
 371         *freep = pages_free;
 372         if (verbose && user_warned && default_pager_info_verbose) {
 373                 if (clusters_available < clusters_committed) {
 374                         printf("%s %d clusters committed, %d available.\n",
 375                                my_name,
 376                                clusters_committed,
 377                                clusters_available);
 378                 }
 379         }
 380         PSL_UNLOCK();
 381 }
 382
 383 backing_store_t backing_store_alloc(void);      /* forward */
 384
 385 backing_store_t
 386 backing_store_alloc(void)
 387 {
 388         backing_store_t bs;
 389
 390         bs = (backing_store_t) kalloc(sizeof (struct backing_store));
 391         if (bs == BACKING_STORE_NULL)
 392                 panic("backing_store_alloc: no memory");
 393
 394         BS_LOCK_INIT(bs);
 395         bs->bs_port = MACH_PORT_NULL;
 396         bs->bs_priority = 0;
 397         bs->bs_clsize = 0;
 398         bs->bs_pages_total = 0;
 399         bs->bs_pages_in = 0;
 400         bs->bs_pages_in_fail = 0;
 401         bs->bs_pages_out = 0;
 402         bs->bs_pages_out_fail = 0;
 403
 404         return bs;
 405 }
 406
 407 backing_store_t backing_store_lookup(MACH_PORT_FACE);   /* forward */
 408
 409 /* Even in both the component space and external versions of this pager, */
 410 /* backing_store_lookup will be called from tasks in the application space */
 411 backing_store_t
 412 backing_store_lookup(
 413         MACH_PORT_FACE port)
 414 {
 415         backing_store_t bs;
 416
 417 /*
 418         port is currently backed with a vs structure in the alias field
 419         we could create an ISBS alias and a port_is_bs call but frankly
 420         I see no reason for the test, the bs->port == port check below
 421         will work properly on junk entries.
 422
 423         if ((port == MACH_PORT_NULL) || port_is_vs(port))
 424 */
 425         if ((port == MACH_PORT_NULL))
 426                 return BACKING_STORE_NULL;
 427
 428         BSL_LOCK();
 429         queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
 430                       bs_links) {
 431                 BS_LOCK(bs);
 432                 if (bs->bs_port == port) {
 433                         BSL_UNLOCK();
 434                         /* Success, return it locked. */
 435                         return bs;
 436                 }
 437                 BS_UNLOCK(bs);
 438         }
 439         BSL_UNLOCK();
 440         return BACKING_STORE_NULL;
 441 }
 442
 443 void backing_store_add(backing_store_t);        /* forward */
 444
 445 void
 446 backing_store_add(
 447         __unused backing_store_t bs)
 448 {
 449 //      MACH_PORT_FACE          port = bs->bs_port;
 450 //      MACH_PORT_FACE          pset = default_pager_default_set;
 451         kern_return_t           kr = KERN_SUCCESS;
 452
 453         if (kr != KERN_SUCCESS)
 454                 panic("backing_store_add: add to set");
 455
 456 }
 457
 458 /*
 459  * Set up default page shift, but only if not already
 460  * set and argument is within range.
 461  */
 462 boolean_t
 463 bs_set_default_clsize(unsigned int npages)
 464 {
 465         switch(npages){
 466             case 1:
 467             case 2:
 468             case 4:
 469             case 8:
 470                 if (default_pager_clsize == 0)  /* if not yet set */
 471                         vstruct_def_clshift = local_log2(npages);
 472                 return(TRUE);
 473         }
 474         return(FALSE);
 475 }
 476
 477 int bs_get_global_clsize(int clsize);   /* forward */
 478
 479 int
 480 bs_get_global_clsize(
 481         int     clsize)
 482 {
 483         int                     i;
 484         memory_object_default_t dmm;
 485         kern_return_t           kr;
 486
 487         /*
 488          * Only allow setting of cluster size once. If called
 489          * with no cluster size (default), we use the compiled-in default
 490          * for the duration. The same cluster size is used for all
 491          * paging segments.
 492          */
 493         if (default_pager_clsize == 0) {
 494                 /*
 495                  * Keep cluster size in bit shift because it's quicker
 496                  * arithmetic, and easier to keep at a power of 2.
 497                  */
 498                 if (clsize != NO_CLSIZE) {
 499                         for (i = 0; (1 << i) < clsize; i++);
 500                         if (i > MAX_CLUSTER_SHIFT)
 501                                 i = MAX_CLUSTER_SHIFT;
 502                         vstruct_def_clshift = i;
 503                 }
 504                 default_pager_clsize = (1 << vstruct_def_clshift);
 505
 506                 /*
 507                  * Let the user know the new (and definitive) cluster size.
 508                  */
 509                 if (verbose)
 510                         printf("%scluster size = %d page%s\n",
 511                                 my_name, default_pager_clsize,
 512                                 (default_pager_clsize == 1) ? "" : "s");
 513
 514                 /*
 515                  * Let the kernel know too, in case it hasn't used the
 516                  * default value provided in main() yet.
 517                  */
 518                 dmm = default_pager_object;
 519                 clsize = default_pager_clsize * vm_page_size;   /* in bytes */
 520                 kr = host_default_memory_manager(host_priv_self(),
 521                                                  &dmm,
 522                                                  clsize);
 523                 memory_object_default_deallocate(dmm);
 524
 525                 if (kr != KERN_SUCCESS) {
 526                    panic("bs_get_global_cl_size:host_default_memory_manager");
 527                 }
 528                 if (dmm != default_pager_object) {
 529                   panic("bs_get_global_cl_size:there is another default pager");
 530                 }
 531         }
 532         ASSERT(default_pager_clsize > 0 &&
 533                (default_pager_clsize & (default_pager_clsize - 1)) == 0);
 534
 535         return default_pager_clsize;
 536 }
 537
 538 kern_return_t
 539 default_pager_backing_store_create(
 540         memory_object_default_t pager,
 541         int                     priority,
 542         int                     clsize,         /* in bytes */
 543         MACH_PORT_FACE          *backing_store)
 544 {
 545         backing_store_t bs;
 546         MACH_PORT_FACE  port;
 547 //      kern_return_t   kr;
 548         struct vstruct_alias *alias_struct;
 549
 550         if (pager != default_pager_object)
 551                 return KERN_INVALID_ARGUMENT;
 552
 553         bs = backing_store_alloc();
 554         port = ipc_port_alloc_kernel();
 555         ipc_port_make_send(port);
 556         assert (port != IP_NULL);
 557
 558         DP_DEBUG(DEBUG_BS_EXTERNAL,
 559                  ("priority=%d clsize=%d bs_port=0x%x\n",
 560                   priority, clsize, (int) backing_store));
 561
 562         alias_struct = (struct vstruct_alias *)
 563                                 kalloc(sizeof (struct vstruct_alias));
 564         if(alias_struct != NULL) {
 565                 alias_struct->vs = (struct vstruct *)bs;
 566                 alias_struct->name = &default_pager_ops;
 567                 port->alias = (int) alias_struct;
 568         }
 569         else {
 570                 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
 571                 kfree(bs, sizeof (struct backing_store));
 572                 return KERN_RESOURCE_SHORTAGE;
 573         }
 574
 575         bs->bs_port = port;
 576         if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
 577                 priority = BS_MAXPRI;
 578         else if (priority == BS_NOPRI)
 579                 priority = BS_MAXPRI;
 580         else
 581                 priority = BS_MINPRI;
 582         bs->bs_priority = priority;
 583
 584         bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
 585
 586         BSL_LOCK();
 587         queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
 588                     bs_links);
 589         BSL_UNLOCK();
 590
 591         backing_store_add(bs);
 592
 593         *backing_store = port;
 594         return KERN_SUCCESS;
 595 }
 596
 597 kern_return_t
 598 default_pager_backing_store_info(
 599         MACH_PORT_FACE          backing_store,
 600         backing_store_flavor_t  flavour,
 601         backing_store_info_t    info,
 602         mach_msg_type_number_t  *size)
 603 {
 604         backing_store_t                 bs;
 605         backing_store_basic_info_t      basic;
 606         int                             i;
 607         paging_segment_t                ps;
 608
 609         if (flavour != BACKING_STORE_BASIC_INFO ||
 610             *size < BACKING_STORE_BASIC_INFO_COUNT)
 611                 return KERN_INVALID_ARGUMENT;
 612
 613         basic = (backing_store_basic_info_t)info;
 614         *size = BACKING_STORE_BASIC_INFO_COUNT;
 615
 616         VSTATS_LOCK(&global_stats.gs_lock);
 617         basic->pageout_calls    = global_stats.gs_pageout_calls;
 618         basic->pagein_calls     = global_stats.gs_pagein_calls;
 619         basic->pages_in         = global_stats.gs_pages_in;
 620         basic->pages_out        = global_stats.gs_pages_out;
 621         basic->pages_unavail    = global_stats.gs_pages_unavail;
 622         basic->pages_init       = global_stats.gs_pages_init;
 623         basic->pages_init_writes= global_stats.gs_pages_init_writes;
 624         VSTATS_UNLOCK(&global_stats.gs_lock);
 625
 626         if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
 627                 return KERN_INVALID_ARGUMENT;
 628
 629         basic->bs_pages_total   = bs->bs_pages_total;
 630         PSL_LOCK();
 631         bs->bs_pages_free = 0;
 632         for (i = 0; i <= paging_segment_max; i++) {
 633                 ps = paging_segments[i];
 634                 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
 635                         PS_LOCK(ps);
 636                         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
 637                         PS_UNLOCK(ps);
 638                 }
 639         }
 640         PSL_UNLOCK();
 641         basic->bs_pages_free    = bs->bs_pages_free;
 642         basic->bs_pages_in      = bs->bs_pages_in;
 643         basic->bs_pages_in_fail = bs->bs_pages_in_fail;
 644         basic->bs_pages_out     = bs->bs_pages_out;
 645         basic->bs_pages_out_fail= bs->bs_pages_out_fail;
 646
 647         basic->bs_priority      = bs->bs_priority;
 648         basic->bs_clsize        = ptoa_32(bs->bs_clsize);       /* in bytes */
 649
 650         BS_UNLOCK(bs);
 651
 652         return KERN_SUCCESS;
 653 }
 654
int ps_delete(paging_segment_t);        /* forward */

/*
 * ps_delete:
 *
 * Walk the list of vstructs and, for each one, call
 * ps_vstruct_transfer_from_segment() on paging segment "ps", using a
 * freshly allocated VM_SUPER_CLUSTER-byte VM object and UPL as the
 * transfer buffer.
 *
 * Each vstruct is pinned while it is being worked on by bumping its
 * vs_async_pending count (after waiting out async activity with
 * vs_async_wait()); the next vstruct is pinned before the current one
 * is released, and waiters are woken when a count drops to zero.
 *
 * Returns KERN_SUCCESS when the list is empty or every vstruct examined
 * was processed.  Returns KERN_FAILURE as soon as dp_pages_free is below
 * cluster_transfer_minimum, the UPL request fails, or the transfer
 * itself reports an error; the current vstruct is unpinned first.
 */
int
ps_delete(
        paging_segment_t ps)
{
        vstruct_t       vs;
        kern_return_t   error = KERN_SUCCESS;
        int             vs_count;

        VSL_LOCK();             /* get the lock on the list of vs's      */

        /* The lock relationship and sequence is fairly complicated      */
        /* this code looks at a live list, locking and unlocking the list */
        /* as it traverses it.  It depends on the locking behavior of    */
        /* default_pager_no_senders.  no_senders always locks the vstruct */
        /* targeted for removal before locking the vstruct list.  However */
        /* it will remove that member of the list without locking its    */
        /* neighbors.  We can be sure when we hold a lock on a vstruct   */
        /* it cannot be removed from the list but we must hold the list  */
        /* lock to be sure that its pointers to its neighbors are valid. */
        /* Also, we can hold off destruction of a vstruct when the list  */
        /* lock and the vs locks are not being held by bumping the       */
        /* vs_async_pending count.      */


        /* do not start while release triggers are disabled */
        while(backing_store_release_trigger_disable != 0) {
                VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
        }

        /* we will choose instead to hold a send right */
        /* snapshot the element count: it bounds the walk below */
        vs_count = vstruct_list.vsl_count;
        vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
        /* first element is the list head itself: nothing to do */
        if(vs == (vstruct_t)&vstruct_list)  {
                VSL_UNLOCK();
                return KERN_SUCCESS;
        }
        VS_LOCK(vs);
        vs_async_wait(vs);  /* wait for any pending async writes */
        if ((vs_count != 0) && (vs != NULL))
                vs->vs_async_pending += 1;  /* hold parties calling  */
                                            /* vs_async_wait */
        VS_UNLOCK(vs);
        VSL_UNLOCK();
        while((vs_count != 0) && (vs != NULL)) {
                /* We take the count of AMO's before beginning the         */
                /* transfer of the target segment.                         */
                /* We are guaranteed that the target segment cannot get    */
                /* more users.  We also know that queue entries are        */
                /* made at the back of the list.  If some of the entries   */
                /* we would check disappear while we are traversing the    */
                /* list then we will either check new entries which        */
                /* do not have any backing store in the target segment     */
                /* or re-check old entries.  This might not be optimal     */
                /* but it will always be correct. The alternative is to    */
                /* take a snapshot of the list.                            */
                vstruct_t       next_vs;

                /* too little free paging space to attempt a transfer */
                if(dp_pages_free < cluster_transfer_minimum)
                        error = KERN_FAILURE;
                else {
                        vm_object_t     transfer_object;
                        unsigned int    count;
                        upl_t           upl;

                        /* scratch object + UPL used as the transfer buffer */
                        transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
                        count = 0;
                        error = vm_object_upl_request(transfer_object,
                                (vm_object_offset_t)0, VM_SUPER_CLUSTER,
                                &upl, NULL, &count,
                                UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
                                            | UPL_SET_INTERNAL);
                        if(error == KERN_SUCCESS) {
                                error = ps_vstruct_transfer_from_segment(
                                                        vs, ps, upl);
                                /* the UPL is released whether or not the transfer worked */
                                upl_commit(upl, NULL, 0);
                                upl_deallocate(upl);
                        } else {
                                /* any UPL request error is reported as KERN_FAILURE */
                                error = KERN_FAILURE;
                        }
                        vm_object_deallocate(transfer_object);
                }
                if(error) {
                        /* unpin the current vstruct and give up */
                        VS_LOCK(vs);
                        vs->vs_async_pending -= 1;  /* release vs_async_wait */
                        if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
                                vs->vs_waiting_async = FALSE;
                                VS_UNLOCK(vs);
                                thread_wakeup(&vs->vs_async_pending);
                        } else {
                                VS_UNLOCK(vs);
                        }
                        return KERN_FAILURE;
                }

                VSL_LOCK();

                while(backing_store_release_trigger_disable != 0) {
                        VSL_SLEEP(&backing_store_release_trigger_disable,
                                  THREAD_UNINT);
                }

                /* pin the successor (if there is one left to visit)     */
                /* before letting go of the current vstruct              */
                next_vs = (vstruct_t) queue_next(&(vs->vs_links));
                if((next_vs != (vstruct_t)&vstruct_list) &&
                                (vs != next_vs) && (vs_count != 1)) {
                        VS_LOCK(next_vs);
                        vs_async_wait(next_vs);  /* wait for any  */
                                                 /* pending async writes */
                        next_vs->vs_async_pending += 1; /* hold parties  */
                                                /* calling vs_async_wait */
                        VS_UNLOCK(next_vs);
                }
                VSL_UNLOCK();
                /* now unpin the vstruct we just finished with */
                VS_LOCK(vs);
                vs->vs_async_pending -= 1;
                if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
                        vs->vs_waiting_async = FALSE;
                        VS_UNLOCK(vs);
                        thread_wakeup(&vs->vs_async_pending);
                } else {
                        VS_UNLOCK(vs);
                }
                /* end of list (or a self-linked entry) terminates the walk */
                if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
                        vs = NULL;
                else
                        vs = next_vs;
                vs_count--;
        }
        return KERN_SUCCESS;
}
 785
 786
 787 kern_return_t
 788 default_pager_backing_store_delete(
 789         MACH_PORT_FACE backing_store)
 790 {
 791         backing_store_t         bs;
 792         int                     i;
 793         paging_segment_t        ps;
 794         int                     error;
 795         int                     interim_pages_removed = 0;
 796 //      kern_return_t           kr;
 797
 798         if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
 799                 return KERN_INVALID_ARGUMENT;
 800
 801 #if 0
 802         /* not implemented */
 803         BS_UNLOCK(bs);
 804         return KERN_FAILURE;
 805 #endif
 806
 807     restart:
 808         PSL_LOCK();
 809         error = KERN_SUCCESS;
 810         for (i = 0; i <= paging_segment_max; i++) {
 811                 ps = paging_segments[i];
 812                 if (ps != PAGING_SEGMENT_NULL &&
 813                     ps->ps_bs == bs &&
 814                     ! ps->ps_going_away) {
 815                         PS_LOCK(ps);
 816                         /* disable access to this segment */
 817                         ps->ps_going_away = TRUE;
 818                         PS_UNLOCK(ps);
 819                         /*
 820                          * The "ps" segment is "off-line" now,
 821                          * we can try and delete it...
 822                          */
 823                         if(dp_pages_free < (cluster_transfer_minimum
 824                                                         + ps->ps_pgcount)) {
 825                                 error = KERN_FAILURE;
 826                                 PSL_UNLOCK();
 827                         }
 828                         else {
 829                                 /* remove all pages associated with the  */
 830                                 /* segment from the list of free pages   */
 831                                 /* when transfer is through, all target  */
 832                                 /* segment pages will appear to be free  */
 833
 834                                 dp_pages_free -=  ps->ps_pgcount;
 835                                 interim_pages_removed += ps->ps_pgcount;
 836                                 PSL_UNLOCK();
 837                                 error = ps_delete(ps);
 838                         }
 839                         if (error != KERN_SUCCESS) {
 840                                 /*
 841                                  * We couldn't delete the segment,
 842                                  * probably because there's not enough
 843                                  * virtual memory left.
 844                                  * Re-enable all the segments.
 845                                  */
 846                                 PSL_LOCK();
 847                                 break;
 848                         }
 849                         goto restart;
 850                 }
 851         }
 852
 853         if (error != KERN_SUCCESS) {
 854                 for (i = 0; i <= paging_segment_max; i++) {
 855                         ps = paging_segments[i];
 856                         if (ps != PAGING_SEGMENT_NULL &&
 857                             ps->ps_bs == bs &&
 858                             ps->ps_going_away) {
 859                                 PS_LOCK(ps);
 860                                 /* re-enable access to this segment */
 861                                 ps->ps_going_away = FALSE;
 862                                 PS_UNLOCK(ps);
 863                         }
 864                 }
 865                 dp_pages_free += interim_pages_removed;
 866                 PSL_UNLOCK();
 867                 BS_UNLOCK(bs);
 868                 return error;
 869         }
 870
 871         for (i = 0; i <= paging_segment_max; i++) {
 872                 ps = paging_segments[i];
 873                 if (ps != PAGING_SEGMENT_NULL &&
 874                     ps->ps_bs == bs) {
 875                         if(ps->ps_going_away) {
 876                                 paging_segments[i] = PAGING_SEGMENT_NULL;
 877                                 paging_segment_count--;
 878                                 PS_LOCK(ps);
 879                                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
 880                                 kfree(ps, sizeof *ps);
 881                         }
 882                 }
 883         }
 884
 885         /* Scan the entire ps array separately to make certain we find the */
 886         /* proper paging_segment_max                                       */
 887         for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
 888                 if(paging_segments[i] != PAGING_SEGMENT_NULL)
 889                    paging_segment_max = i;
 890         }
 891
 892         PSL_UNLOCK();
 893
 894         /*
 895          * All the segments have been deleted.
 896          * We can remove the backing store.
 897          */
 898
 899         /*
 900          * Disable lookups of this backing store.
 901          */
 902         if((void *)bs->bs_port->alias != NULL)
 903                 kfree((void *) bs->bs_port->alias,
 904                       sizeof (struct vstruct_alias));
 905         ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
 906         bs->bs_port = MACH_PORT_NULL;
 907         BS_UNLOCK(bs);
 908
 909         /*
 910          * Remove backing store from backing_store list.
 911          */
 912         BSL_LOCK();
 913         queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
 914                      bs_links);
 915         BSL_UNLOCK();
 916
 917         /*
 918          * Free the backing store structure.
 919          */
 920         kfree(bs, sizeof *bs);
 921
 922         return KERN_SUCCESS;
 923 }
 924
 925 int     ps_enter(paging_segment_t);     /* forward */
 926
 927 int
 928 ps_enter(
 929         paging_segment_t ps)
 930 {
 931         int i;
 932
 933         PSL_LOCK();
 934
 935         for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
 936                 if (paging_segments[i] == PAGING_SEGMENT_NULL)
 937                         break;
 938         }
 939
 940         if (i < MAX_NUM_PAGING_SEGMENTS) {
 941                 paging_segments[i] = ps;
 942                 if (i > paging_segment_max)
 943                         paging_segment_max = i;
 944                 paging_segment_count++;
 945                 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
 946                         (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
 947                         ps_select_array[ps->ps_bs->bs_priority] = 0;
 948                 i = 0;
 949         } else {
 950                 PSL_UNLOCK();
 951                 return KERN_RESOURCE_SHORTAGE;
 952         }
 953
 954         PSL_UNLOCK();
 955         return i;
 956 }
 957
 958 #ifdef DEVICE_PAGING
 959 kern_return_t
 960 default_pager_add_segment(
 961         MACH_PORT_FACE  backing_store,
 962         MACH_PORT_FACE  device,
 963         recnum_t        offset,
 964         recnum_t        count,
 965         int             record_size)
 966 {
 967         backing_store_t         bs;
 968         paging_segment_t        ps;
 969         int                     i;
 970         int                     error;
 971
 972         if ((bs = backing_store_lookup(backing_store))
 973             == BACKING_STORE_NULL)
 974                 return KERN_INVALID_ARGUMENT;
 975
 976         PSL_LOCK();
 977         for (i = 0; i <= paging_segment_max; i++) {
 978                 ps = paging_segments[i];
 979                 if (ps == PAGING_SEGMENT_NULL)
 980                         continue;
 981
 982                 /*
 983                  * Check for overlap on same device.
 984                  */
 985                 if (!(ps->ps_device != device
 986                       || offset >= ps->ps_offset + ps->ps_recnum
 987                       || offset + count <= ps->ps_offset)) {
 988                         PSL_UNLOCK();
 989                         BS_UNLOCK(bs);
 990                         return KERN_INVALID_ARGUMENT;
 991                 }
 992         }
 993         PSL_UNLOCK();
 994
 995         /*
 996          * Set up the paging segment
 997          */
 998         ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
 999         if (ps == PAGING_SEGMENT_NULL) {
1000                 BS_UNLOCK(bs);
1001                 return KERN_RESOURCE_SHORTAGE;
1002         }
1003
1004         ps->ps_segtype = PS_PARTITION;
1005         ps->ps_device = device;
1006         ps->ps_offset = offset;
1007         ps->ps_record_shift = local_log2(vm_page_size / record_size);
1008         ps->ps_recnum = count;
1009         ps->ps_pgnum = count >> ps->ps_record_shift;
1010
1011         ps->ps_pgcount = ps->ps_pgnum;
1012         ps->ps_clshift = local_log2(bs->bs_clsize);
1013         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1014         ps->ps_hint = 0;
1015
1016         PS_LOCK_INIT(ps);
1017         ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1018         if (!ps->ps_bmap) {
1019                 kfree(ps, sizeof *ps);
1020                 BS_UNLOCK(bs);
1021                 return KERN_RESOURCE_SHORTAGE;
1022         }
1023         for (i = 0; i < ps->ps_ncls; i++) {
1024                 clrbit(ps->ps_bmap, i);
1025         }
1026
1027         ps->ps_going_away = FALSE;
1028         ps->ps_bs = bs;
1029
1030         if ((error = ps_enter(ps)) != 0) {
1031                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1032                 kfree(ps, sizeof *ps);
1033                 BS_UNLOCK(bs);
1034                 return KERN_RESOURCE_SHORTAGE;
1035         }
1036
1037         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1038         bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1039         BS_UNLOCK(bs);
1040
1041         PSL_LOCK();
1042         dp_pages_free += ps->ps_pgcount;
1043         PSL_UNLOCK();
1044
1045         bs_more_space(ps->ps_clcount);
1046
1047         DP_DEBUG(DEBUG_BS_INTERNAL,
1048                  ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1049                   device, offset, count, record_size,
1050                   ps->ps_record_shift, ps->ps_pgnum));
1051
1052         return KERN_SUCCESS;
1053 }
1054
1055 boolean_t
1056 bs_add_device(
1057         char            *dev_name,
1058         MACH_PORT_FACE  master)
1059 {
1060         security_token_t        null_security_token = {
1061                 { 0, 0 }
1062         };
1063         MACH_PORT_FACE  device;
1064         int             info[DEV_GET_SIZE_COUNT];
1065         mach_msg_type_number_t info_count;
1066         MACH_PORT_FACE  bs = MACH_PORT_NULL;
1067         unsigned int    rec_size;
1068         recnum_t        count;
1069         int             clsize;
1070         MACH_PORT_FACE  reply_port;
1071
1072         if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1073                         null_security_token, dev_name, &device))
1074                 return FALSE;
1075
1076         info_count = DEV_GET_SIZE_COUNT;
1077         if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1078                 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1079                 count = info[DEV_GET_SIZE_DEVICE_SIZE] /  rec_size;
1080                 clsize = bs_get_global_clsize(0);
1081                 if (!default_pager_backing_store_create(
1082                                         default_pager_object,
1083                                         DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1084                                         (clsize * vm_page_size),
1085                                         &bs)) {
1086                         if (!default_pager_add_segment(bs, device,
1087                                                        0, count, rec_size)) {
1088                                 return TRUE;
1089                         }
1090                         ipc_port_release_receive(bs);
1091                 }
1092         }
1093
1094         ipc_port_release_send(device);
1095         return FALSE;
1096 }
1097 #endif /* DEVICE_PAGING */
1098
1099 #if     VS_ASYNC_REUSE
1100
1101 struct vs_async *
1102 vs_alloc_async(void)
1103 {
1104         struct vs_async *vsa;
1105         MACH_PORT_FACE  reply_port;
1106 //      kern_return_t   kr;
1107
1108         VS_ASYNC_LOCK();
1109         if (vs_async_free_list == NULL) {
1110                 VS_ASYNC_UNLOCK();
1111                 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1112                 if (vsa != NULL) {
1113                         /*
1114                          * Try allocating a reply port named after the
1115                          * address of the vs_async structure.
1116                          */
1117                         struct vstruct_alias    *alias_struct;
1118
1119                         reply_port = ipc_port_alloc_kernel();
1120                         alias_struct = (struct vstruct_alias *)
1121                                 kalloc(sizeof (struct vstruct_alias));
1122                         if(alias_struct != NULL) {
1123                                 alias_struct->vs = (struct vstruct *)vsa;
1124                                 alias_struct->name = &default_pager_ops;
1125                                 reply_port->alias = (int) alias_struct;
1126                                 vsa->reply_port = reply_port;
1127                                 vs_alloc_async_count++;
1128                         }
1129                         else {
1130                                 vs_alloc_async_failed++;
1131                                 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1132                                                                 (reply_port));
1133                                 kfree(vsa, sizeof (struct vs_async));
1134                                 vsa = NULL;
1135                         }
1136                 }
1137         } else {
1138                 vsa = vs_async_free_list;
1139                 vs_async_free_list = vs_async_free_list->vsa_next;
1140                 VS_ASYNC_UNLOCK();
1141         }
1142
1143         return vsa;
1144 }
1145
1146 void
1147 vs_free_async(
1148         struct vs_async *vsa)
1149 {
1150         VS_ASYNC_LOCK();
1151         vsa->vsa_next = vs_async_free_list;
1152         vs_async_free_list = vsa;
1153         VS_ASYNC_UNLOCK();
1154 }
1155
1156 #else   /* VS_ASYNC_REUSE */
1157
1158 struct vs_async *
1159 vs_alloc_async(void)
1160 {
1161         struct vs_async *vsa;
1162         MACH_PORT_FACE  reply_port;
1163         kern_return_t   kr;
1164
1165         vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1166         if (vsa != NULL) {
1167                 /*
1168                  * Try allocating a reply port named after the
1169                  * address of the vs_async structure.
1170                  */
1171                         reply_port = ipc_port_alloc_kernel();
1172                         alias_struct = (vstruct_alias *)
1173                                 kalloc(sizeof (struct vstruct_alias));
1174                         if(alias_struct != NULL) {
1175                                 alias_struct->vs = reply_port;
1176                                 alias_struct->name = &default_pager_ops;
1177                                 reply_port->alias = (int) vsa;
1178                                 vsa->reply_port = reply_port;
1179                                 vs_alloc_async_count++;
1180                         }
1181                         else {
1182                                 vs_alloc_async_failed++;
1183                                 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1184                                                                 (reply_port));
1185                                 kfree(vsa, sizeof (struct vs_async));
1186                                 vsa = NULL;
1187                         }
1188         }
1189
1190         return vsa;
1191 }
1192
1193 void
1194 vs_free_async(
1195         struct vs_async *vsa)
1196 {
1197         MACH_PORT_FACE  reply_port;
1198         kern_return_t   kr;
1199
1200         reply_port = vsa->reply_port;
1201         kfree(reply_port->alias, sizeof (struct vstuct_alias));
1202         kfree(vsa, sizeof (struct vs_async));
1203         ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1204 #if 0
1205         VS_ASYNC_LOCK();
1206         vs_alloc_async_count--;
1207         VS_ASYNC_UNLOCK();
1208 #endif
1209 }
1210
1211 #endif  /* VS_ASYNC_REUSE */
1212
1213 zone_t  vstruct_zone;
1214
1215 vstruct_t
1216 ps_vstruct_create(
1217         vm_size_t size)
1218 {
1219         vstruct_t       vs;
1220         unsigned int    i;
1221
1222         vs = (vstruct_t) zalloc(vstruct_zone);
1223         if (vs == VSTRUCT_NULL) {
1224                 return VSTRUCT_NULL;
1225         }
1226
1227         VS_LOCK_INIT(vs);
1228
1229         /*
1230          * The following fields will be provided later.
1231          */
1232         vs->vs_pager_ops = NULL;
1233         vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1234         vs->vs_references = 1;
1235         vs->vs_seqno = 0;
1236
1237 #ifdef MACH_KERNEL
1238         vs->vs_waiting_seqno = FALSE;
1239         vs->vs_waiting_read = FALSE;
1240         vs->vs_waiting_write = FALSE;
1241         vs->vs_waiting_async = FALSE;
1242 #else
1243         mutex_init(&vs->vs_waiting_seqno, 0);
1244         mutex_init(&vs->vs_waiting_read, 0);
1245         mutex_init(&vs->vs_waiting_write, 0);
1246         mutex_init(&vs->vs_waiting_refs, 0);
1247         mutex_init(&vs->vs_waiting_async, 0);
1248 #endif
1249
1250         vs->vs_readers = 0;
1251         vs->vs_writers = 0;
1252
1253         vs->vs_errors = 0;
1254
1255         vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1256         vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1257         vs->vs_async_pending = 0;
1258
1259         /*
1260          * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1261          * depending on the size of the memory object.
1262          */
1263         if (INDIRECT_CLMAP(vs->vs_size)) {
1264                 vs->vs_imap = (struct vs_map **)
1265                         kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1266                 vs->vs_indirect = TRUE;
1267         } else {
1268                 vs->vs_dmap = (struct vs_map *)
1269                         kalloc(CLMAP_SIZE(vs->vs_size));
1270                 vs->vs_indirect = FALSE;
1271         }
1272         vs->vs_xfer_pending = FALSE;
1273         DP_DEBUG(DEBUG_VS_INTERNAL,
1274                  ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1275
1276         /*
1277          * Check to see that we got the space.
1278          */
1279         if (!vs->vs_dmap) {
1280                 kfree(vs, sizeof *vs);
1281                 return VSTRUCT_NULL;
1282         }
1283
1284         /*
1285          * Zero the indirect pointers, or clear the direct pointers.
1286          */
1287         if (vs->vs_indirect)
1288                 memset(vs->vs_imap, 0,
1289                        INDIRECT_CLMAP_SIZE(vs->vs_size));
1290         else
1291                 for (i = 0; i < vs->vs_size; i++)
1292                         VSM_CLR(vs->vs_dmap[i]);
1293
1294         VS_MAP_LOCK_INIT(vs);
1295
1296         bs_commit(vs->vs_size);
1297
1298         return vs;
1299 }
1300
1301 paging_segment_t ps_select_segment(unsigned int, int *);        /* forward */
1302
1303 paging_segment_t
1304 ps_select_segment(
1305         unsigned int    shift,
1306         int             *psindex)
1307 {
1308         paging_segment_t        ps;
1309         int                     i;
1310         int                     j;
1311
1312         /*
1313          * Optimize case where there's only one segment.
1314          * paging_segment_max will index the one and only segment.
1315          */
1316
1317         PSL_LOCK();
1318         if (paging_segment_count == 1) {
1319                 paging_segment_t lps;   /* used to avoid extra PS_UNLOCK */
1320                 ipc_port_t trigger = IP_NULL;
1321
1322                 ps = paging_segments[paging_segment_max];
1323                 *psindex = paging_segment_max;
1324                 PS_LOCK(ps);
1325                 if (ps->ps_going_away) {
1326                         /* this segment is being turned off */
1327                         lps = PAGING_SEGMENT_NULL;
1328                 } else {
1329                         ASSERT(ps->ps_clshift >= shift);
1330                         if (ps->ps_clcount) {
1331                                 ps->ps_clcount--;
1332                                 dp_pages_free -=  1 << ps->ps_clshift;
1333                                 if(min_pages_trigger_port &&
1334                                   (dp_pages_free < minimum_pages_remaining)) {
1335                                         trigger = min_pages_trigger_port;
1336                                         min_pages_trigger_port = NULL;
1337                                         bs_low = TRUE;
1338                                 }
1339                                 lps = ps;
1340                         } else
1341                                 lps = PAGING_SEGMENT_NULL;
1342                 }
1343                 PS_UNLOCK(ps);
1344                 PSL_UNLOCK();
1345
1346                 if (trigger != IP_NULL) {
1347                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1348                         ipc_port_release_send(trigger);
1349                 }
1350                 return lps;
1351         }
1352
1353         if (paging_segment_count == 0) {
1354                 PSL_UNLOCK();
1355                 return PAGING_SEGMENT_NULL;
1356         }
1357
1358         for (i = BS_MAXPRI;
1359              i >= BS_MINPRI; i--) {
1360                 int start_index;
1361
1362                 if ((ps_select_array[i] == BS_NOPRI) ||
1363                                 (ps_select_array[i] == BS_FULLPRI))
1364                         continue;
1365                 start_index = ps_select_array[i];
1366
1367                 if(!(paging_segments[start_index])) {
1368                         j = start_index+1;
1369                         physical_transfer_cluster_count = 0;
1370                 }
1371                 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1372                                 (((paging_segments[start_index])->ps_clshift)
1373                                 + vm_page_shift))) {
1374                         physical_transfer_cluster_count = 0;
1375                         j = start_index + 1;
1376                 } else {
1377                         physical_transfer_cluster_count+=1;
1378                         j = start_index;
1379                         if(start_index == 0)
1380                                 start_index = paging_segment_max;
1381                         else
1382                                 start_index = start_index - 1;
1383                 }
1384
1385                 while (1) {
1386                         if (j > paging_segment_max)
1387                                 j = 0;
1388                         if ((ps = paging_segments[j]) &&
1389                             (ps->ps_bs->bs_priority == i)) {
1390                                 /*
1391                                  * Force the ps cluster size to be
1392                                  * >= that of the vstruct.
1393                                  */
1394                                 PS_LOCK(ps);
1395                                 if (ps->ps_going_away) {
1396                                         /* this segment is being turned off */
1397                                 } else if ((ps->ps_clcount) &&
1398                                            (ps->ps_clshift >= shift)) {
1399                                         ipc_port_t trigger = IP_NULL;
1400
1401                                         ps->ps_clcount--;
1402                                         dp_pages_free -=  1 << ps->ps_clshift;
1403                                         if(min_pages_trigger_port &&
1404                                                 (dp_pages_free <
1405                                                 minimum_pages_remaining)) {
1406                                                 trigger = min_pages_trigger_port;
1407                                                 min_pages_trigger_port = NULL;
1408                                         }
1409                                         PS_UNLOCK(ps);
1410                                         /*
1411                                          * found one, quit looking.
1412                                          */
1413                                         ps_select_array[i] = j;
1414                                         PSL_UNLOCK();
1415
1416                                         if (trigger != IP_NULL) {
1417                                                 default_pager_space_alert(
1418                                                         trigger,
1419                                                         HI_WAT_ALERT);
1420                                                 ipc_port_release_send(trigger);
1421                                         }
1422                                         *psindex = j;
1423                                         return ps;
1424                                 }
1425                                 PS_UNLOCK(ps);
1426                         }
1427                         if (j == start_index) {
1428                                 /*
1429                                  * none at this priority -- mark it full
1430                                  */
1431                                 ps_select_array[i] = BS_FULLPRI;
1432                                 break;
1433                         }
1434                         j++;
1435                 }
1436         }
1437         PSL_UNLOCK();
1438         return PAGING_SEGMENT_NULL;
1439 }
1440
1441 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1442
1443 vm_offset_t
1444 ps_allocate_cluster(
1445         vstruct_t               vs,
1446         int                     *psindex,
1447         paging_segment_t        use_ps)
1448 {
1449         unsigned int            byte_num;
1450         int                     bit_num = 0;
1451         paging_segment_t        ps;
1452         vm_offset_t             cluster;
1453         ipc_port_t              trigger = IP_NULL;
1454
1455         /*
1456          * Find best paging segment.
1457          * ps_select_segment will decrement cluster count on ps.
1458          * Must pass cluster shift to find the most appropriate segment.
1459          */
1460         /* NOTE:  The addition of paging segment delete capability threatened
1461          * to seriously complicate the treatment of paging segments in this
1462          * module and the ones that call it (notably ps_clmap), because of the
1463          * difficulty in assuring that the paging segment would continue to
1464          * exist between being unlocked and locked.   This was
1465          * avoided because all calls to this module are based in either
1466          * dp_memory_object calls which rely on the vs lock, or by
1467          * the transfer function which is part of the segment delete path.
1468          * The transfer function which is part of paging segment delete is
1469          * protected from multiple callers by the backing store lock.
1470          * The paging segment delete function treats mappings to a paging
1471          * segment on a vstruct by vstruct basis, locking the vstruct targeted
1472          * while data is transferred to the remaining segments.  This is in
1473          * line with the view that incomplete or in-transition mappings between
1474          * data, a vstruct, and backing store are protected by the vs lock.
1475          * This and the ordering of the paging segment "going_away" bit setting
1476          * protects us.
1477          */
1478         if (use_ps != PAGING_SEGMENT_NULL) {
1479                 ps = use_ps;
1480                 PSL_LOCK();
1481                 PS_LOCK(ps);
1482
1483                 ASSERT(ps->ps_clcount != 0);
1484
1485                 ps->ps_clcount--;
1486                 dp_pages_free -=  1 << ps->ps_clshift;
1487                 if(min_pages_trigger_port &&
1488                                 (dp_pages_free < minimum_pages_remaining)) {
1489                         trigger = min_pages_trigger_port;
1490                         min_pages_trigger_port = NULL;
1491                 }
1492                 PSL_UNLOCK();
1493                 PS_UNLOCK(ps);
1494                 if (trigger != IP_NULL) {
1495                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1496                         ipc_port_release_send(trigger);
1497                 }
1498
1499         } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1500                    PAGING_SEGMENT_NULL) {
1501                 static uint32_t lastnotify = 0;
1502                 uint32_t now, nanoseconds_dummy;
1503
1504                 /*
1505                  * Emit a notification of the low-paging resource condition
1506                  * but don't issue it more than once every five seconds.  This
1507                  * prevents us from overflowing logs with thousands of
1508                  * repetitions of the message.
1509                  */
1510                 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1511                 if (now > lastnotify + 5) {
1512                         dprintf(("no space in available paging segments\n"));
1513                         lastnotify = now;
1514                 }
1515
1516                 /* the count got off maybe, reset to zero */
1517                 PSL_LOCK();
1518                 dp_pages_free = 0;
1519                 if(min_pages_trigger_port) {
1520                         trigger = min_pages_trigger_port;
1521                         min_pages_trigger_port = NULL;
1522                         bs_low = TRUE;
1523                 }
1524                 PSL_UNLOCK();
1525                 if (trigger != IP_NULL) {
1526                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1527                         ipc_port_release_send(trigger);
1528                 }
1529                 return (vm_offset_t) -1;
1530         }
1531
1532         /*
1533          * Look for an available cluster.  At the end of the loop,
1534          * byte_num is the byte offset and bit_num is the bit offset of the
1535          * first zero bit in the paging segment bitmap.
1536          */
1537         PS_LOCK(ps);
1538         byte_num = ps->ps_hint;
1539         for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1540                 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1541                         for (bit_num = 0; bit_num < NBBY; bit_num++) {
1542                                 if (isclr((ps->ps_bmap + byte_num), bit_num))
1543                                         break;
1544                         }
1545                         ASSERT(bit_num != NBBY);
1546                         break;
1547                 }
1548         }
1549         ps->ps_hint = byte_num;
1550         cluster = (byte_num*NBBY) + bit_num;
1551
1552         /* Space was reserved, so this must be true */
1553         ASSERT(cluster < ps->ps_ncls);
1554
1555         setbit(ps->ps_bmap, cluster);
1556         PS_UNLOCK(ps);
1557
1558         return cluster;
1559 }
1560
1561 void ps_deallocate_cluster(paging_segment_t, vm_offset_t);      /* forward */
1562
1563 void
1564 ps_deallocate_cluster(
1565         paging_segment_t        ps,
1566         vm_offset_t             cluster)
1567 {
1568
1569         if (cluster >= (vm_offset_t) ps->ps_ncls)
1570                 panic("ps_deallocate_cluster: Invalid cluster number");
1571
1572         /*
1573          * Lock the paging segment, clear the cluster's bitmap and increment the
1574          * number of free cluster.
1575          */
1576         PSL_LOCK();
1577         PS_LOCK(ps);
1578         clrbit(ps->ps_bmap, cluster);
1579         ++ps->ps_clcount;
1580         dp_pages_free +=  1 << ps->ps_clshift;
1581         PSL_UNLOCK();
1582
1583         /*
1584          * Move the hint down to the freed cluster if it is
1585          * less than the current hint.
1586          */
1587         if ((cluster/NBBY) < ps->ps_hint) {
1588                 ps->ps_hint = (cluster/NBBY);
1589         }
1590
1591         PS_UNLOCK(ps);
1592
1593         /*
1594          * If we're freeing space on a full priority, reset the array.
1595          */
1596         PSL_LOCK();
1597         if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1598                 ps_select_array[ps->ps_bs->bs_priority] = 0;
1599         PSL_UNLOCK();
1600
1601         return;
1602 }
1603
1604 void ps_dealloc_vsmap(struct vs_map *, vm_size_t);      /* forward */
1605
1606 void
1607 ps_dealloc_vsmap(
1608         struct vs_map   *vsmap,
1609         vm_size_t       size)
1610 {
1611         unsigned int i;
1612         for (i = 0; i < size; i++)
1613                 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1614                         ps_deallocate_cluster(VSM_PS(vsmap[i]),
1615                                               VSM_CLOFF(vsmap[i]));
1616 }
1617
1618 void
1619 ps_vstruct_dealloc(
1620         vstruct_t vs)
1621 {
1622         unsigned int    i;
1623 //      spl_t   s;
1624
1625         VS_MAP_LOCK(vs);
1626
1627         /*
1628          * If this is an indirect structure, then we walk through the valid
1629          * (non-zero) indirect pointers and deallocate the clusters
1630          * associated with each used map entry (via ps_dealloc_vsmap).
1631          * When all of the clusters in an indirect block have been
1632          * freed, we deallocate the block.  When all of the indirect
1633          * blocks have been deallocated we deallocate the memory
1634          * holding the indirect pointers.
1635          */
1636         if (vs->vs_indirect) {
1637                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1638                         if (vs->vs_imap[i] != NULL) {
1639                                 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1640                                 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1641                         }
1642                 }
1643                 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1644         } else {
1645                 /*
1646                  * Direct map.  Free used clusters, then memory.
1647                  */
1648                 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1649                 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1650         }
1651         VS_MAP_UNLOCK(vs);
1652
1653         bs_commit(- vs->vs_size);
1654
1655         zfree(vstruct_zone, vs);
1656 }
1657
1658 int ps_map_extend(vstruct_t, unsigned int);     /* forward */
1659
1660 int ps_map_extend(
1661         vstruct_t       vs,
1662         unsigned int    new_size)
1663 {
1664         struct vs_map   **new_imap;
1665         struct vs_map   *new_dmap = NULL;
1666         int             newdsize;
1667         int             i;
1668         void            *old_map = NULL;
1669         int             old_map_size = 0;
1670
1671         if (vs->vs_size >= new_size) {
1672                 /*
1673                  * Someone has already done the work.
1674                  */
1675                 return 0;
1676         }
1677
1678         /*
1679          * If the new size extends into the indirect range, then we have one
1680          * of two cases: we are going from indirect to indirect, or we are
1681          * going from direct to indirect.  If we are going from indirect to
1682          * indirect, then it is possible that the new size will fit in the old
1683          * indirect map.  If this is the case, then just reset the size of the
1684          * vstruct map and we are done.  If the new size will not
1685          * fit into the old indirect map, then we have to allocate a new
1686          * indirect map and copy the old map pointers into this new map.
1687          *
1688          * If we are going from direct to indirect, then we have to allocate a
1689          * new indirect map and copy the old direct pages into the first
1690          * indirect page of the new map.
1691          * NOTE: allocating memory here is dangerous, as we're in the
1692          * pageout path.
1693          */
1694         if (INDIRECT_CLMAP(new_size)) {
1695                 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1696
1697                 /*
1698                  * Get a new indirect map and zero it.
1699                  */
1700                 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1701                 if (vs->vs_indirect &&
1702                     (new_map_size == old_map_size)) {
1703                         bs_commit(new_size - vs->vs_size);
1704                         vs->vs_size = new_size;
1705                         return 0;
1706                 }
1707
1708                 new_imap = (struct vs_map **)kalloc(new_map_size);
1709                 if (new_imap == NULL) {
1710                         return -1;
1711                 }
1712                 memset(new_imap, 0, new_map_size);
1713
1714                 if (vs->vs_indirect) {
1715                         /* Copy old entries into new map */
1716                         memcpy(new_imap, vs->vs_imap, old_map_size);
1717                         /* Arrange to free the old map */
1718                         old_map = (void *) vs->vs_imap;
1719                         newdsize = 0;
1720                 } else {        /* Old map was a direct map */
1721                         /* Allocate an indirect page */
1722                         if ((new_imap[0] = (struct vs_map *)
1723                              kalloc(CLMAP_THRESHOLD)) == NULL) {
1724                                 kfree(new_imap, new_map_size);
1725                                 return -1;
1726                         }
1727                         new_dmap = new_imap[0];
1728                         newdsize = CLMAP_ENTRIES;
1729                 }
1730         } else {
1731                 new_imap = NULL;
1732                 newdsize = new_size;
1733                 /*
1734                  * If the new map is a direct map, then the old map must
1735                  * also have been a direct map.  All we have to do is
1736                  * to allocate a new direct map, copy the old entries
1737                  * into it and free the old map.
1738                  */
1739                 if ((new_dmap = (struct vs_map *)
1740                      kalloc(CLMAP_SIZE(new_size))) == NULL) {
1741                         return -1;
1742                 }
1743         }
1744         if (newdsize) {
1745
1746                 /* Free the old map */
1747                 old_map = (void *) vs->vs_dmap;
1748                 old_map_size = CLMAP_SIZE(vs->vs_size);
1749
1750                 /* Copy info from the old map into the new map */
1751                 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1752
1753                 /* Initialize the rest of the new map */
1754                 for (i = vs->vs_size; i < newdsize; i++)
1755                         VSM_CLR(new_dmap[i]);
1756         }
1757         if (new_imap) {
1758                 vs->vs_imap = new_imap;
1759                 vs->vs_indirect = TRUE;
1760         } else
1761                 vs->vs_dmap = new_dmap;
1762         bs_commit(new_size - vs->vs_size);
1763         vs->vs_size = new_size;
1764         if (old_map)
1765                 kfree(old_map, old_map_size);
1766         return 0;
1767 }
1768
1769 vm_offset_t
1770 ps_clmap(
1771         vstruct_t       vs,
1772         vm_offset_t     offset,
1773         struct clmap    *clmap,
1774         int             flag,
1775         vm_size_t       size,
1776         int             error)
1777 {
1778         vm_offset_t     cluster;        /* The cluster of offset.       */
1779         vm_offset_t     newcl;          /* The new cluster allocated.   */
1780         vm_offset_t     newoff;
1781         unsigned int    i;
1782         struct vs_map   *vsmap;
1783
1784         VS_MAP_LOCK(vs);
1785
1786         ASSERT(vs->vs_dmap);
1787         cluster = atop_32(offset) >> vs->vs_clshift;
1788
1789         /*
1790          * Initialize cluster error value
1791          */
1792         clmap->cl_error = 0;
1793
1794         /*
1795          * If the object has grown, extend the page map.
1796          */
1797         if (cluster >= vs->vs_size) {
1798                 if (flag == CL_FIND) {
1799                         /* Do not allocate if just doing a lookup */
1800                         VS_MAP_UNLOCK(vs);
1801                         return (vm_offset_t) -1;
1802                 }
1803                 if (ps_map_extend(vs, cluster + 1)) {
1804                         VS_MAP_UNLOCK(vs);
1805                         return (vm_offset_t) -1;
1806                 }
1807         }
1808
1809         /*
1810          * Look for the desired cluster.  If the map is indirect, then we
1811          * have a two level lookup.  First find the indirect block, then
1812          * find the actual cluster.  If the indirect block has not yet
1813          * been allocated, then do so.  If the cluster has not yet been
1814          * allocated, then do so.
1815          *
1816          * If any of the allocations fail, then return an error.
1817          * Don't allocate if just doing a lookup.
1818          */
1819         if (vs->vs_indirect) {
1820                 long    ind_block = cluster/CLMAP_ENTRIES;
1821
1822                 /* Is the indirect block allocated? */
1823                 vsmap = vs->vs_imap[ind_block];
1824                 if (vsmap == NULL) {
1825                         if (flag == CL_FIND) {
1826                                 VS_MAP_UNLOCK(vs);
1827                                 return (vm_offset_t) -1;
1828                         }
1829
1830                         /* Allocate the indirect block */
1831                         vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1832                         if (vsmap == NULL) {
1833                                 VS_MAP_UNLOCK(vs);
1834                                 return (vm_offset_t) -1;
1835                         }
1836                         /* Initialize the cluster offsets */
1837                         for (i = 0; i < CLMAP_ENTRIES; i++)
1838                                 VSM_CLR(vsmap[i]);
1839                         vs->vs_imap[ind_block] = vsmap;
1840                 }
1841         } else
1842                 vsmap = vs->vs_dmap;
1843
1844         ASSERT(vsmap);
1845         vsmap += cluster%CLMAP_ENTRIES;
1846
1847         /*
1848          * At this point, vsmap points to the struct vs_map desired.
1849          *
1850          * Look in the map for the cluster, if there was an error on a
1851          * previous write, flag it and return.  If it is not yet
1852          * allocated, then allocate it, if we're writing; if we're
1853          * doing a lookup and the cluster's not allocated, return error.
1854          */
1855         if (VSM_ISERR(*vsmap)) {
1856                 clmap->cl_error = VSM_GETERR(*vsmap);
1857                 VS_MAP_UNLOCK(vs);
1858                 return (vm_offset_t) -1;
1859         } else if (VSM_ISCLR(*vsmap)) {
1860                 int psindex;
1861
1862                 if (flag == CL_FIND) {
1863                         /*
1864                          * If there's an error and the entry is clear, then
1865                          * we've run out of swap space.  Record the error
1866                          * here and return.
1867                          */
1868                         if (error) {
1869                                 VSM_SETERR(*vsmap, error);
1870                         }
1871                         VS_MAP_UNLOCK(vs);
1872                         return (vm_offset_t) -1;
1873                 } else {
1874                         /*
1875                          * Attempt to allocate a cluster from the paging segment
1876                          */
1877                         newcl = ps_allocate_cluster(vs, &psindex,
1878                                                     PAGING_SEGMENT_NULL);
1879                         if (newcl == (vm_offset_t) -1) {
1880                                 VS_MAP_UNLOCK(vs);
1881                                 return (vm_offset_t) -1;
1882                         }
1883                         VSM_CLR(*vsmap);
1884                         VSM_SETCLOFF(*vsmap, newcl);
1885                         VSM_SETPS(*vsmap, psindex);
1886                 }
1887         } else
1888                 newcl = VSM_CLOFF(*vsmap);
1889
1890         /*
1891          * Fill in pertinent fields of the clmap
1892          */
1893         clmap->cl_ps = VSM_PS(*vsmap);
1894         clmap->cl_numpages = VSCLSIZE(vs);
1895         clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1896
1897         /*
1898          * Byte offset in paging segment is byte offset to cluster plus
1899          * byte offset within cluster.  It looks ugly, but should be
1900          * relatively quick.
1901          */
1902         ASSERT(trunc_page(offset) == offset);
1903         newcl = ptoa_32(newcl) << vs->vs_clshift;
1904         newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1905         if (flag == CL_ALLOC) {
1906                 /*
1907                  * set bits in the allocation bitmap according to which
1908                  * pages were requested.  size is in bytes.
1909                  */
1910                 i = atop_32(newoff);
1911                 while ((size > 0) && (i < VSCLSIZE(vs))) {
1912                         VSM_SETALLOC(*vsmap, i);
1913                         i++;
1914                         size -= vm_page_size;
1915                 }
1916         }
1917         clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1918         if (newoff) {
1919                 /*
1920                  * Offset is not cluster aligned, so number of pages
1921                  * and bitmaps must be adjusted
1922                  */
1923                 clmap->cl_numpages -= atop_32(newoff);
1924                 CLMAP_SHIFT(clmap, vs);
1925                 CLMAP_SHIFTALLOC(clmap, vs);
1926         }
1927
1928         /*
1929          *
1930          * The setting of valid bits and handling of write errors
1931          * must be done here, while we hold the lock on the map.
1932          * It logically should be done in ps_vs_write_complete().
1933          * The size and error information has been passed from
1934          * ps_vs_write_complete().  If the size parameter is non-zero,
1935          * then there is work to be done.  If error is also non-zero,
1936          * then the error number is recorded in the cluster and the
1937          * entire cluster is in error.
1938          */
1939         if (size && flag == CL_FIND) {
1940                 vm_offset_t off = (vm_offset_t) 0;
1941
1942                 if (!error) {
1943                         for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1944                              i++) {
1945                                 VSM_SETPG(*vsmap, i);
1946                                 size -= vm_page_size;
1947                         }
1948                         ASSERT(i <= VSCLSIZE(vs));
1949                 } else {
1950                         BS_STAT(clmap->cl_ps->ps_bs,
1951                                 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1952                                         atop_32(size));
1953                         off = VSM_CLOFF(*vsmap);
1954                         VSM_SETERR(*vsmap, error);
1955                 }
1956                 /*
1957                  * Deallocate cluster if error, and no valid pages
1958                  * already present.
1959                  */
1960                 if (off != (vm_offset_t) 0)
1961                         ps_deallocate_cluster(clmap->cl_ps, off);
1962                 VS_MAP_UNLOCK(vs);
1963                 return (vm_offset_t) 0;
1964         } else
1965                 VS_MAP_UNLOCK(vs);
1966
1967         DP_DEBUG(DEBUG_VS_INTERNAL,
1968                  ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1969                   newcl+newoff, (int) vs, (int) vsmap, flag));
1970         DP_DEBUG(DEBUG_VS_INTERNAL,
1971                  ("     clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1972                   (int) clmap->cl_ps, clmap->cl_numpages,
1973                   (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1974
1975         return (newcl + newoff);
1976 }
1977
1978 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t);     /* forward */
1979
1980 void
1981 ps_clunmap(
1982         vstruct_t       vs,
1983         vm_offset_t     offset,
1984         vm_size_t       length)
1985 {
1986         vm_offset_t             cluster; /* The cluster number of offset */
1987         struct vs_map           *vsmap;
1988
1989         VS_MAP_LOCK(vs);
1990
1991         /*
1992          * Loop through all clusters in this range, freeing paging segment
1993          * clusters and map entries as encountered.
1994          */
1995         while (length > 0) {
1996                 vm_offset_t     newoff;
1997                 unsigned int    i;
1998
1999                 cluster = atop_32(offset) >> vs->vs_clshift;
2000                 if (vs->vs_indirect)    /* indirect map */
2001                         vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2002                 else
2003                         vsmap = vs->vs_dmap;
2004                 if (vsmap == NULL) {
2005                         VS_MAP_UNLOCK(vs);
2006                         return;
2007                 }
2008                 vsmap += cluster%CLMAP_ENTRIES;
2009                 if (VSM_ISCLR(*vsmap)) {
2010                         length -= vm_page_size;
2011                         offset += vm_page_size;
2012                         continue;
2013                 }
2014                 /*
2015                  * We've got a valid mapping.  Clear it and deallocate
2016                  * paging segment cluster pages.
2017                  * Optimize for entire cluster cleraing.
2018                  */
2019                 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2020                         /*
2021                          * Not cluster aligned.
2022                          */
2023                         ASSERT(trunc_page(newoff) == newoff);
2024                         i = atop_32(newoff);
2025                 } else
2026                         i = 0;
2027                 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2028                         VSM_CLRPG(*vsmap, i);
2029                         VSM_CLRALLOC(*vsmap, i);
2030                         length -= vm_page_size;
2031                         offset += vm_page_size;
2032                         i++;
2033                 }
2034
2035                 /*
2036                  * If map entry is empty, clear and deallocate cluster.
2037                  */
2038                 if (!VSM_ALLOC(*vsmap)) {
2039                         ps_deallocate_cluster(VSM_PS(*vsmap),
2040                                               VSM_CLOFF(*vsmap));
2041                         VSM_CLR(*vsmap);
2042                 }
2043         }
2044
2045         VS_MAP_UNLOCK(vs);
2046 }
2047
2048 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2049
2050 void
2051 ps_vs_write_complete(
2052         vstruct_t       vs,
2053         vm_offset_t     offset,
2054         vm_size_t       size,
2055         int             error)
2056 {
2057         struct clmap    clmap;
2058
2059         /*
2060          * Get the struct vsmap for this cluster.
2061          * Use READ, even though it was written, because the
2062          * cluster MUST be present, unless there was an error
2063          * in the original ps_clmap (e.g. no space), in which
2064          * case, nothing happens.
2065          *
2066          * Must pass enough information to ps_clmap to allow it
2067          * to set the vs_map structure bitmap under lock.
2068          */
2069         (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2070 }
2071
2072 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int);    /* forward */
2073
2074 void
2075 vs_cl_write_complete(
2076         vstruct_t                                       vs,
2077         __unused paging_segment_t       ps,
2078         vm_offset_t                                     offset,
2079         __unused vm_offset_t            addr,
2080         vm_size_t                                       size,
2081         boolean_t                                       async,
2082         int                                                     error)
2083 {
2084 //      kern_return_t   kr;
2085
2086         if (error) {
2087                 /*
2088                  * For internal objects, the error is recorded on a
2089                  * per-cluster basis by ps_clmap() which is called
2090                  * by ps_vs_write_complete() below.
2091                  */
2092                 dprintf(("write failed error = 0x%x\n", error));
2093                 /* add upl_abort code here */
2094         } else
2095                 GSTAT(global_stats.gs_pages_out += atop_32(size));
2096         /*
2097          * Notify the vstruct mapping code, so it can do its accounting.
2098          */
2099         ps_vs_write_complete(vs, offset, size, error);
2100
2101         if (async) {
2102                 VS_LOCK(vs);
2103                 ASSERT(vs->vs_async_pending > 0);
2104                 vs->vs_async_pending -= size;
2105                 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2106                         vs->vs_waiting_async = FALSE;
2107                         VS_UNLOCK(vs);
2108                         /* mutex_unlock(&vs->vs_waiting_async); */
2109                         thread_wakeup(&vs->vs_async_pending);
2110                 } else {
2111                         VS_UNLOCK(vs);
2112                 }
2113         }
2114 }
2115
2116 #ifdef DEVICE_PAGING
2117 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2118
2119 kern_return_t
2120 device_write_reply(
2121         MACH_PORT_FACE  reply_port,
2122         kern_return_t   device_code,
2123         io_buf_len_t    bytes_written)
2124 {
2125         struct vs_async *vsa;
2126
2127         vsa = (struct vs_async *)
2128                 ((struct vstruct_alias *)(reply_port->alias))->vs;
2129
2130         if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2131                 device_code = KERN_FAILURE;
2132         }
2133
2134         vsa->vsa_error = device_code;
2135
2136
2137         ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2138         if(vsa->vsa_flags & VSA_TRANSFER) {
2139                 /* revisit when async disk segments redone */
2140                 if(vsa->vsa_error) {
2141                    /* need to consider error condition.  re-write data or */
2142                    /* throw it away here. */
2143                    vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2144                 }
2145                 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2146                                                 vsa->vsa_size, vsa->vsa_error);
2147         } else {
2148                 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2149                              vsa->vsa_addr, vsa->vsa_size, TRUE,
2150                              vsa->vsa_error);
2151         }
2152         VS_FREE_ASYNC(vsa);
2153
2154         return KERN_SUCCESS;
2155 }
2156
2157 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2158 kern_return_t
2159 device_write_reply_inband(
2160         MACH_PORT_FACE          reply_port,
2161         kern_return_t           return_code,
2162         io_buf_len_t            bytes_written)
2163 {
2164         panic("device_write_reply_inband: illegal");
2165         return KERN_SUCCESS;
2166 }
2167
2168 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2169 kern_return_t
2170 device_read_reply(
2171         MACH_PORT_FACE          reply_port,
2172         kern_return_t           return_code,
2173         io_buf_ptr_t            data,
2174         mach_msg_type_number_t  dataCnt)
2175 {
2176         struct vs_async *vsa;
2177         vsa = (struct vs_async *)
2178                 ((struct vstruct_alias *)(reply_port->alias))->vs;
2179         vsa->vsa_addr = (vm_offset_t)data;
2180         vsa->vsa_size = (vm_size_t)dataCnt;
2181         vsa->vsa_error = return_code;
2182         thread_wakeup(&vsa->vsa_lock);
2183         return KERN_SUCCESS;
2184 }
2185
2186 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2187 kern_return_t
2188 device_read_reply_inband(
2189         MACH_PORT_FACE          reply_port,
2190         kern_return_t           return_code,
2191         io_buf_ptr_inband_t     data,
2192         mach_msg_type_number_t  dataCnt)
2193 {
2194         panic("device_read_reply_inband: illegal");
2195         return KERN_SUCCESS;
2196 }
2197
2198 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2199 kern_return_t
2200 device_read_reply_overwrite(
2201         MACH_PORT_FACE          reply_port,
2202         kern_return_t           return_code,
2203         io_buf_len_t            bytes_read)
2204 {
2205         panic("device_read_reply_overwrite: illegal\n");
2206         return KERN_SUCCESS;
2207 }
2208
2209 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2210 kern_return_t
2211 device_open_reply(
2212         MACH_PORT_FACE          reply_port,
2213         kern_return_t           return_code,
2214         MACH_PORT_FACE          device_port)
2215 {
2216         panic("device_open_reply: illegal\n");
2217         return KERN_SUCCESS;
2218 }
2219
2220 kern_return_t
2221 ps_read_device(
2222         paging_segment_t        ps,
2223         vm_offset_t             offset,
2224         vm_offset_t             *bufferp,
2225         unsigned int            size,
2226         unsigned int            *residualp,
2227         int                     flags)
2228 {
2229         kern_return_t   kr;
2230         recnum_t        dev_offset;
2231         unsigned int    bytes_wanted;
2232         unsigned int    bytes_read;
2233         unsigned int    total_read;
2234         vm_offset_t     dev_buffer;
2235         vm_offset_t     buf_ptr;
2236         unsigned int    records_read;
2237         struct vs_async *vsa;
2238         mutex_t vs_waiting_read_reply;
2239
2240         device_t        device;
2241         vm_map_copy_t   device_data = NULL;
2242         default_pager_thread_t *dpt = NULL;
2243
2244         device = dev_port_lookup(ps->ps_device);
2245         clustered_reads[atop_32(size)]++;
2246
2247         dev_offset = (ps->ps_offset +
2248                       (offset >> (vm_page_shift - ps->ps_record_shift)));
2249         bytes_wanted = size;
2250         total_read = 0;
2251         *bufferp = (vm_offset_t)NULL;
2252
2253         do {
2254                 vsa = VS_ALLOC_ASYNC();
2255                 if (vsa) {
2256                         vsa->vsa_vs = NULL;
2257                         vsa->vsa_addr = 0;
2258                         vsa->vsa_offset = 0;
2259                         vsa->vsa_size = 0;
2260                         vsa->vsa_ps = NULL;
2261                 }
2262                 mutex_init(&vsa->vsa_lock, 0);
2263                 ip_lock(vsa->reply_port);
2264                 vsa->reply_port->ip_sorights++;
2265                 ip_reference(vsa->reply_port);
2266                 ip_unlock(vsa->reply_port);
2267                 kr = ds_device_read_common(device,
2268                                  vsa->reply_port,
2269                                  (mach_msg_type_name_t)
2270                                         MACH_MSG_TYPE_MOVE_SEND_ONCE,
2271                                  (dev_mode_t) 0,
2272                                  dev_offset,
2273                                  bytes_wanted,
2274                                  (IO_READ | IO_CALL),
2275                                  (io_buf_ptr_t *) &dev_buffer,
2276                                  (mach_msg_type_number_t *) &bytes_read);
2277                 if(kr == MIG_NO_REPLY) {
2278                         assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2279                         thread_block(THREAD_CONTINUE_NULL);
2280
2281                         dev_buffer = vsa->vsa_addr;
2282                         bytes_read = (unsigned int)vsa->vsa_size;
2283                         kr = vsa->vsa_error;
2284                 }
2285                 VS_FREE_ASYNC(vsa);
2286                 if (kr != KERN_SUCCESS || bytes_read == 0) {
2287                         break;
2288                 }
2289                 total_read += bytes_read;
2290
2291                 /*
2292                  * If we got the entire range, use the returned dev_buffer.
2293                  */
2294                 if (bytes_read == size) {
2295                         *bufferp = (vm_offset_t)dev_buffer;
2296                         break;
2297                 }
2298
2299 #if 1
2300                 dprintf(("read only %d bytes out of %d\n",
2301                          bytes_read, bytes_wanted));
2302 #endif
2303                 if(dpt == NULL) {
2304                         dpt = get_read_buffer();
2305                         buf_ptr = dpt->dpt_buffer;
2306                         *bufferp = (vm_offset_t)buf_ptr;
2307                 }
2308                 /*
2309                  * Otherwise, copy the data into the provided buffer (*bufferp)
2310                  * and append the rest of the range as it comes in.
2311                  */
2312                 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2313                 buf_ptr += bytes_read;
2314                 bytes_wanted -= bytes_read;
2315                 records_read = (bytes_read >>
2316                                 (vm_page_shift - ps->ps_record_shift));
2317                 dev_offset += records_read;
2318                 DP_DEBUG(DEBUG_VS_INTERNAL,
2319                          ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2320                           dev_buffer, bytes_read));
2321                 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2322                     != KERN_SUCCESS)
2323                         Panic("dealloc buf");
2324         } while (bytes_wanted);
2325
2326         *residualp = size - total_read;
2327         if((dev_buffer != *bufferp) && (total_read != 0)) {
2328                 vm_offset_t temp_buffer;
2329                 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2330                 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2331                 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2332                         VM_MAP_COPYIN_OPT_SRC_DESTROY |
2333                         VM_MAP_COPYIN_OPT_STEAL_PAGES |
2334                         VM_MAP_COPYIN_OPT_PMAP_ENTER,
2335                         (vm_map_copy_t *)&device_data, FALSE))
2336                                 panic("ps_read_device: cannot copyin locally provided buffer\n");
2337         }
2338         else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2339                 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2340                         VM_MAP_COPYIN_OPT_SRC_DESTROY |
2341                         VM_MAP_COPYIN_OPT_STEAL_PAGES |
2342                         VM_MAP_COPYIN_OPT_PMAP_ENTER,
2343                         (vm_map_copy_t *)&device_data, FALSE))
2344                                 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2345         }
2346         else {
2347                 device_data = NULL;
2348         }
2349         *bufferp = (vm_offset_t)device_data;
2350
2351         if(dpt != NULL) {
2352                 /* Free the receive buffer */
2353                 dpt->checked_out = 0;
2354                 thread_wakeup(&dpt_array);
2355         }
2356         return KERN_SUCCESS;
2357 }
2358
2359 kern_return_t
2360 ps_write_device(
2361         paging_segment_t        ps,
2362         vm_offset_t             offset,
2363         vm_offset_t             addr,
2364         unsigned int            size,
2365         struct vs_async         *vsa)
2366 {
2367         recnum_t        dev_offset;
2368         io_buf_len_t    bytes_to_write, bytes_written;
2369         recnum_t        records_written;
2370         kern_return_t   kr;
2371         MACH_PORT_FACE  reply_port;
2372
2373
2374
2375         clustered_writes[atop_32(size)]++;
2376
2377         dev_offset = (ps->ps_offset +
2378                       (offset >> (vm_page_shift - ps->ps_record_shift)));
2379         bytes_to_write = size;
2380
2381         if (vsa) {
2382                 /*
2383                  * Asynchronous write.
2384                  */
2385                 reply_port = vsa->reply_port;
2386                 ip_lock(reply_port);
2387                 reply_port->ip_sorights++;
2388                 ip_reference(reply_port);
2389                 ip_unlock(reply_port);
2390                 {
2391                 device_t        device;
2392                 device = dev_port_lookup(ps->ps_device);
2393
2394                 vsa->vsa_addr = addr;
2395                 kr=ds_device_write_common(device,
2396                         reply_port,
2397                         (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2398                         (dev_mode_t) 0,
2399                         dev_offset,
2400                         (io_buf_ptr_t)  addr,
2401                         size,
2402                         (IO_WRITE | IO_CALL),
2403                         &bytes_written);
2404                 }
2405                 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2406                         if (verbose)
2407                                 dprintf(("%s0x%x, addr=0x%x,"
2408                                          "size=0x%x,offset=0x%x\n",
2409                                          "device_write_request returned ",
2410                                          kr, addr, size, offset));
2411                         BS_STAT(ps->ps_bs,
2412                                 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2413                         /* do the completion notification to free resources */
2414                         device_write_reply(reply_port, kr, 0);
2415                         return PAGER_ERROR;
2416                 }
2417         } else do {
2418                 /*
2419                  * Synchronous write.
2420                  */
2421                 {
2422                 device_t        device;
2423                 device = dev_port_lookup(ps->ps_device);
2424                 kr=ds_device_write_common(device,
2425                         IP_NULL, 0,
2426                         (dev_mode_t) 0,
2427                         dev_offset,
2428                         (io_buf_ptr_t)  addr,
2429                         size,
2430                         (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2431                         &bytes_written);
2432                 }
2433                 if (kr != KERN_SUCCESS) {
2434                         dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2435                                  "device_write returned ",
2436                                  kr, addr, size, offset));
2437                         BS_STAT(ps->ps_bs,
2438                                 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2439                         return PAGER_ERROR;
2440                 }
2441                 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2442                         Panic("fragmented write");
2443                 records_written = (bytes_written >>
2444                                    (vm_page_shift - ps->ps_record_shift));
2445                 dev_offset += records_written;
2446 #if 1
2447                 if (bytes_written != bytes_to_write) {
2448                         dprintf(("wrote only %d bytes out of %d\n",
2449                                  bytes_written, bytes_to_write));
2450                 }
2451 #endif
2452                 bytes_to_write -= bytes_written;
2453                 addr += bytes_written;
2454         } while (bytes_to_write > 0);
2455
2456         return PAGER_SUCCESS;
2457 }
2458
2459
2460 #else /* !DEVICE_PAGING */
2461
2462 kern_return_t
2463 ps_read_device(
2464         __unused paging_segment_t       ps,
2465         __unused vm_offset_t            offset,
2466         __unused vm_offset_t            *bufferp,
2467         __unused unsigned int           size,
2468         __unused unsigned int           *residualp,
2469         __unused int                            flags)
2470 {
2471   panic("ps_read_device not supported");
2472   return KERN_FAILURE;
2473 }
2474
2475 kern_return_t
2476 ps_write_device(
2477         __unused paging_segment_t       ps,
2478         __unused vm_offset_t            offset,
2479         __unused vm_offset_t            addr,
2480         __unused unsigned int           size,
2481         __unused struct vs_async        *vsa)
2482 {
2483   panic("ps_write_device not supported");
2484   return KERN_FAILURE;
2485 }
2486
2487 #endif /* DEVICE_PAGING */
2488 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t);      /* forward */
2489
2490 void
2491 pvs_object_data_provided(
2492         __unused vstruct_t              vs,
2493         __unused upl_t                  upl,
2494         __unused upl_offset_t   offset,
2495         upl_size_t                              size)
2496 {
2497
2498         DP_DEBUG(DEBUG_VS_INTERNAL,
2499                  ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2500                   upl, offset, size));
2501
2502         ASSERT(size > 0);
2503         GSTAT(global_stats.gs_pages_in += atop_32(size));
2504
2505
2506 #if     USE_PRECIOUS
2507         ps_clunmap(vs, offset, size);
2508 #endif  /* USE_PRECIOUS */
2509
2510 }
2511
2512 kern_return_t
2513 pvs_cluster_read(
2514         vstruct_t       vs,
2515         vm_offset_t     vs_offset,
2516         vm_size_t       cnt)
2517 {
2518         upl_t                   upl;
2519         kern_return_t           error = KERN_SUCCESS;
2520         int                                     size;
2521         unsigned int            residual;
2522         unsigned int            request_flags;
2523         int                                     seg_index;
2524         int                                     pages_in_cl;
2525         int                     cl_size;
2526         int                     cl_mask;
2527         int                                     cl_index;
2528         int                     xfer_size;
2529         vm_offset_t       ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2530         paging_segment_t        psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2531         struct clmap            clmap;
2532
2533         pages_in_cl = 1 << vs->vs_clshift;
2534         cl_size = pages_in_cl * vm_page_size;
2535         cl_mask = cl_size - 1;
2536
2537         /*
2538          * This loop will be executed multiple times until the entire
2539          * request has been satisfied... if the request spans cluster
2540          * boundaries, the clusters will be checked for logical continunity,
2541          * if contiguous the I/O request will span multiple clusters, otherwise
2542          * it will be broken up into the minimal set of I/O's
2543          *
2544          * If there are holes in a request (either unallocated pages in a paging
2545          * segment or an unallocated paging segment), we stop
2546          * reading at the hole, inform the VM of any data read, inform
2547          * the VM of an unavailable range, then loop again, hoping to
2548          * find valid pages later in the requested range.  This continues until
2549          * the entire range has been examined, and read, if present.
2550          */
2551
2552 #if     USE_PRECIOUS
2553         request_flags = UPL_NO_SYNC |  UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2554 #else
2555         request_flags = UPL_NO_SYNC |  UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2556 #endif
2557
2558         assert(dp_encryption_inited);
2559         if (dp_encryption) {
2560                 /*
2561                  * ENCRYPTED SWAP:
2562                  * request that the UPL be prepared for
2563                  * decryption.
2564                  */
2565                 request_flags |= UPL_ENCRYPT;
2566         }
2567
2568         while (cnt && (error == KERN_SUCCESS)) {
2569                 int             ps_info_valid;
2570                 unsigned int    page_list_count;
2571
2572                 if((vs_offset & cl_mask) &&
2573                         (cnt > (VM_SUPER_CLUSTER -
2574                                 (vs_offset & cl_mask)))) {
2575                         size = VM_SUPER_CLUSTER;
2576                         size -= vs_offset & cl_mask;
2577                 } else if (cnt > VM_SUPER_CLUSTER) {
2578                         size = VM_SUPER_CLUSTER;
2579                 } else {
2580                         size = cnt;
2581                 }
2582                 cnt -= size;
2583
2584                 ps_info_valid = 0;
2585                 seg_index     = 0;
2586
2587                 while (size > 0 && error == KERN_SUCCESS) {
2588                         int           abort_size;
2589                         int           failed_size;
2590                         int           beg_pseg;
2591                         int           beg_indx;
2592                         vm_offset_t   cur_offset;
2593
2594
2595                         if ( !ps_info_valid) {
2596                                 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2597                                 psp[seg_index]       = CLMAP_PS(clmap);
2598                                 ps_info_valid = 1;
2599                         }
2600                         /*
2601                          * skip over unallocated physical segments
2602                          */
2603                         if (ps_offset[seg_index] == (vm_offset_t) -1) {
2604                                 abort_size = cl_size - (vs_offset & cl_mask);
2605                                 abort_size = MIN(abort_size, size);
2606
2607                                 page_list_count = 0;
2608                                 memory_object_super_upl_request(
2609                                         vs->vs_control,
2610                                         (memory_object_offset_t)vs_offset,
2611                                         abort_size, abort_size,
2612                                         &upl, NULL, &page_list_count,
2613                                         request_flags);
2614
2615                                 if (clmap.cl_error) {
2616                                         upl_abort(upl, UPL_ABORT_ERROR);
2617                                 } else {
2618                                         upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2619                                 }
2620                                 upl_deallocate(upl);
2621
2622                                 size       -= abort_size;
2623                                 vs_offset  += abort_size;
2624
2625                                 seg_index++;
2626                                 ps_info_valid = 0;
2627                                 continue;
2628                         }
2629                         cl_index = (vs_offset & cl_mask) / vm_page_size;
2630
2631                         for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2632                                 /*
2633                                  * skip over unallocated pages
2634                                  */
2635                                 if (CLMAP_ISSET(clmap, cl_index))
2636                                         break;
2637                                 abort_size += vm_page_size;
2638                         }
2639                         if (abort_size) {
2640                                 /*
2641                                  * Let VM system know about holes in clusters.
2642                                  */
2643                                 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2644
2645                                 page_list_count = 0;
2646                                 memory_object_super_upl_request(
2647                                         vs->vs_control,
2648                                         (memory_object_offset_t)vs_offset,
2649                                         abort_size, abort_size,
2650                                         &upl, NULL, &page_list_count,
2651                                         request_flags);
2652
2653                                 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2654                                 upl_deallocate(upl);
2655
2656                                 size       -= abort_size;
2657                                 vs_offset  += abort_size;
2658
2659                                 if (cl_index == pages_in_cl) {
2660                                         /*
2661                                          * if we're at the end of this physical cluster
2662                                          * then bump to the next one and continue looking
2663                                          */
2664                                         seg_index++;
2665                                         ps_info_valid = 0;
2666                                         continue;
2667                                 }
2668                                 if (size == 0)
2669                                         break;
2670                         }
2671                         /*
2672                          * remember the starting point of the first allocated page
2673                          * for the I/O we're about to issue
2674                          */
2675                         beg_pseg   = seg_index;
2676                         beg_indx   = cl_index;
2677                         cur_offset = vs_offset;
2678
2679                         /*
2680                          * calculate the size of the I/O that we can do...
2681                          * this may span multiple physical segments if
2682                          * they are contiguous
2683                          */
2684                         for (xfer_size = 0; xfer_size < size; ) {
2685
2686                                 while (cl_index < pages_in_cl
2687                                                 && xfer_size < size) {
2688                                         /*
2689                                          * accumulate allocated pages within
2690                                          * a physical segment
2691                                          */
2692                                         if (CLMAP_ISSET(clmap, cl_index)) {
2693                                                 xfer_size  += vm_page_size;
2694                                                 cur_offset += vm_page_size;
2695                                                 cl_index++;
2696
2697                                                 BS_STAT(psp[seg_index]->ps_bs,
2698                                                         psp[seg_index]->ps_bs->bs_pages_in++);
2699                                         } else
2700                                                 break;
2701                                 }
2702                                 if (cl_index < pages_in_cl
2703                                                 || xfer_size >= size) {
2704                                         /*
2705                                          * we've hit an unallocated page or
2706                                          * the end of this request... go fire
2707                                          * the I/O
2708                                          */
2709                                         break;
2710                                 }
2711                                 /*
2712                                  * we've hit the end of the current physical
2713                                  * segment and there's more to do, so try
2714                                  * moving to the next one
2715                                  */
2716                                 seg_index++;
2717
2718                                 ps_offset[seg_index] =
2719                                         ps_clmap(vs,
2720                                                 cur_offset & ~cl_mask,
2721                                                 &clmap, CL_FIND, 0, 0);
2722                                 psp[seg_index] = CLMAP_PS(clmap);
2723                                 ps_info_valid = 1;
2724
2725                                 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2726                                         /*
2727                                          * if the physical segment we're about
2728                                          * to step into is not contiguous to
2729                                          * the one we're currently in, or it's
2730                                          * in a different paging file, or
2731                                          * it hasn't been allocated....
2732                                          * we stop here and generate the I/O
2733                                          */
2734                                         break;
2735                                 }
2736                                 /*
2737                                  * start with first page of the next physical
2738                                  *  segment
2739                                  */
2740                                 cl_index = 0;
2741                         }
2742                         if (xfer_size) {
2743                                 /*
2744                                  * we have a contiguous range of allocated pages
2745                                  * to read from
2746                                  */
2747                                 page_list_count = 0;
2748                                 memory_object_super_upl_request(vs->vs_control,
2749                                         (memory_object_offset_t)vs_offset,
2750                                         xfer_size, xfer_size,
2751                                         &upl, NULL, &page_list_count,
2752                                         request_flags | UPL_SET_INTERNAL);
2753
2754                                 error = ps_read_file(psp[beg_pseg],
2755                                         upl, (upl_offset_t) 0,
2756                                         ps_offset[beg_pseg] +
2757                                                 (beg_indx * vm_page_size),
2758                                         xfer_size, &residual, 0);
2759                         } else
2760                                 continue;
2761
2762                         failed_size = 0;
2763
2764                         /*
2765                          * Adjust counts and send response to VM.  Optimize
2766                          * for the common case, i.e. no error and/or partial
2767                          * data. If there was an error, then we need to error
2768                          * the entire range, even if some data was successfully
2769                          * read. If there was a partial read we may supply some
2770                          * data and may error some as well.  In all cases the
2771                          * VM must receive some notification for every page
2772                          * in the range.
2773                          */
2774                         if ((error == KERN_SUCCESS) && (residual == 0)) {
2775                                 /*
2776                                  * Got everything we asked for, supply the data
2777                                  * to the VM.  Note that as a side effect of
2778                                  * supplying the data, the buffer holding the
2779                                  * supplied data is deallocated from the pager's
2780                                  *  address space.
2781                                  */
2782                                 pvs_object_data_provided(
2783                                         vs, upl, vs_offset, xfer_size);
2784                         } else {
2785                                 failed_size = xfer_size;
2786
2787                                 if (error == KERN_SUCCESS) {
2788                                         if ((signed) residual == xfer_size) {
2789                                         /*
2790                                          * If a read operation returns no error
2791                                          * and no data moved, we turn it into
2792                                          * an error, assuming we're reading at
2793                                          * or beyong EOF.
2794                                          * Fall through and error the entire
2795                                          * range.
2796                                          */
2797                                                 error = KERN_FAILURE;
2798                                         } else {
2799                                         /*
2800                                          * Otherwise, we have partial read. If
2801                                          * the part read is a integral number
2802                                          * of pages supply it. Otherwise round
2803                                          * it up to a page boundary, zero fill
2804                                          * the unread part, and supply it.
2805                                          * Fall through and error the remainder
2806                                          * of the range, if any.
2807                                          */
2808                                                 int fill, lsize;
2809
2810                                                 fill = residual
2811                                                         & ~vm_page_size;
2812                                                 lsize = (xfer_size - residual)
2813                                                                          + fill;
2814                                                 pvs_object_data_provided(
2815                                                         vs, upl,
2816                                                         vs_offset, lsize);
2817
2818                                                 if (lsize < xfer_size) {
2819                                                         failed_size =
2820                                                             xfer_size - lsize;
2821                                                         error = KERN_FAILURE;
2822                                                 }
2823                                         }
2824                                 }
2825                         }
2826                         /*
2827                          * If there was an error in any part of the range, tell
2828                          * the VM. Note that error is explicitly checked again
2829                          *  since it can be modified above.
2830                          */
2831                         if (error != KERN_SUCCESS) {
2832                                 BS_STAT(psp[beg_pseg]->ps_bs,
2833                                         psp[beg_pseg]->ps_bs->bs_pages_in_fail
2834                                                 += atop_32(failed_size));
2835                         }
2836                         size       -= xfer_size;
2837                         vs_offset  += xfer_size;
2838                 }
2839
2840         } /* END while (cnt && (error == 0)) */
2841         return error;
2842 }
2843
/*
 * NOTE(review): global flag, initialized to 1.  Presumably a tunable that
 * lets vs_cluster_write() issue asynchronous writes; its consumer is not
 * visible in this part of the file -- confirm before relying on this.
 */
int vs_do_async_write = 1;
2845
2846 kern_return_t
2847 vs_cluster_write(
2848         vstruct_t       vs,
2849         upl_t           internal_upl,
2850         upl_offset_t    offset,
2851         upl_size_t      cnt,
2852         boolean_t       dp_internal,
2853         int             flags)
2854 {
2855         upl_size_t      transfer_size;
2856         int             error = 0;
2857         struct clmap    clmap;
2858
2859         vm_offset_t     actual_offset;  /* Offset within paging segment */
2860         paging_segment_t ps;
2861         vm_offset_t     mobj_base_addr;
2862         vm_offset_t     mobj_target_addr;
2863
2864         upl_t           upl;
2865         upl_page_info_t *pl;
2866         int             page_index;
2867         int             list_size;
2868         int             pages_in_cl;
2869         unsigned int    cl_size;
2870         int             base_index;
2871         unsigned int    seg_size;
2872
2873         pages_in_cl = 1 << vs->vs_clshift;
2874         cl_size = pages_in_cl * vm_page_size;
2875
2876         if (!dp_internal) {
2877                 unsigned int page_list_count;
2878                 int          request_flags;
2879                 unsigned int super_size;
2880                 int          first_dirty;
2881                 int          num_dirty;
2882                 int          num_of_pages;
2883                 int          seg_index;
2884                 upl_offset_t  upl_offset;
2885                 vm_offset_t  seg_offset;
2886                 vm_offset_t  ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2887                 paging_segment_t   psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2888
2889
2890                 if (bs_low) {
2891                         super_size = cl_size;
2892
2893                         request_flags = UPL_NOBLOCK |
2894                                 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2895                                 UPL_NO_SYNC | UPL_SET_INTERNAL;
2896                 } else {
2897                         super_size = VM_SUPER_CLUSTER;
2898
2899                         request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2900                                 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2901                                 UPL_NO_SYNC | UPL_SET_INTERNAL;
2902                 }
2903
2904                 if (!dp_encryption_inited) {
2905                         /*
2906                          * ENCRYPTED SWAP:
2907                          * Once we've started using swap, we
2908                          * can't change our mind on whether
2909                          * it needs to be encrypted or
2910                          * not.
2911                          */
2912                         dp_encryption_inited = TRUE;
2913                 }
2914                 if (dp_encryption) {
2915                         /*
2916                          * ENCRYPTED SWAP:
2917                          * request that the UPL be prepared for
2918                          * encryption.
2919                          */
2920                         request_flags |= UPL_ENCRYPT;
2921                         flags |= UPL_PAGING_ENCRYPTED;
2922                 }
2923
2924                 page_list_count = 0;
2925                 memory_object_super_upl_request(vs->vs_control,
2926                                 (memory_object_offset_t)offset,
2927                                 cnt, super_size,
2928                                 &upl, NULL, &page_list_count,
2929                                 request_flags | UPL_FOR_PAGEOUT);
2930
2931                 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2932
2933                 seg_size = cl_size - (upl->offset % cl_size);
2934                 upl_offset = upl->offset & ~(cl_size - 1);
2935
2936                 for (seg_index = 0, transfer_size = upl->size;
2937                                                 transfer_size > 0; ) {
2938                         ps_offset[seg_index] =
2939                                 ps_clmap(vs,
2940                                         upl_offset,
2941                                         &clmap, CL_ALLOC,
2942                                         cl_size, 0);
2943
2944                         if (ps_offset[seg_index] == (vm_offset_t) -1) {
2945                                 upl_abort(upl, 0);
2946                                 upl_deallocate(upl);
2947
2948                                 return KERN_FAILURE;
2949
2950                         }
2951                         psp[seg_index] = CLMAP_PS(clmap);
2952
2953                         if (transfer_size > seg_size) {
2954                                 transfer_size -= seg_size;
2955                                 upl_offset += cl_size;
2956                                 seg_size    = cl_size;
2957                                 seg_index++;
2958                         } else
2959                                 transfer_size = 0;
2960                 }
2961                 /*
2962                  * Ignore any non-present pages at the end of the
2963                  * UPL.
2964                  */
2965                 for (page_index = upl->size / vm_page_size; page_index > 0;)
2966                         if (UPL_PAGE_PRESENT(pl, --page_index))
2967                                 break;
2968                 num_of_pages = page_index + 1;
2969
2970                 base_index = (upl->offset % cl_size) / PAGE_SIZE;
2971
2972                 for (page_index = 0; page_index < num_of_pages; ) {
2973                         /*
2974                          * skip over non-dirty pages
2975                          */
2976                         for ( ; page_index < num_of_pages; page_index++) {
2977                                 if (UPL_DIRTY_PAGE(pl, page_index)
2978                                         || UPL_PRECIOUS_PAGE(pl, page_index))
2979                                         /*
2980                                          * this is a page we need to write
2981                                          * go see if we can buddy it up with
2982                                          * others that are contiguous to it
2983                                          */
2984                                         break;
2985                                 /*
2986                                  * if the page is not-dirty, but present we
2987                                  * need to commit it...  This is an unusual
2988                                  * case since we only asked for dirty pages
2989                                  */
2990                                 if (UPL_PAGE_PRESENT(pl, page_index)) {
2991                                         boolean_t empty = FALSE;
2992                                         upl_commit_range(upl,
2993                                                  page_index * vm_page_size,
2994                                                  vm_page_size,
2995                                                  UPL_COMMIT_NOTIFY_EMPTY,
2996                                                  pl,
2997                                                  page_list_count,
2998                                                  &empty);
2999                                         if (empty) {
3000                                                 assert(page_index ==
3001                                                        num_of_pages - 1);
3002                                                 upl_deallocate(upl);
3003                                         }
3004                                 }
3005                         }
3006                         if (page_index == num_of_pages)
3007                                 /*
3008                                  * no more pages to look at, we're out of here
3009                                  */
3010                                 break;
3011
3012                         /*
3013                          * gather up contiguous dirty pages... we have at
3014                          * least 1 * otherwise we would have bailed above
3015                          * make sure that each physical segment that we step
3016                          * into is contiguous to the one we're currently in
3017                          * if it's not, we have to stop and write what we have
3018                          */
3019                         for (first_dirty = page_index;
3020                                         page_index < num_of_pages; ) {
3021                                 if ( !UPL_DIRTY_PAGE(pl, page_index)
3022                                         && !UPL_PRECIOUS_PAGE(pl, page_index))
3023                                         break;
3024                                 page_index++;
3025                                 /*
3026                                  * if we just looked at the last page in the UPL
3027                                  * we don't need to check for physical segment
3028                                  * continuity
3029                                  */
3030                                 if (page_index < num_of_pages) {
3031                                         int cur_seg;
3032                                         int nxt_seg;
3033
3034                                         cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3035                                         nxt_seg = (base_index + page_index)/pages_in_cl;
3036
3037                                         if (cur_seg != nxt_seg) {
3038                                                 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3039                                                 /*
3040                                                  * if the segment we're about
3041                                                  * to step into is not
3042                                                  * contiguous to the one we're
3043                                                  * currently in, or it's in a
3044                                                  * different paging file....
3045                                                  * we stop here and generate
3046                                                  * the I/O
3047                                                  */
3048                                                         break;
3049                                         }
3050                                 }
3051                         }
3052                         num_dirty = page_index - first_dirty;
3053
3054                         if (num_dirty) {
3055                                 upl_offset = first_dirty * vm_page_size;
3056                                 transfer_size = num_dirty * vm_page_size;
3057
3058                                 while (transfer_size) {
3059
3060                                         if ((seg_size = cl_size -
3061                                                 ((upl->offset + upl_offset) % cl_size))
3062                                                         > transfer_size)
3063                                                 seg_size = transfer_size;
3064
3065                                         ps_vs_write_complete(vs,
3066                                                 upl->offset + upl_offset,
3067                                                 seg_size, error);
3068
3069                                         transfer_size -= seg_size;
3070                                         upl_offset += seg_size;
3071                                 }
3072                                 upl_offset = first_dirty * vm_page_size;
3073                                 transfer_size = num_dirty * vm_page_size;
3074
3075                                 seg_index  = (base_index + first_dirty) / pages_in_cl;
3076                                 seg_offset = (upl->offset + upl_offset) % cl_size;
3077
3078                                 error = ps_write_file(psp[seg_index],
3079                                                 upl, upl_offset,
3080                                                 ps_offset[seg_index]
3081                                                                 + seg_offset,
3082                                                 transfer_size, flags);
3083                         } else {
3084                                 boolean_t empty = FALSE;
3085                                 upl_abort_range(upl,
3086                                                 first_dirty * vm_page_size,
3087                                                 num_dirty   * vm_page_size,
3088                                                 UPL_ABORT_NOTIFY_EMPTY,
3089                                                 &empty);
3090                                 if (empty) {
3091                                         assert(page_index == num_of_pages);
3092                                         upl_deallocate(upl);
3093                                 }
3094                         }
3095                 }
3096
3097         } else {
3098                 assert(cnt  <= (vm_page_size << vs->vs_clshift));
3099                 list_size = cnt;
3100
3101                 page_index = 0;
3102                 /* The caller provides a mapped_data which is derived  */
3103                 /* from a temporary object.  The targeted pages are    */
3104                 /* guaranteed to be set at offset 0 in the mapped_data */
3105                 /* The actual offset however must still be derived     */
3106                 /* from the offset in the vs in question               */
3107                 mobj_base_addr = offset;
3108                 mobj_target_addr = mobj_base_addr;
3109
3110                 for (transfer_size = list_size; transfer_size != 0;) {
3111                         actual_offset = ps_clmap(vs, mobj_target_addr,
3112                                 &clmap, CL_ALLOC,
3113                                 transfer_size < cl_size ?
3114                                         transfer_size : cl_size, 0);
3115                         if(actual_offset == (vm_offset_t) -1) {
3116                                 error = 1;
3117                                 break;
3118                         }
3119                         cnt = MIN(transfer_size,
3120                                 CLMAP_NPGS(clmap) * vm_page_size);
3121                         ps = CLMAP_PS(clmap);
3122                         /* Assume that the caller has given us contiguous */
3123                         /* pages */
3124                         if(cnt) {
3125                                 ps_vs_write_complete(vs, mobj_target_addr,
3126                                                                 cnt, error);
3127                                 error = ps_write_file(ps, internal_upl,
3128                                                 0, actual_offset,
3129                                                 cnt, flags);
3130                                 if (error)
3131                                         break;
3132                            }
3133                         if (error)
3134                                 break;
3135                         actual_offset += cnt;
3136                         mobj_target_addr += cnt;
3137                         transfer_size -= cnt;
3138                         cnt = 0;
3139
3140                         if (error)
3141                                 break;
3142                 }
3143         }
3144         if(error)
3145                 return KERN_FAILURE;
3146         else
3147                 return KERN_SUCCESS;
3148 }
3149
3150 vm_size_t
3151 ps_vstruct_allocated_size(
3152         vstruct_t       vs)
3153 {
3154         int             num_pages;
3155         struct vs_map   *vsmap;
3156         unsigned int    i, j, k;
3157
3158         num_pages = 0;
3159         if (vs->vs_indirect) {
3160                 /* loop on indirect maps */
3161                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3162                         vsmap = vs->vs_imap[i];
3163                         if (vsmap == NULL)
3164                                 continue;
3165                         /* loop on clusters in this indirect map */
3166                         for (j = 0; j < CLMAP_ENTRIES; j++) {
3167                                 if (VSM_ISCLR(vsmap[j]) ||
3168                                     VSM_ISERR(vsmap[j]))
3169                                         continue;
3170                                 /* loop on pages in this cluster */
3171                                 for (k = 0; k < VSCLSIZE(vs); k++) {
3172                                         if ((VSM_BMAP(vsmap[j])) & (1 << k))
3173                                                 num_pages++;
3174                                 }
3175                         }
3176                 }
3177         } else {
3178                 vsmap = vs->vs_dmap;
3179                 if (vsmap == NULL)
3180                         return 0;
3181                 /* loop on clusters in the direct map */
3182                 for (j = 0; j < CLMAP_ENTRIES; j++) {
3183                         if (VSM_ISCLR(vsmap[j]) ||
3184                             VSM_ISERR(vsmap[j]))
3185                                 continue;
3186                         /* loop on pages in this cluster */
3187                         for (k = 0; k < VSCLSIZE(vs); k++) {
3188                                 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3189                                         num_pages++;
3190                         }
3191                 }
3192         }
3193
3194         return ptoa_32(num_pages);
3195 }
3196
3197 size_t
3198 ps_vstruct_allocated_pages(
3199         vstruct_t               vs,
3200         default_pager_page_t    *pages,
3201         size_t                  pages_size)
3202 {
3203         unsigned int    num_pages;
3204         struct vs_map   *vsmap;
3205         vm_offset_t     offset;
3206         unsigned int    i, j, k;
3207
3208         num_pages = 0;
3209         offset = 0;
3210         if (vs->vs_indirect) {
3211                 /* loop on indirect maps */
3212                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3213                         vsmap = vs->vs_imap[i];
3214                         if (vsmap == NULL) {
3215                                 offset += (vm_page_size * CLMAP_ENTRIES *
3216                                            VSCLSIZE(vs));
3217                                 continue;
3218                         }
3219                         /* loop on clusters in this indirect map */
3220                         for (j = 0; j < CLMAP_ENTRIES; j++) {
3221                                 if (VSM_ISCLR(vsmap[j]) ||
3222                                     VSM_ISERR(vsmap[j])) {
3223                                         offset += vm_page_size * VSCLSIZE(vs);
3224                                         continue;
3225                                 }
3226                                 /* loop on pages in this cluster */
3227                                 for (k = 0; k < VSCLSIZE(vs); k++) {
3228                                         if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3229                                                 num_pages++;
3230                                                 if (num_pages < pages_size)
3231                                                         pages++->dpp_offset =
3232                                                                 offset;
3233                                         }
3234                                         offset += vm_page_size;
3235                                 }
3236                         }
3237                 }
3238         } else {
3239                 vsmap = vs->vs_dmap;
3240                 if (vsmap == NULL)
3241                         return 0;
3242                 /* loop on clusters in the direct map */
3243                 for (j = 0; j < CLMAP_ENTRIES; j++) {
3244                         if (VSM_ISCLR(vsmap[j]) ||
3245                             VSM_ISERR(vsmap[j])) {
3246                                 offset += vm_page_size * VSCLSIZE(vs);
3247                                 continue;
3248                         }
3249                         /* loop on pages in this cluster */
3250                         for (k = 0; k < VSCLSIZE(vs); k++) {
3251                                 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3252                                         num_pages++;
3253                                         if (num_pages < pages_size)
3254                                                 pages++->dpp_offset = offset;
3255                                 }
3256                                 offset += vm_page_size;
3257                         }
3258                 }
3259         }
3260
3261         return num_pages;
3262 }
3263
3264
3265 kern_return_t
3266 ps_vstruct_transfer_from_segment(
3267         vstruct_t        vs,
3268         paging_segment_t segment,
3269         upl_t            upl)
3270 {
3271         struct vs_map   *vsmap;
3272 //      struct vs_map   old_vsmap;
3273 //      struct vs_map   new_vsmap;
3274         unsigned int    i, j;
3275
3276         VS_LOCK(vs);    /* block all work on this vstruct */
3277                         /* can't allow the normal multiple write */
3278                         /* semantic because writes may conflict */
3279         vs->vs_xfer_pending = TRUE;
3280         vs_wait_for_sync_writers(vs);
3281         vs_start_write(vs);
3282         vs_wait_for_readers(vs);
3283         /* we will unlock the vs to allow other writes while transferring */
3284         /* and will be guaranteed of the persistance of the vs struct     */
3285         /* because the caller of  ps_vstruct_transfer_from_segment bumped */
3286         /* vs_async_pending */
3287         /* OK we now have guaranteed no other parties are accessing this */
3288         /* vs.  Now that we are also supporting simple lock versions of  */
3289         /* vs_lock we cannot hold onto VS_LOCK as we may block below.    */
3290         /* our purpose in holding it before was the multiple write case */
3291         /* we now use the boolean xfer_pending to do that.  We can use  */
3292         /* a boolean instead of a count because we have guaranteed single */
3293         /* file access to this code in its caller */
3294         VS_UNLOCK(vs);
3295 vs_changed:
3296         if (vs->vs_indirect) {
3297                 unsigned int    vsmap_size;
3298                 int             clmap_off;
3299                 /* loop on indirect maps */
3300                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3301                         vsmap = vs->vs_imap[i];
3302                         if (vsmap == NULL)
3303                                 continue;
3304                         /* loop on clusters in this indirect map */
3305                         clmap_off = (vm_page_size * CLMAP_ENTRIES *
3306                                            VSCLSIZE(vs) * i);
3307                         if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3308                                 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3309                         else
3310                                 vsmap_size = CLMAP_ENTRIES;
3311                         for (j = 0; j < vsmap_size; j++) {
3312                                 if (VSM_ISCLR(vsmap[j]) ||
3313                                     VSM_ISERR(vsmap[j]) ||
3314                                     (VSM_PS(vsmap[j]) != segment))
3315                                         continue;
3316                                 if(vs_cluster_transfer(vs,
3317                                         (vm_page_size * (j << vs->vs_clshift))
3318                                         + clmap_off,
3319                                         vm_page_size << vs->vs_clshift,
3320                                         upl)
3321                                                 != KERN_SUCCESS) {
3322                                    VS_LOCK(vs);
3323                                    vs->vs_xfer_pending = FALSE;
3324                                    VS_UNLOCK(vs);
3325                                    vs_finish_write(vs);
3326                                    return KERN_FAILURE;
3327                                 }
3328                                 /* allow other readers/writers during transfer*/
3329                                 VS_LOCK(vs);
3330                                 vs->vs_xfer_pending = FALSE;
3331                                 VS_UNLOCK(vs);
3332                                 vs_finish_write(vs);
3333                                 VS_LOCK(vs);
3334                                 vs->vs_xfer_pending = TRUE;
3335                                 vs_wait_for_sync_writers(vs);
3336                                 vs_start_write(vs);
3337                                 vs_wait_for_readers(vs);
3338                                 VS_UNLOCK(vs);
3339                                 if (!(vs->vs_indirect)) {
3340                                         goto vs_changed;
3341                                 }
3342                         }
3343                 }
3344         } else {
3345                 vsmap = vs->vs_dmap;
3346                 if (vsmap == NULL) {
3347                         VS_LOCK(vs);
3348                         vs->vs_xfer_pending = FALSE;
3349                         VS_UNLOCK(vs);
3350                         vs_finish_write(vs);
3351                         return KERN_SUCCESS;
3352                 }
3353                 /* loop on clusters in the direct map */
3354                 for (j = 0; j < vs->vs_size; j++) {
3355                         if (VSM_ISCLR(vsmap[j]) ||
3356                             VSM_ISERR(vsmap[j]) ||
3357                             (VSM_PS(vsmap[j]) != segment))
3358                                 continue;
3359                         if(vs_cluster_transfer(vs,
3360                                 vm_page_size * (j << vs->vs_clshift),
3361                                 vm_page_size << vs->vs_clshift,
3362                                 upl) != KERN_SUCCESS) {
3363                            VS_LOCK(vs);
3364                            vs->vs_xfer_pending = FALSE;
3365                            VS_UNLOCK(vs);
3366                            vs_finish_write(vs);
3367                            return KERN_FAILURE;
3368                         }
3369                         /* allow other readers/writers during transfer*/
3370                         VS_LOCK(vs);
3371                         vs->vs_xfer_pending = FALSE;
3372                         VS_UNLOCK(vs);
3373                         vs_finish_write(vs);
3374                         VS_LOCK(vs);
3375                         vs->vs_xfer_pending = TRUE;
3376                         VS_UNLOCK(vs);
3377                         vs_wait_for_sync_writers(vs);
3378                         vs_start_write(vs);
3379                         vs_wait_for_readers(vs);
3380                         if (vs->vs_indirect) {
3381                                 goto vs_changed;
3382                         }
3383                 }
3384         }
3385
3386         VS_LOCK(vs);
3387         vs->vs_xfer_pending = FALSE;
3388         VS_UNLOCK(vs);
3389         vs_finish_write(vs);
3390         return KERN_SUCCESS;
3391 }
3392
3393
3394
3395 vs_map_t
3396 vs_get_map_entry(
3397         vstruct_t       vs,
3398         vm_offset_t     offset)
3399 {
3400         struct vs_map   *vsmap;
3401         vm_offset_t     cluster;
3402
3403         cluster = atop_32(offset) >> vs->vs_clshift;
3404         if (vs->vs_indirect) {
3405                 long    ind_block = cluster/CLMAP_ENTRIES;
3406
3407                 /* Is the indirect block allocated? */
3408                 vsmap = vs->vs_imap[ind_block];
3409                 if(vsmap == (vs_map_t) NULL)
3410                         return vsmap;
3411         } else
3412                 vsmap = vs->vs_dmap;
3413         vsmap += cluster%CLMAP_ENTRIES;
3414         return vsmap;
3415 }
3416
/*
 * vs_cluster_transfer:
 *
 * Walk the byte range [offset, offset + cnt) of "vs", read each run of
 * present pages from its current paging segment with ps_read_file() and
 * write it back with vs_cluster_write().
 *
 *   vs      vstruct whose cluster map entries are being rewritten.
 *   offset  byte offset in vs at which to start.
 *   cnt     number of bytes to examine.
 *   upl     passed to both ps_read_file() and vs_cluster_write();
 *           presumably the staging buffer for the data -- confirm
 *           against the caller.
 *
 * Returns KERN_SUCCESS when the whole range was processed.  A failed
 * write, or a read that returns success with a non-zero residual, gives
 * KERN_FAILURE; an error returned by ps_read_file() itself is returned
 * unchanged.
 *
 * Three vs_map copies track the map entry while it is being swapped:
 *   write_vsmap          entry built up so far by the writes
 *   read_vsmap           entry as it stood right after the latest read
 *   original_read_vsmap  entry as it stood before the first read of the
 *                        current cluster; put back on failure
 */
3417 kern_return_t
3418 vs_cluster_transfer(
3419         vstruct_t       vs,
3420         vm_offset_t     offset,
3421         vm_size_t       cnt,
3422         upl_t           upl)
3423 {
3424         vm_offset_t             actual_offset;
3425         paging_segment_t        ps;
3426         struct clmap            clmap;
3427         kern_return_t           error = KERN_SUCCESS;
3428         unsigned int            size, size_wanted;
3429         int                     i;
3430         unsigned int            residual = 0;
3431         unsigned int            unavail_size;
3432 //      default_pager_thread_t  *dpt;
3433 //      boolean_t               dealloc;
3434         struct  vs_map          *vsmap_ptr = NULL;
3435         struct  vs_map          read_vsmap;
3436         struct  vs_map          original_read_vsmap;
3437         struct  vs_map          write_vsmap;
3438 //      upl_t                           sync_upl;
3439 //      vm_offset_t                     ioaddr;
3440
3441         /* vs_cluster_transfer reads in the pages of a cluster and
3442          * then writes these pages back to new backing store.  The
3443          * segment the pages are being read from is assumed to have
3444          * been taken off-line and is no longer considered for new
3445          * space requests.
3446          */
3447
3448         /*
3449          * This loop will be executed once per cluster referenced.
3450          * Typically this means once, since it's unlikely that the
3451          * VM system will ask for anything spanning cluster boundaries.
3452          *
3453          * If there are holes in a cluster (in a paging segment), we stop
3454          * reading at the hole, then loop again, hoping to
3455          * find valid pages later in the cluster.  This continues until
3456          * the entire range has been examined, and read, if present.  The
3457          * pages are written as they are read.  If a failure occurs after
3458          * some pages are written the unmap call at the bottom of the loop
3459          * recovers the backing store and the old backing store remains
3460          * in effect.
3461          */
3462
3463         VSM_CLR(write_vsmap);
3464         VSM_CLR(original_read_vsmap);
3465         /* grab the actual object's pages to sync with I/O */
3466         while (cnt && (error == KERN_SUCCESS)) {
3467                 vsmap_ptr = vs_get_map_entry(vs, offset);
3468                 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3469
                     /*
                      * (vm_offset_t) -1 from the CL_FIND lookup is handled as
                      * "nothing to read from here to the end of this cluster".
                      */
3470                 if (actual_offset == (vm_offset_t) -1) {
3471
3472                         /*
3473                          * Nothing left to write in this cluster at least
3474                          * set write cluster information for any previous
3475                          * write, clear for next cluster, if there is one
3476                          */
3477                         unsigned int local_size, clmask, clsize;
3478
3479                         clsize = vm_page_size << vs->vs_clshift;
3480                         clmask = clsize - 1;
                             /* bytes from offset to the end of its cluster */
3481                         local_size = clsize - (offset & clmask);
3482                         ASSERT(local_size);
3483                         local_size = MIN(local_size, cnt);
3484
3485                         /* This cluster has no data in it beyond what may */
3486                         /* have been found on a previous iteration through */
3487                         /* the loop "write_vsmap" */
3488                         *vsmap_ptr = write_vsmap;
3489                         VSM_CLR(write_vsmap);
3490                         VSM_CLR(original_read_vsmap);
3491
3492                         cnt -= local_size;
3493                         offset += local_size;
3494                         continue;
3495                 }
3496
3497                 /*
3498                  * Count up contiguous available or unavailable
3499                  * pages.
3500                  */
3501                 ps = CLMAP_PS(clmap);
3502                 ASSERT(ps);
3503                 size = 0;
3504                 unavail_size = 0;
                     /*
                      * The scan stops at the first page of the other kind, so
                      * on exit at most one of size / unavail_size is non-zero.
                      */
3505                 for (i = 0;
3506                      (size < cnt) && (unavail_size < cnt) &&
3507                      (i < CLMAP_NPGS(clmap)); i++) {
3508                         if (CLMAP_ISSET(clmap, i)) {
3509                                 if (unavail_size != 0)
3510                                         break;
3511                                 size += vm_page_size;
3512                                 BS_STAT(ps->ps_bs,
3513                                         ps->ps_bs->bs_pages_in++);
3514                         } else {
3515                                 if (size != 0)
3516                                         break;
3517                                 unavail_size += vm_page_size;
3518                         }
3519                 }
3520
                     /* a run of absent pages: step over it without any I/O */
3521                 if (size == 0) {
3522                         ASSERT(unavail_size);
3523                         cnt -= unavail_size;
3524                         offset += unavail_size;
3525                         if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3526                                 == 0) {
3527                                 /* There is no more to transfer in this
3528                                    cluster
3529                                 */
3530                                 *vsmap_ptr = write_vsmap;
3531                                 VSM_CLR(write_vsmap);
3532                                 VSM_CLR(original_read_vsmap);
3533                         }
3534                         continue;
3535                 }
3536
                     /* first read in this cluster: remember the entry to restore on failure */
3537                 if(VSM_ISCLR(original_read_vsmap))
3538                         original_read_vsmap = *vsmap_ptr;
3539
3540                 if(ps->ps_segtype == PS_PARTITION) {
3541                         panic("swap partition not supported\n");
3542                         /*NOTREACHED*/
3543                         error = KERN_FAILURE;
3544                         residual = size;
3545 /*
3546                         NEED TO ISSUE WITH SYNC & NO COMMIT
3547                         error = ps_read_device(ps, actual_offset, &buffer,
3548                                        size, &residual, flags);
3549 */
3550                 } else {
3551                         /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3552                         error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3553                                         size, &residual,
3554                                         (UPL_IOSYNC | UPL_NOCOMMIT));
3555                 }
3556
                     /* snapshot of the entry after the read; reinstated once the write is done */
3557                 read_vsmap = *vsmap_ptr;
3558
3559
3560                 /*
3561                  * Adjust counts and put data in new BS.  Optimize for the
3562                  * common case, i.e. no error and/or partial data.
3563                  * If there was an error, then we need to error the entire
3564                  * range, even if some data was successfully read.
3565                  *
3566                  */
3567                 if ((error == KERN_SUCCESS) && (residual == 0)) {
3568
3569                         /*
3570                          * Got everything we asked for, supply the data to
3571                          * the new BS.  Note that as a side effect of supplying
3572                          * the data, the buffer holding the supplied data is
3573                          * deallocated from the pager's address space unless
3574                          * the write is unsuccessful.
3575                          */
3576
3577                         /* note buffer will be cleaned up in all cases by */
3578                         /* internal_cluster_write or if an error on write */
3579                         /* the vm_map_copy_page_discard call              */
                             /*
                              * Install the entry built so far (possibly clear) before
                              * writing; presumably vs_cluster_write allocates against
                              * whatever entry is in place -- confirm.
                              */
3580                         *vsmap_ptr = write_vsmap;
3581
3582                         if(vs_cluster_write(vs, upl, offset,
3583                                         size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3584                                 error = KERN_FAILURE;
3585                                 if(!(VSM_ISCLR(*vsmap_ptr))) {
3586                                         /* unmap the new backing store object */
3587                                         ps_clunmap(vs, offset, size);
3588                                 }
3589                                 /* original vsmap */
3590                                 *vsmap_ptr = original_read_vsmap;
3591                                 VSM_CLR(write_vsmap);
3592                         } else {
3593                                if((offset + size) &
3594                                         ((vm_page_size << vs->vs_clshift)
3595                                         - 1)) {
3596                                         /* There is more to transfer in this
3597                                            cluster
3598                                         */
                                             /* keep the new entry aside, leave the old one in place for the next read */
3599                                         write_vsmap = *vsmap_ptr;
3600                                         *vsmap_ptr = read_vsmap;
3601                                 } else {
3602                                         /* discard the old backing object */
                                             /*
                                              * ps_clunmap is called with the old entry
                                              * reinstated, then the new entry is installed
                                              * for good.
                                              */
3603                                         write_vsmap = *vsmap_ptr;
3604                                         *vsmap_ptr = read_vsmap;
3605                                         ps_clunmap(vs, offset, size);
3606                                         *vsmap_ptr = write_vsmap;
3607                                         VSM_CLR(write_vsmap);
3608                                         VSM_CLR(original_read_vsmap);
3609                                 }
3610                         }
3611                 } else {
                             /* NOTE(review): size_wanted is assigned here but never read in this function. */
3612                         size_wanted = size;
                             /*
                              * NOTE(review): when ps_read_file itself returned an
                              * error, the block below is skipped; control falls
                              * through to the cnt/offset update and the loop exits on
                              * the error, without the unmap/restore that the two
                              * branches below perform.
                              */
3613                         if (error == KERN_SUCCESS) {
3614                                 if (residual == size) {
3615                                         /*
3616                                          * If a read operation returns no error
3617                                          * and no data moved, we turn it into
3618                                          * an error, assuming we're reading at
3619                                          * or beyond EOF.
3620                                          * Fall through and error the entire
3621                                          * range.
3622                                          */
3623                                         error = KERN_FAILURE;
3624                                         *vsmap_ptr = write_vsmap;
3625                                         if(!(VSM_ISCLR(*vsmap_ptr))) {
3626                                         /* unmap the new backing store object */
3627                                         ps_clunmap(vs, offset, size);
3628                                         }
3629                                         *vsmap_ptr = original_read_vsmap;
3630                                         VSM_CLR(write_vsmap);
                                             /* error is set, so the loop condition ends the loop */
3631                                         continue;
3632                                 } else {
3633                                         /*
3634                                          * Otherwise, we have partial read.
3635                                          * This is also considered an error
3636                                          * for the purposes of cluster transfer
3637                                          */
                                             /* same statements as the no-data branch above */
3638                                         error = KERN_FAILURE;
3639                                         *vsmap_ptr = write_vsmap;
3640                                         if(!(VSM_ISCLR(*vsmap_ptr))) {
3641                                         /* unmap the new backing store object */
3642                                         ps_clunmap(vs, offset, size);
3643                                         }
3644                                         *vsmap_ptr = original_read_vsmap;
3645                                         VSM_CLR(write_vsmap);
3646                                         continue;
3647                                 }
3648                         }
3649
3650                 }
3651                 cnt -= size;
3652                 offset += size;
3653
3654         } /* END while (cnt && (error == 0)) */
             /* range ended mid-cluster with a new entry still held aside: install it */
3655         if(!VSM_ISCLR(write_vsmap))
3656                 *vsmap_ptr = write_vsmap;
3657
3658         return error;
3659 }
3660
3661 kern_return_t
3662 default_pager_add_file(
3663         MACH_PORT_FACE  backing_store,
3664         vnode_ptr_t     vp,
3665         int             record_size,
3666         vm_size_t       size)
3667 {
3668         backing_store_t         bs;
3669         paging_segment_t        ps;
3670         int                     i;
3671         unsigned int            j;
3672         int                     error;
3673
3674         if ((bs = backing_store_lookup(backing_store))
3675             == BACKING_STORE_NULL)
3676                 return KERN_INVALID_ARGUMENT;
3677
3678         PSL_LOCK();
3679         for (i = 0; i <= paging_segment_max; i++) {
3680                 ps = paging_segments[i];
3681                 if (ps == PAGING_SEGMENT_NULL)
3682                         continue;
3683                 if (ps->ps_segtype != PS_FILE)
3684                         continue;
3685
3686                 /*
3687                  * Check for overlap on same device.
3688                  */
3689                 if (ps->ps_vnode == (struct vnode *)vp) {
3690                         PSL_UNLOCK();
3691                         BS_UNLOCK(bs);
3692                         return KERN_INVALID_ARGUMENT;
3693                 }
3694         }
3695         PSL_UNLOCK();
3696
3697         /*
3698          * Set up the paging segment
3699          */
3700         ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3701         if (ps == PAGING_SEGMENT_NULL) {
3702                 BS_UNLOCK(bs);
3703                 return KERN_RESOURCE_SHORTAGE;
3704         }
3705
3706         ps->ps_segtype = PS_FILE;
3707         ps->ps_vnode = (struct vnode *)vp;
3708         ps->ps_offset = 0;
3709         ps->ps_record_shift = local_log2(vm_page_size / record_size);
3710         ps->ps_recnum = size;
3711         ps->ps_pgnum = size >> ps->ps_record_shift;
3712
3713         ps->ps_pgcount = ps->ps_pgnum;
3714         ps->ps_clshift = local_log2(bs->bs_clsize);
3715         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3716         ps->ps_hint = 0;
3717
3718         PS_LOCK_INIT(ps);
3719         ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3720         if (!ps->ps_bmap) {
3721                 kfree(ps, sizeof *ps);
3722                 BS_UNLOCK(bs);
3723                 return KERN_RESOURCE_SHORTAGE;
3724         }
3725         for (j = 0; j < ps->ps_ncls; j++) {
3726                 clrbit(ps->ps_bmap, j);
3727         }
3728
3729         ps->ps_going_away = FALSE;
3730         ps->ps_bs = bs;
3731
3732         if ((error = ps_enter(ps)) != 0) {
3733                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3734                 kfree(ps, sizeof *ps);
3735                 BS_UNLOCK(bs);
3736                 return KERN_RESOURCE_SHORTAGE;
3737         }
3738
3739         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3740         bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3741         PSL_LOCK();
3742         dp_pages_free += ps->ps_pgcount;
3743         PSL_UNLOCK();
3744
3745         BS_UNLOCK(bs);
3746
3747         bs_more_space(ps->ps_clcount);
3748
3749         DP_DEBUG(DEBUG_BS_INTERNAL,
3750                  ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3751                   device, offset, size, record_size,
3752                   ps->ps_record_shift, ps->ps_pgnum));
3753
3754         return KERN_SUCCESS;
3755 }
3756
3757
3758
3759 kern_return_t
3760 ps_read_file(
3761         paging_segment_t        ps,
3762         upl_t                   upl,
3763         upl_offset_t            upl_offset,
3764         vm_offset_t             offset,
3765         upl_size_t              size,
3766         unsigned int            *residualp,
3767         int                     flags)
3768 {
3769         vm_object_offset_t      f_offset;
3770         int                     error = 0;
3771         int                     result;
3772
3773         assert(dp_encryption_inited);
3774
3775         clustered_reads[atop_32(size)]++;
3776
3777         f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3778
3779         /* for transfer case we need to pass uploffset and flags */
3780         error = vnode_pagein(ps->ps_vnode,
3781                                    upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3782
3783         /* The vnode_pagein semantic is somewhat at odds with the existing   */
3784         /* device_read semantic.  Partial reads are not experienced at this  */
3785         /* level.  It is up to the bit map code and cluster read code to     */
3786         /* check that requested data locations are actually backed, and the  */
3787         /* pagein code to either read all of the requested data or return an */
3788         /* error. */
3789
3790         if (error)
3791                 result = KERN_FAILURE;
3792         else {
3793                 *residualp = 0;
3794                 result = KERN_SUCCESS;
3795         }
3796         return result;
3797 }
3798
3799 kern_return_t
3800 ps_write_file(
3801         paging_segment_t        ps,
3802         upl_t                   upl,
3803         upl_offset_t            upl_offset,
3804         vm_offset_t             offset,
3805         unsigned int            size,
3806         int                     flags)
3807 {
3808         vm_object_offset_t      f_offset;
3809         kern_return_t           result;
3810
3811         assert(dp_encryption_inited);
3812
3813         clustered_writes[atop_32(size)]++;
3814         f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3815
3816         if (flags & UPL_PAGING_ENCRYPTED) {
3817                 /*
3818                  * ENCRYPTED SWAP:
3819                  * encrypt all the pages that we're going
3820                  * to pageout.
3821                  */
3822                 upl_encrypt(upl, upl_offset, size);
3823         }
3824
3825         if (vnode_pageout(ps->ps_vnode,
3826                                 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3827                 result = KERN_FAILURE;
3828         else
3829                 result = KERN_SUCCESS;
3830
3831         return result;
3832 }
3833
3834 kern_return_t
3835 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
3836         int             hi_wat,
3837         int             lo_wat,
3838         int             flags,
3839         MACH_PORT_FACE  trigger_port)
3840 {
3841         MACH_PORT_FACE release;
3842         kern_return_t kr;
3843
3844         PSL_LOCK();
3845         if (flags == SWAP_ENCRYPT_ON) {
3846                 /* ENCRYPTED SWAP: turn encryption on */
3847                 release = trigger_port;
3848                 if (!dp_encryption_inited) {
3849                         dp_encryption_inited = TRUE;
3850                         dp_encryption = TRUE;
3851                         kr = KERN_SUCCESS;
3852                 } else {
3853                         kr = KERN_FAILURE;
3854                 }
3855         } else if (flags == SWAP_ENCRYPT_OFF) {
3856                 /* ENCRYPTED SWAP: turn encryption off */
3857                 release = trigger_port;
3858                 if (!dp_encryption_inited) {
3859                         dp_encryption_inited = TRUE;
3860                         dp_encryption = FALSE;
3861                         kr = KERN_SUCCESS;
3862                 } else {
3863                         kr = KERN_FAILURE;
3864                 }
3865         } else if (flags == HI_WAT_ALERT) {
3866                 release = min_pages_trigger_port;
3867                 min_pages_trigger_port = trigger_port;
3868                 minimum_pages_remaining = hi_wat/vm_page_size;
3869                 bs_low = FALSE;
3870                 kr = KERN_SUCCESS;
3871         } else if (flags ==  LO_WAT_ALERT) {
3872                 release = max_pages_trigger_port;
3873                 max_pages_trigger_port = trigger_port;
3874                 maximum_pages_free = lo_wat/vm_page_size;
3875                 kr = KERN_SUCCESS;
3876         } else {
3877                 release = trigger_port;
3878                 kr =  KERN_INVALID_ARGUMENT;
3879         }
3880         PSL_UNLOCK();
3881
3882         if (IP_VALID(release))
3883                 ipc_port_release_send(release);
3884
3885         return kr;
3886 }
3887
3888 /*
3889  * Monitor the amount of available backing store vs. the amount of
3890  * required backing store, notify a listener (if present) when
3891  * backing store may safely be removed.
3892  *
3893  * We attempt to avoid the situation where backing store is
3894  * discarded en masse, as this can lead to thrashing as the
3895  * backing store is compacted.
3896  */
3897
3898 #define PF_INTERVAL     3       /* time between free level checks */
3899 #define PF_LATENCY      10      /* number of intervals before release */
3900
3901 static int dp_pages_free_low_count = 0;
3902 thread_call_t default_pager_backing_store_monitor_callout;
3903
3904 void
3905 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
3906                                                                         __unused thread_call_param_t p2)
3907 {
3908 //      unsigned long long      average;
3909         ipc_port_t              trigger;
3910         uint64_t                deadline;
3911
3912         /*
3913          * We determine whether it will be safe to release some
3914          * backing store by watching the free page level.  If
3915          * it remains below the maximum_pages_free threshold for
3916          * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3917          * then we deem it safe.
3918          *
3919          * Note that this establishes a maximum rate at which backing
3920          * store will be released, as each notification (currently)
3921          * only results in a single backing store object being
3922          * released.
3923          */
3924         if (dp_pages_free > maximum_pages_free) {
3925                 dp_pages_free_low_count++;
3926         } else {
3927                 dp_pages_free_low_count = 0;
3928         }
3929
3930         /* decide whether to send notification */
3931         trigger = IP_NULL;
3932         if (max_pages_trigger_port &&
3933             (backing_store_release_trigger_disable == 0) &&
3934             (dp_pages_free_low_count > PF_LATENCY)) {
3935                 trigger = max_pages_trigger_port;
3936                 max_pages_trigger_port = NULL;
3937         }
3938
3939         /* send notification */
3940         if (trigger != IP_NULL) {
3941                 VSL_LOCK();
3942                 if(backing_store_release_trigger_disable != 0) {
3943                         assert_wait((event_t)
3944                                     &backing_store_release_trigger_disable,
3945                                     THREAD_UNINT);
3946                         VSL_UNLOCK();
3947                         thread_block(THREAD_CONTINUE_NULL);
3948                 } else {
3949                         VSL_UNLOCK();
3950                 }
3951                 default_pager_space_alert(trigger, LO_WAT_ALERT);
3952                 ipc_port_release_send(trigger);
3953                 dp_pages_free_low_count = 0;
3954         }
3955
3956         clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
3957         thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
3958 }