1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 * @OSF_COPYRIGHT@
25 */
26 /*
27 * Mach Operating System
28 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
29 * All Rights Reserved.
30 *
31 * Permission to use, copy, modify and distribute this software and its
32 * documentation is hereby granted, provided that both the copyright
33 * notice and this permission notice appear in all copies of the
34 * software, derivative works or modified versions, and any portions
35 * thereof, and that both notices appear in supporting documentation.
36 *
37 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
38 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
39 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
40 *
41 * Carnegie Mellon requests users of this software to return to
42 *
43 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
44 * School of Computer Science
45 * Carnegie Mellon University
46 * Pittsburgh PA 15213-3890
47 *
48 * any improvements or extensions that they make and grant Carnegie Mellon
49 * the rights to redistribute these changes.
50 */
51
52 /*
53 * Default Pager.
54 * Paging File Management.
55 */
56
57 #include <mach/host_priv.h>
58 #include <mach/memory_object_control.h>
59 #include <mach/memory_object_server.h>
60 #include <mach/upl.h>
61 #include <default_pager/default_pager_internal.h>
62 #include <default_pager/default_pager_alerts.h>
63 #include <default_pager/default_pager_object_server.h>
64
65 #include <ipc/ipc_types.h>
66 #include <ipc/ipc_port.h>
67 #include <ipc/ipc_space.h>
68
69 #include <kern/kern_types.h>
70 #include <kern/host.h>
71 #include <kern/queue.h>
72 #include <kern/counters.h>
73 #include <kern/sched_prim.h>
74
75 #include <vm/vm_kern.h>
76 #include <vm/vm_pageout.h>
77 #include <vm/vm_map.h>
78 #include <vm/vm_object.h>
79 #include <vm/vm_protos.h>
80
81 /* LP64todo - need large internal object support */
82
83 /*
84 * ALLOC_STRIDE... the maximum number of bytes allocated from
85 * a swap file before moving on to the next swap file... if
86 * all swap files reside on a single disk, this value should
87 * be very large (this is the default assumption)... if the
88  * swap files are spread across multiple disks, then this value
89 * should be small (128 * 1024)...
90 *
91 * This should be determined dynamically in the future
92 */
93
94 #define ALLOC_STRIDE (1024 * 1024 * 1024)
95 int physical_transfer_cluster_count = 0;
96
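/*
 * Illustrative sketch (not part of the original file): how many clusters
 * fit in one ALLOC_STRIDE for a given cluster shift.  This is the bound
 * ps_select_segment() applies to physical_transfer_cluster_count before
 * rotating to the next paging segment.
 */
#if 0
static unsigned int
example_clusters_per_stride(int clshift)
{
	/* e.g. 4KB pages (vm_page_shift 12) and clshift 2: 1GB >> 14 = 65536 clusters */
	return ALLOC_STRIDE >> (clshift + vm_page_shift);
}
#endif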
97 #define VM_SUPER_CLUSTER 0x40000
98 #define VM_SUPER_PAGES 64
99
100 /*
101  * The cluster shift is in pages: 0 means 1 page/cluster, 1 means
102  * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
103 */
104 #define VSTRUCT_DEF_CLSHIFT 2
105 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
106 int default_pager_clsize = 0;
107
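/*
 * Illustrative sketch (not part of the original file): with the default
 * cluster shift of 2, each cluster covers 1 << 2 = 4 pages, i.e.
 * 4 * vm_page_size bytes (16KB with 4KB pages).
 */
#if 0
static unsigned int
example_cluster_geometry(void)
{
	unsigned int pages_per_cluster = 1 << vstruct_def_clshift;	/* 4 */

	return pages_per_cluster * vm_page_size;	/* bytes per cluster */
}
#endif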
108 /* statistics */
109 unsigned int clustered_writes[VM_SUPER_PAGES+1];
110 unsigned int clustered_reads[VM_SUPER_PAGES+1];
111
112 /*
113 * Globals used for asynchronous paging operations:
114 * vs_async_list: head of list of to-be-completed I/O ops
115 * async_num_queued: number of pages completed, but not yet
116 * processed by async thread.
117 * async_requests_out: number of pages of requests not completed.
118 */
119
120 #if 0
121 struct vs_async *vs_async_list;
122 int async_num_queued;
123 int async_requests_out;
124 #endif
125
126
127 #define VS_ASYNC_REUSE 1
128 struct vs_async *vs_async_free_list;
129
130 mutex_t default_pager_async_lock; /* Protects globals above */
131
132
133 int vs_alloc_async_failed = 0; /* statistics */
134 int vs_alloc_async_count = 0; /* statistics */
135 struct vs_async *vs_alloc_async(void); /* forward */
136 void vs_free_async(struct vs_async *vsa); /* forward */
137
138
139 #define VS_ALLOC_ASYNC() vs_alloc_async()
140 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
141
142 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
143 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
144 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, 0)
145 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
146 /*
147 * Paging Space Hysteresis triggers and the target notification port
148 *
149 */
150
151 unsigned int minimum_pages_remaining = 0;
152 unsigned int maximum_pages_free = 0;
153 ipc_port_t min_pages_trigger_port = NULL;
154 ipc_port_t max_pages_trigger_port = NULL;
155
156 boolean_t bs_low = FALSE;
157 int backing_store_release_trigger_disable = 0;
158
159
160 /* Have we decided if swap needs to be encrypted yet ? */
161 boolean_t dp_encryption_inited = FALSE;
162 /* Should we encrypt swap ? */
163 boolean_t dp_encryption = FALSE;
164
165
166 /*
167 * Object sizes are rounded up to the next power of 2,
168 * unless they are bigger than a given maximum size.
169 */
170 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
171
172 /*
173 * List of all backing store and segments.
174 */
175 struct backing_store_list_head backing_store_list;
176 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
177 mutex_t paging_segments_lock;
178 int paging_segment_max = 0;
179 int paging_segment_count = 0;
180 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
181
182
183 /*
184 * Total pages free in system
185  * This differs from clusters committed/available, which is a measure
186  * of the over-commitment of paging segments to backing store, an idea
187  * that is likely to be deprecated.
188 */
189 unsigned int dp_pages_free = 0;
190 unsigned int cluster_transfer_minimum = 100;
191
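/*
 * Illustrative sketch (not part of the original file): the high-water
 * check the allocation paths below perform against the hysteresis
 * trigger globals above (see ps_select_segment and ps_allocate_cluster).
 * The caller sends a HI_WAT_ALERT on the returned port and releases it.
 */
#if 0
static ipc_port_t
example_check_low_space_trigger(void)
{
	ipc_port_t trigger = IP_NULL;

	if (min_pages_trigger_port &&
	    (dp_pages_free < minimum_pages_remaining)) {
		trigger = min_pages_trigger_port;
		min_pages_trigger_port = NULL;
		bs_low = TRUE;
	}
	return trigger;
}
#endif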
192 /* forward declarations */
193 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int); /* forward */
194 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
195 default_pager_thread_t *get_read_buffer( void );
196 kern_return_t ps_vstruct_transfer_from_segment(
197 vstruct_t vs,
198 paging_segment_t segment,
199 upl_t upl);
200 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
201 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
202 kern_return_t vs_cluster_transfer(
203 vstruct_t vs,
204 upl_offset_t offset,
205 upl_size_t cnt,
206 upl_t upl);
207 vs_map_t vs_get_map_entry(
208 vstruct_t vs,
209 vm_offset_t offset);
210
211
212 default_pager_thread_t *
213 get_read_buffer( void )
214 {
215 int i;
216
217 DPT_LOCK(dpt_lock);
218 while(TRUE) {
219 for (i=0; i<default_pager_internal_count; i++) {
220 if(dpt_array[i]->checked_out == FALSE) {
221 dpt_array[i]->checked_out = TRUE;
222 DPT_UNLOCK(dpt_lock);
223 return dpt_array[i];
224 }
225 }
226 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
227 }
228 }
229
230 void
231 bs_initialize(void)
232 {
233 int i;
234
235 /*
236 * List of all backing store.
237 */
238 BSL_LOCK_INIT();
239 queue_init(&backing_store_list.bsl_queue);
240 PSL_LOCK_INIT();
241
242 VS_ASYNC_LOCK_INIT();
243 #if VS_ASYNC_REUSE
244 vs_async_free_list = NULL;
245 #endif /* VS_ASYNC_REUSE */
246
247 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
248 clustered_writes[i] = 0;
249 clustered_reads[i] = 0;
250 }
251
252 }
253
254 /*
255  * When things do not quite work out...
256 */
257 void bs_no_paging_space(boolean_t); /* forward */
258
259 void
260 bs_no_paging_space(
261 boolean_t out_of_memory)
262 {
263
264 if (out_of_memory)
265 dprintf(("*** OUT OF MEMORY ***\n"));
266 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
267 }
268
269 void bs_more_space(int); /* forward */
270 void bs_commit(int); /* forward */
271
272 boolean_t user_warned = FALSE;
273 unsigned int clusters_committed = 0;
274 unsigned int clusters_available = 0;
275 unsigned int clusters_committed_peak = 0;
276
277 void
278 bs_more_space(
279 int nclusters)
280 {
281 BSL_LOCK();
282 /*
283 * Account for new paging space.
284 */
285 clusters_available += nclusters;
286
287 if (clusters_available >= clusters_committed) {
288 if (verbose && user_warned) {
289 printf("%s%s - %d excess clusters now.\n",
290 my_name,
291 "paging space is OK now",
292 clusters_available - clusters_committed);
293 user_warned = FALSE;
294 clusters_committed_peak = 0;
295 }
296 } else {
297 if (verbose && user_warned) {
298 printf("%s%s - still short of %d clusters.\n",
299 my_name,
300 "WARNING: paging space over-committed",
301 clusters_committed - clusters_available);
302 clusters_committed_peak -= nclusters;
303 }
304 }
305 BSL_UNLOCK();
306
307 return;
308 }
309
310 void
311 bs_commit(
312 int nclusters)
313 {
314 BSL_LOCK();
315 clusters_committed += nclusters;
316 if (clusters_committed > clusters_available) {
317 if (verbose && !user_warned) {
318 user_warned = TRUE;
319 printf("%s%s - short of %d clusters.\n",
320 my_name,
321 "WARNING: paging space over-committed",
322 clusters_committed - clusters_available);
323 }
324 if (clusters_committed > clusters_committed_peak) {
325 clusters_committed_peak = clusters_committed;
326 }
327 } else {
328 if (verbose && user_warned) {
329 printf("%s%s - was short of up to %d clusters.\n",
330 my_name,
331 "paging space is OK now",
332 clusters_committed_peak - clusters_available);
333 user_warned = FALSE;
334 clusters_committed_peak = 0;
335 }
336 }
337 BSL_UNLOCK();
338
339 return;
340 }
341
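/*
 * Worked example (not part of the original file) of the accounting above:
 * with clusters_available = 100, committing three 40-cluster objects via
 * bs_commit() brings clusters_committed to 120 and triggers the
 * "over-committed ... short of 20 clusters" warning; a later
 * bs_more_space(30) raises clusters_available to 130 and reports
 * "paging space is OK now" with 10 excess clusters.
 */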
342 int default_pager_info_verbose = 1;
343
344 void
345 bs_global_info(
346 vm_size_t *totalp,
347 vm_size_t *freep)
348 {
349 vm_size_t pages_total, pages_free;
350 paging_segment_t ps;
351 int i;
352
353 PSL_LOCK();
354 pages_total = pages_free = 0;
355 for (i = 0; i <= paging_segment_max; i++) {
356 ps = paging_segments[i];
357 if (ps == PAGING_SEGMENT_NULL)
358 continue;
359
360 /*
361 * no need to lock: by the time this data
362 * gets back to any remote requestor it
363  * will be obsolete anyway
364 */
365 pages_total += ps->ps_pgnum;
366 pages_free += ps->ps_clcount << ps->ps_clshift;
367 DP_DEBUG(DEBUG_BS_INTERNAL,
368 ("segment #%d: %d total, %d free\n",
369 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
370 }
371 *totalp = pages_total;
372 *freep = pages_free;
373 if (verbose && user_warned && default_pager_info_verbose) {
374 if (clusters_available < clusters_committed) {
375 printf("%s %d clusters committed, %d available.\n",
376 my_name,
377 clusters_committed,
378 clusters_available);
379 }
380 }
381 PSL_UNLOCK();
382 }
383
384 backing_store_t backing_store_alloc(void); /* forward */
385
386 backing_store_t
387 backing_store_alloc(void)
388 {
389 backing_store_t bs;
390
391 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
392 if (bs == BACKING_STORE_NULL)
393 panic("backing_store_alloc: no memory");
394
395 BS_LOCK_INIT(bs);
396 bs->bs_port = MACH_PORT_NULL;
397 bs->bs_priority = 0;
398 bs->bs_clsize = 0;
399 bs->bs_pages_total = 0;
400 bs->bs_pages_in = 0;
401 bs->bs_pages_in_fail = 0;
402 bs->bs_pages_out = 0;
403 bs->bs_pages_out_fail = 0;
404
405 return bs;
406 }
407
408 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
409
410 /* In both the component-space and external versions of this pager, */
411 /* backing_store_lookup will be called from tasks in the application space. */
412 backing_store_t
413 backing_store_lookup(
414 MACH_PORT_FACE port)
415 {
416 backing_store_t bs;
417
418 /*
419 	port is currently backed with a vs structure in the alias field.
420 	We could create an ISBS alias and a port_is_bs call, but frankly
421 	I see no reason for the test; the bs->port == port check below
422 	will work properly on junk entries.
423
424 if ((port == MACH_PORT_NULL) || port_is_vs(port))
425 */
426 if ((port == MACH_PORT_NULL))
427 return BACKING_STORE_NULL;
428
429 BSL_LOCK();
430 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
431 bs_links) {
432 BS_LOCK(bs);
433 if (bs->bs_port == port) {
434 BSL_UNLOCK();
435 /* Success, return it locked. */
436 return bs;
437 }
438 BS_UNLOCK(bs);
439 }
440 BSL_UNLOCK();
441 return BACKING_STORE_NULL;
442 }
443
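/*
 * Illustrative sketch (not part of the original file): a successful
 * backing_store_lookup() returns the backing store locked, so callers
 * must BS_UNLOCK() it when done, as default_pager_backing_store_info()
 * and default_pager_backing_store_delete() do below.
 */
#if 0
static void
example_backing_store_lookup_usage(MACH_PORT_FACE port)
{
	backing_store_t bs;

	if ((bs = backing_store_lookup(port)) == BACKING_STORE_NULL)
		return;			/* not a known backing store port */
	/* ... read or update fields of *bs ... */
	BS_UNLOCK(bs);
}
#endif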
444 void backing_store_add(backing_store_t); /* forward */
445
446 void
447 backing_store_add(
448 __unused backing_store_t bs)
449 {
450 // MACH_PORT_FACE port = bs->bs_port;
451 // MACH_PORT_FACE pset = default_pager_default_set;
452 kern_return_t kr = KERN_SUCCESS;
453
454 if (kr != KERN_SUCCESS)
455 panic("backing_store_add: add to set");
456
457 }
458
459 /*
460 * Set up default page shift, but only if not already
461 * set and argument is within range.
462 */
463 boolean_t
464 bs_set_default_clsize(unsigned int npages)
465 {
466 switch(npages){
467 case 1:
468 case 2:
469 case 4:
470 case 8:
471 if (default_pager_clsize == 0) /* if not yet set */
472 vstruct_def_clshift = local_log2(npages);
473 return(TRUE);
474 }
475 return(FALSE);
476 }
477
478 int bs_get_global_clsize(int clsize); /* forward */
479
480 int
481 bs_get_global_clsize(
482 int clsize)
483 {
484 int i;
485 memory_object_default_t dmm;
486 kern_return_t kr;
487
488 /*
489 * Only allow setting of cluster size once. If called
490 * with no cluster size (default), we use the compiled-in default
491 * for the duration. The same cluster size is used for all
492 * paging segments.
493 */
494 if (default_pager_clsize == 0) {
495 /*
496 		 * Keep the cluster size as a bit shift: the arithmetic is
497 		 * quicker and it is easier to keep at a power of 2.
498 */
499 if (clsize != NO_CLSIZE) {
500 for (i = 0; (1 << i) < clsize; i++);
501 if (i > MAX_CLUSTER_SHIFT)
502 i = MAX_CLUSTER_SHIFT;
503 vstruct_def_clshift = i;
504 }
505 default_pager_clsize = (1 << vstruct_def_clshift);
506
507 /*
508 * Let the user know the new (and definitive) cluster size.
509 */
510 if (verbose)
511 printf("%scluster size = %d page%s\n",
512 my_name, default_pager_clsize,
513 (default_pager_clsize == 1) ? "" : "s");
514
515 /*
516 * Let the kernel know too, in case it hasn't used the
517 * default value provided in main() yet.
518 */
519 dmm = default_pager_object;
520 clsize = default_pager_clsize * vm_page_size; /* in bytes */
521 kr = host_default_memory_manager(host_priv_self(),
522 &dmm,
523 clsize);
524 memory_object_default_deallocate(dmm);
525
526 if (kr != KERN_SUCCESS) {
527 panic("bs_get_global_cl_size:host_default_memory_manager");
528 }
529 if (dmm != default_pager_object) {
530 panic("bs_get_global_cl_size:there is another default pager");
531 }
532 }
533 ASSERT(default_pager_clsize > 0 &&
534 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
535
536 return default_pager_clsize;
537 }
538
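/*
 * Illustrative sketch (not part of the original file) of the rounding
 * performed in bs_get_global_clsize() above: a requested cluster size of
 * 3 pages yields a shift of 2 (4 pages/cluster), as does a request of 4;
 * anything larger than 1 << MAX_CLUSTER_SHIFT is capped.
 */
#if 0
static int
example_round_clsize_to_shift(int clsize)
{
	int i;

	for (i = 0; (1 << i) < clsize; i++)
		continue;
	if (i > MAX_CLUSTER_SHIFT)
		i = MAX_CLUSTER_SHIFT;
	return i;			/* pages per cluster == 1 << i */
}
#endif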
539 kern_return_t
540 default_pager_backing_store_create(
541 memory_object_default_t pager,
542 int priority,
543 int clsize, /* in bytes */
544 MACH_PORT_FACE *backing_store)
545 {
546 backing_store_t bs;
547 MACH_PORT_FACE port;
548 // kern_return_t kr;
549 struct vstruct_alias *alias_struct;
550
551 if (pager != default_pager_object)
552 return KERN_INVALID_ARGUMENT;
553
554 bs = backing_store_alloc();
555 port = ipc_port_alloc_kernel();
556 ipc_port_make_send(port);
557 assert (port != IP_NULL);
558
559 DP_DEBUG(DEBUG_BS_EXTERNAL,
560 ("priority=%d clsize=%d bs_port=0x%x\n",
561 priority, clsize, (int) backing_store));
562
563 alias_struct = (struct vstruct_alias *)
564 kalloc(sizeof (struct vstruct_alias));
565 if(alias_struct != NULL) {
566 alias_struct->vs = (struct vstruct *)bs;
567 alias_struct->name = ISVS;
568 port->alias = (int) alias_struct;
569 }
570 else {
571 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
572 kfree(bs, sizeof (struct backing_store));
573 return KERN_RESOURCE_SHORTAGE;
574 }
575
576 bs->bs_port = port;
577 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
578 priority = BS_MAXPRI;
579 else if (priority == BS_NOPRI)
580 priority = BS_MAXPRI;
581 else
582 priority = BS_MINPRI;
583 bs->bs_priority = priority;
584
585 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
586
587 BSL_LOCK();
588 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
589 bs_links);
590 BSL_UNLOCK();
591
592 backing_store_add(bs);
593
594 *backing_store = port;
595 return KERN_SUCCESS;
596 }
597
598 kern_return_t
599 default_pager_backing_store_info(
600 MACH_PORT_FACE backing_store,
601 backing_store_flavor_t flavour,
602 backing_store_info_t info,
603 mach_msg_type_number_t *size)
604 {
605 backing_store_t bs;
606 backing_store_basic_info_t basic;
607 int i;
608 paging_segment_t ps;
609
610 if (flavour != BACKING_STORE_BASIC_INFO ||
611 *size < BACKING_STORE_BASIC_INFO_COUNT)
612 return KERN_INVALID_ARGUMENT;
613
614 basic = (backing_store_basic_info_t)info;
615 *size = BACKING_STORE_BASIC_INFO_COUNT;
616
617 VSTATS_LOCK(&global_stats.gs_lock);
618 basic->pageout_calls = global_stats.gs_pageout_calls;
619 basic->pagein_calls = global_stats.gs_pagein_calls;
620 basic->pages_in = global_stats.gs_pages_in;
621 basic->pages_out = global_stats.gs_pages_out;
622 basic->pages_unavail = global_stats.gs_pages_unavail;
623 basic->pages_init = global_stats.gs_pages_init;
624 basic->pages_init_writes= global_stats.gs_pages_init_writes;
625 VSTATS_UNLOCK(&global_stats.gs_lock);
626
627 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
628 return KERN_INVALID_ARGUMENT;
629
630 basic->bs_pages_total = bs->bs_pages_total;
631 PSL_LOCK();
632 bs->bs_pages_free = 0;
633 for (i = 0; i <= paging_segment_max; i++) {
634 ps = paging_segments[i];
635 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
636 PS_LOCK(ps);
637 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
638 PS_UNLOCK(ps);
639 }
640 }
641 PSL_UNLOCK();
642 basic->bs_pages_free = bs->bs_pages_free;
643 basic->bs_pages_in = bs->bs_pages_in;
644 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
645 basic->bs_pages_out = bs->bs_pages_out;
646 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
647
648 basic->bs_priority = bs->bs_priority;
649 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
650
651 BS_UNLOCK(bs);
652
653 return KERN_SUCCESS;
654 }
655
656 int ps_delete(paging_segment_t); /* forward */
657
658 int
659 ps_delete(
660 paging_segment_t ps)
661 {
662 vstruct_t vs;
663 kern_return_t error = KERN_SUCCESS;
664 int vs_count;
665
666 VSL_LOCK(); /* get the lock on the list of vs's */
667
668 	/* The locking relationship and sequence are fairly complicated. */
669 /* this code looks at a live list, locking and unlocking the list */
670 /* as it traverses it. It depends on the locking behavior of */
671 /* default_pager_no_senders. no_senders always locks the vstruct */
672 /* targeted for removal before locking the vstruct list. However */
673 /* it will remove that member of the list without locking its */
674 /* neighbors. We can be sure when we hold a lock on a vstruct */
675 /* it cannot be removed from the list but we must hold the list */
676 /* lock to be sure that its pointers to its neighbors are valid. */
677 /* Also, we can hold off destruction of a vstruct when the list */
678 /* lock and the vs locks are not being held by bumping the */
679 /* vs_async_pending count. */
680
681
682 while(backing_store_release_trigger_disable != 0) {
683 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
684 }
685
686 /* we will choose instead to hold a send right */
687 vs_count = vstruct_list.vsl_count;
688 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
689 if(vs == (vstruct_t)&vstruct_list) {
690 VSL_UNLOCK();
691 return KERN_SUCCESS;
692 }
693 VS_LOCK(vs);
694 vs_async_wait(vs); /* wait for any pending async writes */
695 if ((vs_count != 0) && (vs != NULL))
696 vs->vs_async_pending += 1; /* hold parties calling */
697 /* vs_async_wait */
698 VS_UNLOCK(vs);
699 VSL_UNLOCK();
700 while((vs_count != 0) && (vs != NULL)) {
701 /* We take the count of AMO's before beginning the */
702 		/* transfer of the target segment. */
703 /* We are guaranteed that the target segment cannot get */
704 /* more users. We also know that queue entries are */
705 /* made at the back of the list. If some of the entries */
706 /* we would check disappear while we are traversing the */
707 /* list then we will either check new entries which */
708 /* do not have any backing store in the target segment */
709 /* or re-check old entries. This might not be optimal */
710 /* but it will always be correct. The alternative is to */
711 /* take a snapshot of the list. */
712 vstruct_t next_vs;
713
714 if(dp_pages_free < cluster_transfer_minimum)
715 error = KERN_FAILURE;
716 else {
717 vm_object_t transfer_object;
718 int count;
719 upl_t upl;
720
721 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
722 count = 0;
723 error = vm_object_upl_request(transfer_object,
724 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
725 &upl, NULL, &count,
726 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
727 | UPL_SET_INTERNAL);
728 if(error == KERN_SUCCESS) {
729 error = ps_vstruct_transfer_from_segment(
730 vs, ps, upl);
731 upl_commit(upl, NULL, 0);
732 upl_deallocate(upl);
733 } else {
734 error = KERN_FAILURE;
735 }
736 vm_object_deallocate(transfer_object);
737 }
738 if(error) {
739 VS_LOCK(vs);
740 vs->vs_async_pending -= 1; /* release vs_async_wait */
741 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
742 vs->vs_waiting_async = FALSE;
743 VS_UNLOCK(vs);
744 thread_wakeup(&vs->vs_async_pending);
745 } else {
746 VS_UNLOCK(vs);
747 }
748 return KERN_FAILURE;
749 }
750
751 VSL_LOCK();
752
753 while(backing_store_release_trigger_disable != 0) {
754 VSL_SLEEP(&backing_store_release_trigger_disable,
755 THREAD_UNINT);
756 }
757
758 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
759 if((next_vs != (vstruct_t)&vstruct_list) &&
760 (vs != next_vs) && (vs_count != 1)) {
761 VS_LOCK(next_vs);
762 vs_async_wait(next_vs); /* wait for any */
763 /* pending async writes */
764 next_vs->vs_async_pending += 1; /* hold parties */
765 /* calling vs_async_wait */
766 VS_UNLOCK(next_vs);
767 }
768 VSL_UNLOCK();
769 VS_LOCK(vs);
770 vs->vs_async_pending -= 1;
771 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
772 vs->vs_waiting_async = FALSE;
773 VS_UNLOCK(vs);
774 thread_wakeup(&vs->vs_async_pending);
775 } else {
776 VS_UNLOCK(vs);
777 }
778 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
779 vs = NULL;
780 else
781 vs = next_vs;
782 vs_count--;
783 }
784 return KERN_SUCCESS;
785 }
786
787
788 kern_return_t
789 default_pager_backing_store_delete(
790 MACH_PORT_FACE backing_store)
791 {
792 backing_store_t bs;
793 int i;
794 paging_segment_t ps;
795 int error;
796 int interim_pages_removed = 0;
797 // kern_return_t kr;
798
799 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
800 return KERN_INVALID_ARGUMENT;
801
802 #if 0
803 /* not implemented */
804 BS_UNLOCK(bs);
805 return KERN_FAILURE;
806 #endif
807
808 restart:
809 PSL_LOCK();
810 error = KERN_SUCCESS;
811 for (i = 0; i <= paging_segment_max; i++) {
812 ps = paging_segments[i];
813 if (ps != PAGING_SEGMENT_NULL &&
814 ps->ps_bs == bs &&
815 ! ps->ps_going_away) {
816 PS_LOCK(ps);
817 /* disable access to this segment */
818 ps->ps_going_away = TRUE;
819 PS_UNLOCK(ps);
820 /*
821 * The "ps" segment is "off-line" now,
822 * we can try and delete it...
823 */
824 if(dp_pages_free < (cluster_transfer_minimum
825 + ps->ps_pgcount)) {
826 error = KERN_FAILURE;
827 PSL_UNLOCK();
828 }
829 else {
830 /* remove all pages associated with the */
831 /* segment from the list of free pages */
832 /* when transfer is through, all target */
833 /* segment pages will appear to be free */
834
835 dp_pages_free -= ps->ps_pgcount;
836 interim_pages_removed += ps->ps_pgcount;
837 PSL_UNLOCK();
838 error = ps_delete(ps);
839 }
840 if (error != KERN_SUCCESS) {
841 /*
842 * We couldn't delete the segment,
843 * probably because there's not enough
844 * virtual memory left.
845 * Re-enable all the segments.
846 */
847 PSL_LOCK();
848 break;
849 }
850 goto restart;
851 }
852 }
853
854 if (error != KERN_SUCCESS) {
855 for (i = 0; i <= paging_segment_max; i++) {
856 ps = paging_segments[i];
857 if (ps != PAGING_SEGMENT_NULL &&
858 ps->ps_bs == bs &&
859 ps->ps_going_away) {
860 PS_LOCK(ps);
861 /* re-enable access to this segment */
862 ps->ps_going_away = FALSE;
863 PS_UNLOCK(ps);
864 }
865 }
866 dp_pages_free += interim_pages_removed;
867 PSL_UNLOCK();
868 BS_UNLOCK(bs);
869 return error;
870 }
871
872 for (i = 0; i <= paging_segment_max; i++) {
873 ps = paging_segments[i];
874 if (ps != PAGING_SEGMENT_NULL &&
875 ps->ps_bs == bs) {
876 if(ps->ps_going_away) {
877 paging_segments[i] = PAGING_SEGMENT_NULL;
878 paging_segment_count--;
879 PS_LOCK(ps);
880 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
881 kfree(ps, sizeof *ps);
882 }
883 }
884 }
885
886 /* Scan the entire ps array separately to make certain we find the */
887 /* proper paging_segment_max */
888 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
889 if(paging_segments[i] != PAGING_SEGMENT_NULL)
890 paging_segment_max = i;
891 }
892
893 PSL_UNLOCK();
894
895 /*
896 * All the segments have been deleted.
897 * We can remove the backing store.
898 */
899
900 /*
901 * Disable lookups of this backing store.
902 */
903 if((void *)bs->bs_port->alias != NULL)
904 kfree((void *) bs->bs_port->alias,
905 sizeof (struct vstruct_alias));
906 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
907 bs->bs_port = MACH_PORT_NULL;
908 BS_UNLOCK(bs);
909
910 /*
911 * Remove backing store from backing_store list.
912 */
913 BSL_LOCK();
914 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
915 bs_links);
916 BSL_UNLOCK();
917
918 /*
919 * Free the backing store structure.
920 */
921 kfree(bs, sizeof *bs);
922
923 return KERN_SUCCESS;
924 }
925
926 int ps_enter(paging_segment_t); /* forward */
927
928 int
929 ps_enter(
930 paging_segment_t ps)
931 {
932 int i;
933
934 PSL_LOCK();
935
936 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
937 if (paging_segments[i] == PAGING_SEGMENT_NULL)
938 break;
939 }
940
941 if (i < MAX_NUM_PAGING_SEGMENTS) {
942 paging_segments[i] = ps;
943 if (i > paging_segment_max)
944 paging_segment_max = i;
945 paging_segment_count++;
946 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
947 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
948 ps_select_array[ps->ps_bs->bs_priority] = 0;
949 i = 0;
950 } else {
951 PSL_UNLOCK();
952 return KERN_RESOURCE_SHORTAGE;
953 }
954
955 PSL_UNLOCK();
956 return i;
957 }
958
959 #ifdef DEVICE_PAGING
960 kern_return_t
961 default_pager_add_segment(
962 MACH_PORT_FACE backing_store,
963 MACH_PORT_FACE device,
964 recnum_t offset,
965 recnum_t count,
966 int record_size)
967 {
968 backing_store_t bs;
969 paging_segment_t ps;
970 int i;
971 int error;
972
973 if ((bs = backing_store_lookup(backing_store))
974 == BACKING_STORE_NULL)
975 return KERN_INVALID_ARGUMENT;
976
977 PSL_LOCK();
978 for (i = 0; i <= paging_segment_max; i++) {
979 ps = paging_segments[i];
980 if (ps == PAGING_SEGMENT_NULL)
981 continue;
982
983 /*
984 * Check for overlap on same device.
985 */
986 if (!(ps->ps_device != device
987 || offset >= ps->ps_offset + ps->ps_recnum
988 || offset + count <= ps->ps_offset)) {
989 PSL_UNLOCK();
990 BS_UNLOCK(bs);
991 return KERN_INVALID_ARGUMENT;
992 }
993 }
994 PSL_UNLOCK();
995
996 /*
997 * Set up the paging segment
998 */
999 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1000 if (ps == PAGING_SEGMENT_NULL) {
1001 BS_UNLOCK(bs);
1002 return KERN_RESOURCE_SHORTAGE;
1003 }
1004
1005 ps->ps_segtype = PS_PARTITION;
1006 ps->ps_device = device;
1007 ps->ps_offset = offset;
1008 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1009 ps->ps_recnum = count;
1010 ps->ps_pgnum = count >> ps->ps_record_shift;
1011
1012 ps->ps_pgcount = ps->ps_pgnum;
1013 ps->ps_clshift = local_log2(bs->bs_clsize);
1014 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1015 ps->ps_hint = 0;
1016
1017 PS_LOCK_INIT(ps);
1018 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1019 if (!ps->ps_bmap) {
1020 kfree(ps, sizeof *ps);
1021 BS_UNLOCK(bs);
1022 return KERN_RESOURCE_SHORTAGE;
1023 }
1024 for (i = 0; i < ps->ps_ncls; i++) {
1025 clrbit(ps->ps_bmap, i);
1026 }
1027
1028 ps->ps_going_away = FALSE;
1029 ps->ps_bs = bs;
1030
1031 if ((error = ps_enter(ps)) != 0) {
1032 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1033 kfree(ps, sizeof *ps);
1034 BS_UNLOCK(bs);
1035 return KERN_RESOURCE_SHORTAGE;
1036 }
1037
1038 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1039 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1040 BS_UNLOCK(bs);
1041
1042 PSL_LOCK();
1043 dp_pages_free += ps->ps_pgcount;
1044 PSL_UNLOCK();
1045
1046 bs_more_space(ps->ps_clcount);
1047
1048 DP_DEBUG(DEBUG_BS_INTERNAL,
1049 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1050 device, offset, count, record_size,
1051 ps->ps_record_shift, ps->ps_pgnum));
1052
1053 return KERN_SUCCESS;
1054 }
1055
1056 boolean_t
1057 bs_add_device(
1058 char *dev_name,
1059 MACH_PORT_FACE master)
1060 {
1061 security_token_t null_security_token = {
1062 { 0, 0 }
1063 };
1064 MACH_PORT_FACE device;
1065 int info[DEV_GET_SIZE_COUNT];
1066 mach_msg_type_number_t info_count;
1067 MACH_PORT_FACE bs = MACH_PORT_NULL;
1068 unsigned int rec_size;
1069 recnum_t count;
1070 int clsize;
1071 MACH_PORT_FACE reply_port;
1072
1073 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1074 null_security_token, dev_name, &device))
1075 return FALSE;
1076
1077 info_count = DEV_GET_SIZE_COUNT;
1078 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1079 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1080 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1081 clsize = bs_get_global_clsize(0);
1082 if (!default_pager_backing_store_create(
1083 default_pager_object,
1084 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1085 (clsize * vm_page_size),
1086 &bs)) {
1087 if (!default_pager_add_segment(bs, device,
1088 0, count, rec_size)) {
1089 return TRUE;
1090 }
1091 ipc_port_release_receive(bs);
1092 }
1093 }
1094
1095 ipc_port_release_send(device);
1096 return FALSE;
1097 }
1098 #endif /* DEVICE_PAGING */
1099
1100 #if VS_ASYNC_REUSE
1101
1102 struct vs_async *
1103 vs_alloc_async(void)
1104 {
1105 struct vs_async *vsa;
1106 MACH_PORT_FACE reply_port;
1107 // kern_return_t kr;
1108
1109 VS_ASYNC_LOCK();
1110 if (vs_async_free_list == NULL) {
1111 VS_ASYNC_UNLOCK();
1112 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1113 if (vsa != NULL) {
1114 /*
1115 * Try allocating a reply port named after the
1116 * address of the vs_async structure.
1117 */
1118 struct vstruct_alias *alias_struct;
1119
1120 reply_port = ipc_port_alloc_kernel();
1121 alias_struct = (struct vstruct_alias *)
1122 kalloc(sizeof (struct vstruct_alias));
1123 if(alias_struct != NULL) {
1124 alias_struct->vs = (struct vstruct *)vsa;
1125 alias_struct->name = ISVS;
1126 reply_port->alias = (int) alias_struct;
1127 vsa->reply_port = reply_port;
1128 vs_alloc_async_count++;
1129 }
1130 else {
1131 vs_alloc_async_failed++;
1132 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1133 (reply_port));
1134 kfree(vsa, sizeof (struct vs_async));
1135 vsa = NULL;
1136 }
1137 }
1138 } else {
1139 vsa = vs_async_free_list;
1140 vs_async_free_list = vs_async_free_list->vsa_next;
1141 VS_ASYNC_UNLOCK();
1142 }
1143
1144 return vsa;
1145 }
1146
1147 void
1148 vs_free_async(
1149 struct vs_async *vsa)
1150 {
1151 VS_ASYNC_LOCK();
1152 vsa->vsa_next = vs_async_free_list;
1153 vs_async_free_list = vsa;
1154 VS_ASYNC_UNLOCK();
1155 }
1156
1157 #else /* VS_ASYNC_REUSE */
1158
1159 struct vs_async *
1160 vs_alloc_async(void)
1161 {
1162 struct vs_async *vsa;
1163 MACH_PORT_FACE reply_port;
1164 	kern_return_t kr;
	struct vstruct_alias *alias_struct;
1165
1166 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1167 if (vsa != NULL) {
1168 /*
1169 * Try allocating a reply port named after the
1170 * address of the vs_async structure.
1171 */
1172 reply_port = ipc_port_alloc_kernel();
1173 		alias_struct = (struct vstruct_alias *)
1174 			kalloc(sizeof (struct vstruct_alias));
1175 		if(alias_struct != NULL) {
1176 			alias_struct->vs = (struct vstruct *)vsa;
1177 			alias_struct->name = ISVS;
1178 			reply_port->alias = (int) alias_struct;
1179 vsa->reply_port = reply_port;
1180 vs_alloc_async_count++;
1181 }
1182 else {
1183 vs_alloc_async_failed++;
1184 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1185 (reply_port));
1186 kfree(vsa, sizeof (struct vs_async));
1187 vsa = NULL;
1188 }
1189 }
1190
1191 return vsa;
1192 }
1193
1194 void
1195 vs_free_async(
1196 struct vs_async *vsa)
1197 {
1198 MACH_PORT_FACE reply_port;
1199 kern_return_t kr;
1200
1201 reply_port = vsa->reply_port;
1202 	kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1203 kfree(vsa, sizeof (struct vs_async));
1204 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1205 #if 0
1206 VS_ASYNC_LOCK();
1207 vs_alloc_async_count--;
1208 VS_ASYNC_UNLOCK();
1209 #endif
1210 }
1211
1212 #endif /* VS_ASYNC_REUSE */
1213
1214 zone_t vstruct_zone;
1215
1216 vstruct_t
1217 ps_vstruct_create(
1218 vm_size_t size)
1219 {
1220 vstruct_t vs;
1221 unsigned int i;
1222
1223 vs = (vstruct_t) zalloc(vstruct_zone);
1224 if (vs == VSTRUCT_NULL) {
1225 return VSTRUCT_NULL;
1226 }
1227
1228 VS_LOCK_INIT(vs);
1229
1230 /*
1231 * The following fields will be provided later.
1232 */
1233 vs->vs_mem_obj = NULL;
1234 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1235 vs->vs_references = 1;
1236 vs->vs_seqno = 0;
1237
1238 #ifdef MACH_KERNEL
1239 vs->vs_waiting_seqno = FALSE;
1240 vs->vs_waiting_read = FALSE;
1241 vs->vs_waiting_write = FALSE;
1242 vs->vs_waiting_async = FALSE;
1243 #else
1244 mutex_init(&vs->vs_waiting_seqno, 0);
1245 mutex_init(&vs->vs_waiting_read, 0);
1246 mutex_init(&vs->vs_waiting_write, 0);
1247 mutex_init(&vs->vs_waiting_refs, 0);
1248 mutex_init(&vs->vs_waiting_async, 0);
1249 #endif
1250
1251 vs->vs_readers = 0;
1252 vs->vs_writers = 0;
1253
1254 vs->vs_errors = 0;
1255
1256 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1257 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1258 vs->vs_async_pending = 0;
1259
1260 /*
1261 	 * Allocate the cluster map, of either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1262 * depending on the size of the memory object.
1263 */
1264 if (INDIRECT_CLMAP(vs->vs_size)) {
1265 vs->vs_imap = (struct vs_map **)
1266 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1267 vs->vs_indirect = TRUE;
1268 } else {
1269 vs->vs_dmap = (struct vs_map *)
1270 kalloc(CLMAP_SIZE(vs->vs_size));
1271 vs->vs_indirect = FALSE;
1272 }
1273 vs->vs_xfer_pending = FALSE;
1274 DP_DEBUG(DEBUG_VS_INTERNAL,
1275 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1276
1277 /*
1278 * Check to see that we got the space.
1279 */
1280 if (!vs->vs_dmap) {
1281 kfree(vs, sizeof *vs);
1282 return VSTRUCT_NULL;
1283 }
1284
1285 /*
1286 * Zero the indirect pointers, or clear the direct pointers.
1287 */
1288 if (vs->vs_indirect)
1289 memset(vs->vs_imap, 0,
1290 INDIRECT_CLMAP_SIZE(vs->vs_size));
1291 else
1292 for (i = 0; i < vs->vs_size; i++)
1293 VSM_CLR(vs->vs_dmap[i]);
1294
1295 VS_MAP_LOCK_INIT(vs);
1296
1297 bs_commit(vs->vs_size);
1298
1299 return vs;
1300 }
1301
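/*
 * Illustrative sketch (not part of the original file): the vs_size
 * computation in ps_vstruct_create() above is the object size in
 * clusters, rounded up.  For a 72KB object with 4KB pages and a cluster
 * shift of 2 (16KB clusters): 18 pages, ((18 - 1) >> 2) + 1 = 5 clusters.
 */
#if 0
static vm_size_t
example_object_size_in_clusters(vm_size_t size, int clshift)
{
	return ((atop_32(round_page_32(size)) - 1) >> clshift) + 1;
}
#endif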
1302 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1303
1304 paging_segment_t
1305 ps_select_segment(
1306 unsigned int shift,
1307 int *psindex)
1308 {
1309 paging_segment_t ps;
1310 int i;
1311 int j;
1312
1313 /*
1314 * Optimize case where there's only one segment.
1315 * paging_segment_max will index the one and only segment.
1316 */
1317
1318 PSL_LOCK();
1319 if (paging_segment_count == 1) {
1320 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1321 ipc_port_t trigger = IP_NULL;
1322
1323 ps = paging_segments[paging_segment_max];
1324 *psindex = paging_segment_max;
1325 PS_LOCK(ps);
1326 if (ps->ps_going_away) {
1327 /* this segment is being turned off */
1328 lps = PAGING_SEGMENT_NULL;
1329 } else {
1330 ASSERT(ps->ps_clshift >= shift);
1331 if (ps->ps_clcount) {
1332 ps->ps_clcount--;
1333 dp_pages_free -= 1 << ps->ps_clshift;
1334 if(min_pages_trigger_port &&
1335 (dp_pages_free < minimum_pages_remaining)) {
1336 trigger = min_pages_trigger_port;
1337 min_pages_trigger_port = NULL;
1338 bs_low = TRUE;
1339 }
1340 lps = ps;
1341 } else
1342 lps = PAGING_SEGMENT_NULL;
1343 }
1344 PS_UNLOCK(ps);
1345 PSL_UNLOCK();
1346
1347 if (trigger != IP_NULL) {
1348 default_pager_space_alert(trigger, HI_WAT_ALERT);
1349 ipc_port_release_send(trigger);
1350 }
1351 return lps;
1352 }
1353
1354 if (paging_segment_count == 0) {
1355 PSL_UNLOCK();
1356 return PAGING_SEGMENT_NULL;
1357 }
1358
1359 for (i = BS_MAXPRI;
1360 i >= BS_MINPRI; i--) {
1361 int start_index;
1362
1363 if ((ps_select_array[i] == BS_NOPRI) ||
1364 (ps_select_array[i] == BS_FULLPRI))
1365 continue;
1366 start_index = ps_select_array[i];
1367
1368 if(!(paging_segments[start_index])) {
1369 j = start_index+1;
1370 physical_transfer_cluster_count = 0;
1371 }
1372 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1373 (((paging_segments[start_index])->ps_clshift)
1374 + vm_page_shift))) {
1375 physical_transfer_cluster_count = 0;
1376 j = start_index + 1;
1377 } else {
1378 physical_transfer_cluster_count+=1;
1379 j = start_index;
1380 if(start_index == 0)
1381 start_index = paging_segment_max;
1382 else
1383 start_index = start_index - 1;
1384 }
1385
1386 while (1) {
1387 if (j > paging_segment_max)
1388 j = 0;
1389 if ((ps = paging_segments[j]) &&
1390 (ps->ps_bs->bs_priority == i)) {
1391 /*
1392 * Force the ps cluster size to be
1393 * >= that of the vstruct.
1394 */
1395 PS_LOCK(ps);
1396 if (ps->ps_going_away) {
1397 /* this segment is being turned off */
1398 } else if ((ps->ps_clcount) &&
1399 (ps->ps_clshift >= shift)) {
1400 ipc_port_t trigger = IP_NULL;
1401
1402 ps->ps_clcount--;
1403 dp_pages_free -= 1 << ps->ps_clshift;
1404 if(min_pages_trigger_port &&
1405 (dp_pages_free <
1406 minimum_pages_remaining)) {
1407 trigger = min_pages_trigger_port;
1408 min_pages_trigger_port = NULL;
1409 }
1410 PS_UNLOCK(ps);
1411 /*
1412 * found one, quit looking.
1413 */
1414 ps_select_array[i] = j;
1415 PSL_UNLOCK();
1416
1417 if (trigger != IP_NULL) {
1418 default_pager_space_alert(
1419 trigger,
1420 HI_WAT_ALERT);
1421 ipc_port_release_send(trigger);
1422 }
1423 *psindex = j;
1424 return ps;
1425 }
1426 PS_UNLOCK(ps);
1427 }
1428 if (j == start_index) {
1429 /*
1430 * none at this priority -- mark it full
1431 */
1432 ps_select_array[i] = BS_FULLPRI;
1433 break;
1434 }
1435 j++;
1436 }
1437 }
1438 PSL_UNLOCK();
1439 return PAGING_SEGMENT_NULL;
1440 }
1441
1442 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1443
1444 vm_offset_t
1445 ps_allocate_cluster(
1446 vstruct_t vs,
1447 int *psindex,
1448 paging_segment_t use_ps)
1449 {
1450 unsigned int byte_num;
1451 int bit_num = 0;
1452 paging_segment_t ps;
1453 vm_offset_t cluster;
1454 ipc_port_t trigger = IP_NULL;
1455
1456 /*
1457 * Find best paging segment.
1458 * ps_select_segment will decrement cluster count on ps.
1459 * Must pass cluster shift to find the most appropriate segment.
1460 */
1461 /* NOTE: The addition of paging segment delete capability threatened
1462 * to seriously complicate the treatment of paging segments in this
1463 * module and the ones that call it (notably ps_clmap), because of the
1464 * difficulty in assuring that the paging segment would continue to
1465 * exist between being unlocked and locked. This was
1466 	 * avoided because all calls to this module are based either on
1467 	 * dp_memory_object calls, which rely on the vs lock, or on
1468 	 * the transfer function, which is part of the segment delete path.
1469 * The transfer function which is part of paging segment delete is
1470 * protected from multiple callers by the backing store lock.
1471 * The paging segment delete function treats mappings to a paging
1472 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1473 * while data is transferred to the remaining segments. This is in
1474 * line with the view that incomplete or in-transition mappings between
1475 * data, a vstruct, and backing store are protected by the vs lock.
1476 * This and the ordering of the paging segment "going_away" bit setting
1477 * protects us.
1478 */
1479 if (use_ps != PAGING_SEGMENT_NULL) {
1480 ps = use_ps;
1481 PSL_LOCK();
1482 PS_LOCK(ps);
1483
1484 ASSERT(ps->ps_clcount != 0);
1485
1486 ps->ps_clcount--;
1487 dp_pages_free -= 1 << ps->ps_clshift;
1488 if(min_pages_trigger_port &&
1489 (dp_pages_free < minimum_pages_remaining)) {
1490 trigger = min_pages_trigger_port;
1491 min_pages_trigger_port = NULL;
1492 }
1493 PSL_UNLOCK();
1494 PS_UNLOCK(ps);
1495 if (trigger != IP_NULL) {
1496 default_pager_space_alert(trigger, HI_WAT_ALERT);
1497 ipc_port_release_send(trigger);
1498 }
1499
1500 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1501 PAGING_SEGMENT_NULL) {
1502 static uint32_t lastnotify = 0;
1503 uint32_t now, nanoseconds_dummy;
1504
1505 /*
1506 * Emit a notification of the low-paging resource condition
1507 * but don't issue it more than once every five seconds. This
1508 * prevents us from overflowing logs with thousands of
1509 * repetitions of the message.
1510 */
1511 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1512 if (now > lastnotify + 5) {
1513 dprintf(("no space in available paging segments\n"));
1514 lastnotify = now;
1515 }
1516
1517 	/* the count may have drifted; reset it to zero */
1518 PSL_LOCK();
1519 dp_pages_free = 0;
1520 if(min_pages_trigger_port) {
1521 trigger = min_pages_trigger_port;
1522 min_pages_trigger_port = NULL;
1523 bs_low = TRUE;
1524 }
1525 PSL_UNLOCK();
1526 if (trigger != IP_NULL) {
1527 default_pager_space_alert(trigger, HI_WAT_ALERT);
1528 ipc_port_release_send(trigger);
1529 }
1530 return (vm_offset_t) -1;
1531 }
1532
1533 /*
1534 * Look for an available cluster. At the end of the loop,
1535 * byte_num is the byte offset and bit_num is the bit offset of the
1536 * first zero bit in the paging segment bitmap.
1537 */
1538 PS_LOCK(ps);
1539 byte_num = ps->ps_hint;
1540 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1541 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1542 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1543 if (isclr((ps->ps_bmap + byte_num), bit_num))
1544 break;
1545 }
1546 ASSERT(bit_num != NBBY);
1547 break;
1548 }
1549 }
1550 ps->ps_hint = byte_num;
1551 cluster = (byte_num*NBBY) + bit_num;
1552
1553 /* Space was reserved, so this must be true */
1554 ASSERT(cluster < ps->ps_ncls);
1555
1556 setbit(ps->ps_bmap, cluster);
1557 PS_UNLOCK(ps);
1558
1559 return cluster;
1560 }
1561
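/*
 * Worked example (not part of the original file) of the bitmap scan in
 * ps_allocate_cluster() above: if the first byte with a clear bit is
 * byte_num = 3 and the first clear bit within it is bit_num = 5, the
 * allocated cluster number is byte_num * NBBY + bit_num = 3 * 8 + 5 = 29.
 */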
1562 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1563
1564 void
1565 ps_deallocate_cluster(
1566 paging_segment_t ps,
1567 vm_offset_t cluster)
1568 {
1569
1570 if (cluster >= (vm_offset_t) ps->ps_ncls)
1571 panic("ps_deallocate_cluster: Invalid cluster number");
1572
1573 /*
1574 	 * Lock the paging segment, clear the cluster's bit in the bitmap and
1575 	 * increment the number of free clusters.
1576 */
1577 PSL_LOCK();
1578 PS_LOCK(ps);
1579 clrbit(ps->ps_bmap, cluster);
1580 ++ps->ps_clcount;
1581 dp_pages_free += 1 << ps->ps_clshift;
1582 PSL_UNLOCK();
1583
1584 /*
1585 * Move the hint down to the freed cluster if it is
1586 * less than the current hint.
1587 */
1588 if ((cluster/NBBY) < ps->ps_hint) {
1589 ps->ps_hint = (cluster/NBBY);
1590 }
1591
1592 PS_UNLOCK(ps);
1593
1594 /*
1595 	 * If we're freeing space at a priority marked full, reset its entry in the select array.
1596 */
1597 PSL_LOCK();
1598 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1599 ps_select_array[ps->ps_bs->bs_priority] = 0;
1600 PSL_UNLOCK();
1601
1602 return;
1603 }
1604
1605 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1606
1607 void
1608 ps_dealloc_vsmap(
1609 struct vs_map *vsmap,
1610 vm_size_t size)
1611 {
1612 unsigned int i;
1613 for (i = 0; i < size; i++)
1614 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1615 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1616 VSM_CLOFF(vsmap[i]));
1617 }
1618
1619 void
1620 ps_vstruct_dealloc(
1621 vstruct_t vs)
1622 {
1623 unsigned int i;
1624 // spl_t s;
1625
1626 VS_MAP_LOCK(vs);
1627
1628 /*
1629 * If this is an indirect structure, then we walk through the valid
1630 * (non-zero) indirect pointers and deallocate the clusters
1631 * associated with each used map entry (via ps_dealloc_vsmap).
1632 * When all of the clusters in an indirect block have been
1633 * freed, we deallocate the block. When all of the indirect
1634 * blocks have been deallocated we deallocate the memory
1635 * holding the indirect pointers.
1636 */
1637 if (vs->vs_indirect) {
1638 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1639 if (vs->vs_imap[i] != NULL) {
1640 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1641 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1642 }
1643 }
1644 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1645 } else {
1646 /*
1647 * Direct map. Free used clusters, then memory.
1648 */
1649 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1650 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1651 }
1652 VS_MAP_UNLOCK(vs);
1653
1654 bs_commit(- vs->vs_size);
1655
1656 zfree(vstruct_zone, vs);
1657 }
1658
1659 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1660
1661 int ps_map_extend(
1662 vstruct_t vs,
1663 unsigned int new_size)
1664 {
1665 struct vs_map **new_imap;
1666 struct vs_map *new_dmap = NULL;
1667 int newdsize;
1668 int i;
1669 void *old_map = NULL;
1670 int old_map_size = 0;
1671
1672 if (vs->vs_size >= new_size) {
1673 /*
1674 * Someone has already done the work.
1675 */
1676 return 0;
1677 }
1678
1679 /*
1680 * If the new size extends into the indirect range, then we have one
1681 * of two cases: we are going from indirect to indirect, or we are
1682 * going from direct to indirect. If we are going from indirect to
1683 * indirect, then it is possible that the new size will fit in the old
1684 * indirect map. If this is the case, then just reset the size of the
1685 * vstruct map and we are done. If the new size will not
1686 * fit into the old indirect map, then we have to allocate a new
1687 * indirect map and copy the old map pointers into this new map.
1688 *
1689 * If we are going from direct to indirect, then we have to allocate a
1690 * new indirect map and copy the old direct pages into the first
1691 * indirect page of the new map.
1692 * NOTE: allocating memory here is dangerous, as we're in the
1693 * pageout path.
1694 */
1695 if (INDIRECT_CLMAP(new_size)) {
1696 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1697
1698 /*
1699 * Get a new indirect map and zero it.
1700 */
1701 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1702 if (vs->vs_indirect &&
1703 (new_map_size == old_map_size)) {
1704 bs_commit(new_size - vs->vs_size);
1705 vs->vs_size = new_size;
1706 return 0;
1707 }
1708
1709 new_imap = (struct vs_map **)kalloc(new_map_size);
1710 if (new_imap == NULL) {
1711 return -1;
1712 }
1713 memset(new_imap, 0, new_map_size);
1714
1715 if (vs->vs_indirect) {
1716 /* Copy old entries into new map */
1717 memcpy(new_imap, vs->vs_imap, old_map_size);
1718 /* Arrange to free the old map */
1719 old_map = (void *) vs->vs_imap;
1720 newdsize = 0;
1721 } else { /* Old map was a direct map */
1722 /* Allocate an indirect page */
1723 if ((new_imap[0] = (struct vs_map *)
1724 kalloc(CLMAP_THRESHOLD)) == NULL) {
1725 kfree(new_imap, new_map_size);
1726 return -1;
1727 }
1728 new_dmap = new_imap[0];
1729 newdsize = CLMAP_ENTRIES;
1730 }
1731 } else {
1732 new_imap = NULL;
1733 newdsize = new_size;
1734 /*
1735 * If the new map is a direct map, then the old map must
1736 * also have been a direct map. All we have to do is
1737 * to allocate a new direct map, copy the old entries
1738 * into it and free the old map.
1739 */
1740 if ((new_dmap = (struct vs_map *)
1741 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1742 return -1;
1743 }
1744 }
1745 if (newdsize) {
1746
1747 /* Free the old map */
1748 old_map = (void *) vs->vs_dmap;
1749 old_map_size = CLMAP_SIZE(vs->vs_size);
1750
1751 /* Copy info from the old map into the new map */
1752 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1753
1754 /* Initialize the rest of the new map */
1755 for (i = vs->vs_size; i < newdsize; i++)
1756 VSM_CLR(new_dmap[i]);
1757 }
1758 if (new_imap) {
1759 vs->vs_imap = new_imap;
1760 vs->vs_indirect = TRUE;
1761 } else
1762 vs->vs_dmap = new_dmap;
1763 bs_commit(new_size - vs->vs_size);
1764 vs->vs_size = new_size;
1765 if (old_map)
1766 kfree(old_map, old_map_size);
1767 return 0;
1768 }
1769
1770 vm_offset_t
1771 ps_clmap(
1772 vstruct_t vs,
1773 vm_offset_t offset,
1774 struct clmap *clmap,
1775 int flag,
1776 vm_size_t size,
1777 int error)
1778 {
1779 vm_offset_t cluster; /* The cluster of offset. */
1780 vm_offset_t newcl; /* The new cluster allocated. */
1781 vm_offset_t newoff;
1782 unsigned int i;
1783 struct vs_map *vsmap;
1784
1785 VS_MAP_LOCK(vs);
1786
1787 ASSERT(vs->vs_dmap);
1788 cluster = atop_32(offset) >> vs->vs_clshift;
1789
1790 /*
1791 * Initialize cluster error value
1792 */
1793 clmap->cl_error = 0;
1794
1795 /*
1796 * If the object has grown, extend the page map.
1797 */
1798 if (cluster >= vs->vs_size) {
1799 if (flag == CL_FIND) {
1800 /* Do not allocate if just doing a lookup */
1801 VS_MAP_UNLOCK(vs);
1802 return (vm_offset_t) -1;
1803 }
1804 if (ps_map_extend(vs, cluster + 1)) {
1805 VS_MAP_UNLOCK(vs);
1806 return (vm_offset_t) -1;
1807 }
1808 }
1809
1810 /*
1811 * Look for the desired cluster. If the map is indirect, then we
1812 * have a two level lookup. First find the indirect block, then
1813 * find the actual cluster. If the indirect block has not yet
1814 * been allocated, then do so. If the cluster has not yet been
1815 * allocated, then do so.
1816 *
1817 * If any of the allocations fail, then return an error.
1818 * Don't allocate if just doing a lookup.
1819 */
1820 if (vs->vs_indirect) {
1821 long ind_block = cluster/CLMAP_ENTRIES;
1822
1823 /* Is the indirect block allocated? */
1824 vsmap = vs->vs_imap[ind_block];
1825 if (vsmap == NULL) {
1826 if (flag == CL_FIND) {
1827 VS_MAP_UNLOCK(vs);
1828 return (vm_offset_t) -1;
1829 }
1830
1831 /* Allocate the indirect block */
1832 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1833 if (vsmap == NULL) {
1834 VS_MAP_UNLOCK(vs);
1835 return (vm_offset_t) -1;
1836 }
1837 /* Initialize the cluster offsets */
1838 for (i = 0; i < CLMAP_ENTRIES; i++)
1839 VSM_CLR(vsmap[i]);
1840 vs->vs_imap[ind_block] = vsmap;
1841 }
1842 } else
1843 vsmap = vs->vs_dmap;
1844
1845 ASSERT(vsmap);
1846 vsmap += cluster%CLMAP_ENTRIES;
1847
1848 /*
1849 * At this point, vsmap points to the struct vs_map desired.
1850 *
1851 	 * Look in the map for the cluster; if there was an error on a
1852 	 * previous write, flag it and return.  If it is not yet
1853 	 * allocated, then allocate it if we're writing; if we're
1854 * doing a lookup and the cluster's not allocated, return error.
1855 */
1856 if (VSM_ISERR(*vsmap)) {
1857 clmap->cl_error = VSM_GETERR(*vsmap);
1858 VS_MAP_UNLOCK(vs);
1859 return (vm_offset_t) -1;
1860 } else if (VSM_ISCLR(*vsmap)) {
1861 int psindex;
1862
1863 if (flag == CL_FIND) {
1864 /*
1865 * If there's an error and the entry is clear, then
1866 * we've run out of swap space. Record the error
1867 * here and return.
1868 */
1869 if (error) {
1870 VSM_SETERR(*vsmap, error);
1871 }
1872 VS_MAP_UNLOCK(vs);
1873 return (vm_offset_t) -1;
1874 } else {
1875 /*
1876 * Attempt to allocate a cluster from the paging segment
1877 */
1878 newcl = ps_allocate_cluster(vs, &psindex,
1879 PAGING_SEGMENT_NULL);
1880 if (newcl == (vm_offset_t) -1) {
1881 VS_MAP_UNLOCK(vs);
1882 return (vm_offset_t) -1;
1883 }
1884 VSM_CLR(*vsmap);
1885 VSM_SETCLOFF(*vsmap, newcl);
1886 VSM_SETPS(*vsmap, psindex);
1887 }
1888 } else
1889 newcl = VSM_CLOFF(*vsmap);
1890
1891 /*
1892 * Fill in pertinent fields of the clmap
1893 */
1894 clmap->cl_ps = VSM_PS(*vsmap);
1895 clmap->cl_numpages = VSCLSIZE(vs);
1896 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1897
1898 /*
1899 * Byte offset in paging segment is byte offset to cluster plus
1900 * byte offset within cluster. It looks ugly, but should be
1901 * relatively quick.
1902 */
1903 ASSERT(trunc_page(offset) == offset);
1904 newcl = ptoa_32(newcl) << vs->vs_clshift;
1905 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1906 if (flag == CL_ALLOC) {
1907 /*
1908 * set bits in the allocation bitmap according to which
1909 * pages were requested. size is in bytes.
1910 */
1911 i = atop_32(newoff);
1912 while ((size > 0) && (i < VSCLSIZE(vs))) {
1913 VSM_SETALLOC(*vsmap, i);
1914 i++;
1915 size -= vm_page_size;
1916 }
1917 }
1918 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1919 if (newoff) {
1920 /*
1921 * Offset is not cluster aligned, so number of pages
1922 * and bitmaps must be adjusted
1923 */
1924 clmap->cl_numpages -= atop_32(newoff);
1925 CLMAP_SHIFT(clmap, vs);
1926 CLMAP_SHIFTALLOC(clmap, vs);
1927 }
1928
1929 /*
1930 *
1931 * The setting of valid bits and handling of write errors
1932 * must be done here, while we hold the lock on the map.
1933 * It logically should be done in ps_vs_write_complete().
1934 * The size and error information has been passed from
1935 * ps_vs_write_complete(). If the size parameter is non-zero,
1936 * then there is work to be done. If error is also non-zero,
1937 * then the error number is recorded in the cluster and the
1938 * entire cluster is in error.
1939 */
1940 if (size && flag == CL_FIND) {
1941 vm_offset_t off = (vm_offset_t) 0;
1942
1943 if (!error) {
1944 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1945 i++) {
1946 VSM_SETPG(*vsmap, i);
1947 size -= vm_page_size;
1948 }
1949 ASSERT(i <= VSCLSIZE(vs));
1950 } else {
1951 BS_STAT(clmap->cl_ps->ps_bs,
1952 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1953 atop_32(size));
1954 off = VSM_CLOFF(*vsmap);
1955 VSM_SETERR(*vsmap, error);
1956 }
1957 /*
1958 * Deallocate cluster if error, and no valid pages
1959 * already present.
1960 */
1961 if (off != (vm_offset_t) 0)
1962 ps_deallocate_cluster(clmap->cl_ps, off);
1963 VS_MAP_UNLOCK(vs);
1964 return (vm_offset_t) 0;
1965 } else
1966 VS_MAP_UNLOCK(vs);
1967
1968 DP_DEBUG(DEBUG_VS_INTERNAL,
1969 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1970 newcl+newoff, (int) vs, (int) vsmap, flag));
1971 DP_DEBUG(DEBUG_VS_INTERNAL,
1972 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1973 (int) clmap->cl_ps, clmap->cl_numpages,
1974 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1975
1976 return (newcl + newoff);
1977 }
1978
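/*
 * Illustrative sketch (not part of the original file) of the offset
 * arithmetic in ps_clmap() above, assuming 4KB pages (vm_page_shift 12)
 * and a cluster shift of 2: an object offset of 0x5000 falls in
 * cluster 1, at byte offset 0x1000 within that cluster.
 */
#if 0
static void
example_clmap_offsets(vstruct_t vs, vm_offset_t offset)
{
	vm_offset_t cluster = atop_32(offset) >> vs->vs_clshift;
	vm_offset_t newoff = offset &
	    ((1 << (vm_page_shift + vs->vs_clshift)) - 1);

	/* segment byte offset = (ptoa_32(segment cluster) << vs_clshift) + newoff */
	(void) cluster;
	(void) newoff;
}
#endif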
1979 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1980
1981 void
1982 ps_clunmap(
1983 vstruct_t vs,
1984 vm_offset_t offset,
1985 vm_size_t length)
1986 {
1987 vm_offset_t cluster; /* The cluster number of offset */
1988 struct vs_map *vsmap;
1989
1990 VS_MAP_LOCK(vs);
1991
1992 /*
1993 * Loop through all clusters in this range, freeing paging segment
1994 * clusters and map entries as encountered.
1995 */
1996 while (length > 0) {
1997 vm_offset_t newoff;
1998 unsigned int i;
1999
2000 cluster = atop_32(offset) >> vs->vs_clshift;
2001 if (vs->vs_indirect) /* indirect map */
2002 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2003 else
2004 vsmap = vs->vs_dmap;
2005 if (vsmap == NULL) {
2006 VS_MAP_UNLOCK(vs);
2007 return;
2008 }
2009 vsmap += cluster%CLMAP_ENTRIES;
2010 if (VSM_ISCLR(*vsmap)) {
2011 length -= vm_page_size;
2012 offset += vm_page_size;
2013 continue;
2014 }
2015 /*
2016 * We've got a valid mapping. Clear it and deallocate
2017 * paging segment cluster pages.
2018 		 * Optimize for clearing an entire cluster.
2019 */
2020 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2021 /*
2022 * Not cluster aligned.
2023 */
2024 ASSERT(trunc_page(newoff) == newoff);
2025 i = atop_32(newoff);
2026 } else
2027 i = 0;
2028 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2029 VSM_CLRPG(*vsmap, i);
2030 VSM_CLRALLOC(*vsmap, i);
2031 length -= vm_page_size;
2032 offset += vm_page_size;
2033 i++;
2034 }
2035
2036 /*
2037 * If map entry is empty, clear and deallocate cluster.
2038 */
2039 if (!VSM_ALLOC(*vsmap)) {
2040 ps_deallocate_cluster(VSM_PS(*vsmap),
2041 VSM_CLOFF(*vsmap));
2042 VSM_CLR(*vsmap);
2043 }
2044 }
2045
2046 VS_MAP_UNLOCK(vs);
2047 }
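/*
 * A minimal user-space sketch of the offset arithmetic used above and in
 * ps_clmap(): a byte offset is turned into a cluster index and a page index
 * within that cluster by shifting and masking.  The DEMO_* constants are
 * hypothetical stand-ins for vm_page_shift and vs->vs_clshift, not the
 * kernel's values.
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12u     /* 4 KB pages */
#define DEMO_CL_SHIFT   2u      /* 4 pages per cluster */

int
main(void)
{
        unsigned int offset = 0x00017000;               /* arbitrary byte offset */
        unsigned int page = offset >> DEMO_PAGE_SHIFT;  /* atop_32() equivalent */
        unsigned int cluster = page >> DEMO_CL_SHIFT;   /* which cluster */
        unsigned int newoff = offset &
            ((1u << (DEMO_PAGE_SHIFT + DEMO_CL_SHIFT)) - 1); /* byte offset in cluster */
        unsigned int page_in_cl = newoff >> DEMO_PAGE_SHIFT; /* bit index in the bitmap */

        printf("offset 0x%x -> cluster %u, page %u within cluster (byte 0x%x)\n",
            offset, cluster, page_in_cl, newoff);
        return 0;
}
#endif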
2048
2049 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2050
2051 void
2052 ps_vs_write_complete(
2053 vstruct_t vs,
2054 vm_offset_t offset,
2055 vm_size_t size,
2056 int error)
2057 {
2058 struct clmap clmap;
2059
2060 /*
2061 * Get the struct vsmap for this cluster.
2062 * Use READ, even though it was written, because the
2063 * cluster MUST be present, unless there was an error
2064 * in the original ps_clmap (e.g. no space), in which
2065 * case, nothing happens.
2066 *
2067 * Must pass enough information to ps_clmap to allow it
2068 * to set the vs_map structure bitmap under lock.
2069 */
2070 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2071 }
2072
2073 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2074
2075 void
2076 vs_cl_write_complete(
2077 vstruct_t vs,
2078 __unused paging_segment_t ps,
2079 vm_offset_t offset,
2080 __unused vm_offset_t addr,
2081 vm_size_t size,
2082 boolean_t async,
2083 int error)
2084 {
2085 // kern_return_t kr;
2086
2087 if (error) {
2088 /*
2089 * For internal objects, the error is recorded on a
2090 * per-cluster basis by ps_clmap() which is called
2091 * by ps_vs_write_complete() below.
2092 */
2093 dprintf(("write failed error = 0x%x\n", error));
2094 /* add upl_abort code here */
2095 } else
2096 GSTAT(global_stats.gs_pages_out += atop_32(size));
2097 /*
2098 * Notify the vstruct mapping code, so it can do its accounting.
2099 */
2100 ps_vs_write_complete(vs, offset, size, error);
2101
2102 if (async) {
2103 VS_LOCK(vs);
2104 ASSERT(vs->vs_async_pending > 0);
2105 vs->vs_async_pending -= size;
2106 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2107 vs->vs_waiting_async = FALSE;
2108 VS_UNLOCK(vs);
2109 /* mutex_unlock(&vs->vs_waiting_async); */
2110 thread_wakeup(&vs->vs_async_pending);
2111 } else {
2112 VS_UNLOCK(vs);
2113 }
2114 }
2115 }
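/*
 * The async completion path above drains vs_async_pending and wakes any
 * waiter when it reaches zero.  Below is a rough user-space analogue of
 * that counter-plus-wakeup pattern using pthreads; struct demo_pending and
 * the demo_* functions are hypothetical and only sketch the idea, not the
 * pager's locking.
 */
#if 0 /* illustration only -- never compiled */
#include <pthread.h>

struct demo_pending {
        pthread_mutex_t lock;
        pthread_cond_t  drained;
        unsigned long   pending;        /* outstanding async work */
};

static void
demo_complete(struct demo_pending *p, unsigned long amount)
{
        pthread_mutex_lock(&p->lock);
        p->pending -= amount;
        if (p->pending == 0)
                pthread_cond_broadcast(&p->drained);    /* thread_wakeup() analogue */
        pthread_mutex_unlock(&p->lock);
}

static void
demo_wait_for_drain(struct demo_pending *p)
{
        pthread_mutex_lock(&p->lock);
        while (p->pending != 0)
                pthread_cond_wait(&p->drained, &p->lock);
        pthread_mutex_unlock(&p->lock);
}
#endif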
2116
2117 #ifdef DEVICE_PAGING
2118 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2119
2120 kern_return_t
2121 device_write_reply(
2122 MACH_PORT_FACE reply_port,
2123 kern_return_t device_code,
2124 io_buf_len_t bytes_written)
2125 {
2126 struct vs_async *vsa;
2127
2128 vsa = (struct vs_async *)
2129 ((struct vstruct_alias *)(reply_port->alias))->vs;
2130
2131 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2132 device_code = KERN_FAILURE;
2133 }
2134
2135 vsa->vsa_error = device_code;
2136
2137
2138 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2139 if(vsa->vsa_flags & VSA_TRANSFER) {
2140 /* revisit when async disk segments redone */
2141 if(vsa->vsa_error) {
2142 /* need to consider error condition. re-write data or */
2143 /* throw it away here. */
2144 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2145 }
2146 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2147 vsa->vsa_size, vsa->vsa_error);
2148 } else {
2149 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2150 vsa->vsa_addr, vsa->vsa_size, TRUE,
2151 vsa->vsa_error);
2152 }
2153 VS_FREE_ASYNC(vsa);
2154
2155 return KERN_SUCCESS;
2156 }
2157
2158 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2159 kern_return_t
2160 device_write_reply_inband(
2161 MACH_PORT_FACE reply_port,
2162 kern_return_t return_code,
2163 io_buf_len_t bytes_written)
2164 {
2165 panic("device_write_reply_inband: illegal");
2166 return KERN_SUCCESS;
2167 }
2168
2169 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2170 kern_return_t
2171 device_read_reply(
2172 MACH_PORT_FACE reply_port,
2173 kern_return_t return_code,
2174 io_buf_ptr_t data,
2175 mach_msg_type_number_t dataCnt)
2176 {
2177 struct vs_async *vsa;
2178 vsa = (struct vs_async *)
2179 ((struct vstruct_alias *)(reply_port->alias))->vs;
2180 vsa->vsa_addr = (vm_offset_t)data;
2181 vsa->vsa_size = (vm_size_t)dataCnt;
2182 vsa->vsa_error = return_code;
2183 thread_wakeup(&vsa->vsa_lock);
2184 return KERN_SUCCESS;
2185 }
2186
2187 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2188 kern_return_t
2189 device_read_reply_inband(
2190 MACH_PORT_FACE reply_port,
2191 kern_return_t return_code,
2192 io_buf_ptr_inband_t data,
2193 mach_msg_type_number_t dataCnt)
2194 {
2195 panic("device_read_reply_inband: illegal");
2196 return KERN_SUCCESS;
2197 }
2198
2199 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2200 kern_return_t
2201 device_read_reply_overwrite(
2202 MACH_PORT_FACE reply_port,
2203 kern_return_t return_code,
2204 io_buf_len_t bytes_read)
2205 {
2206 panic("device_read_reply_overwrite: illegal\n");
2207 return KERN_SUCCESS;
2208 }
2209
2210 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2211 kern_return_t
2212 device_open_reply(
2213 MACH_PORT_FACE reply_port,
2214 kern_return_t return_code,
2215 MACH_PORT_FACE device_port)
2216 {
2217 panic("device_open_reply: illegal\n");
2218 return KERN_SUCCESS;
2219 }
2220
2221 kern_return_t
2222 ps_read_device(
2223 paging_segment_t ps,
2224 vm_offset_t offset,
2225 vm_offset_t *bufferp,
2226 unsigned int size,
2227 unsigned int *residualp,
2228 int flags)
2229 {
2230 kern_return_t kr;
2231 recnum_t dev_offset;
2232 unsigned int bytes_wanted;
2233 unsigned int bytes_read;
2234 unsigned int total_read;
2235 vm_offset_t dev_buffer;
2236 vm_offset_t buf_ptr;
2237 unsigned int records_read;
2238 struct vs_async *vsa;
2239 mutex_t vs_waiting_read_reply;
2240
2241 device_t device;
2242 vm_map_copy_t device_data = NULL;
2243 default_pager_thread_t *dpt = NULL;
2244
2245 device = dev_port_lookup(ps->ps_device);
2246 clustered_reads[atop_32(size)]++;
2247
2248 dev_offset = (ps->ps_offset +
2249 (offset >> (vm_page_shift - ps->ps_record_shift)));
2250 bytes_wanted = size;
2251 total_read = 0;
2252 *bufferp = (vm_offset_t)NULL;
2253
2254 do {
2255 vsa = VS_ALLOC_ASYNC();
2256 if (vsa) {
2257 vsa->vsa_vs = NULL;
2258 vsa->vsa_addr = 0;
2259 vsa->vsa_offset = 0;
2260 vsa->vsa_size = 0;
2261 vsa->vsa_ps = NULL;
2262 }
2263 mutex_init(&vsa->vsa_lock, 0);
2264 ip_lock(vsa->reply_port);
2265 vsa->reply_port->ip_sorights++;
2266 ip_reference(vsa->reply_port);
2267 ip_unlock(vsa->reply_port);
2268 kr = ds_device_read_common(device,
2269 vsa->reply_port,
2270 (mach_msg_type_name_t)
2271 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2272 (dev_mode_t) 0,
2273 dev_offset,
2274 bytes_wanted,
2275 (IO_READ | IO_CALL),
2276 (io_buf_ptr_t *) &dev_buffer,
2277 (mach_msg_type_number_t *) &bytes_read);
2278 if(kr == MIG_NO_REPLY) {
2279 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2280 thread_block(THREAD_CONTINUE_NULL);
2281
2282 dev_buffer = vsa->vsa_addr;
2283 bytes_read = (unsigned int)vsa->vsa_size;
2284 kr = vsa->vsa_error;
2285 }
2286 VS_FREE_ASYNC(vsa);
2287 if (kr != KERN_SUCCESS || bytes_read == 0) {
2288 break;
2289 }
2290 total_read += bytes_read;
2291
2292 /*
2293 * If we got the entire range, use the returned dev_buffer.
2294 */
2295 if (bytes_read == size) {
2296 *bufferp = (vm_offset_t)dev_buffer;
2297 break;
2298 }
2299
2300 #if 1
2301 dprintf(("read only %d bytes out of %d\n",
2302 bytes_read, bytes_wanted));
2303 #endif
2304 if(dpt == NULL) {
2305 dpt = get_read_buffer();
2306 buf_ptr = dpt->dpt_buffer;
2307 *bufferp = (vm_offset_t)buf_ptr;
2308 }
2309 /*
2310 * Otherwise, copy the data into the provided buffer (*bufferp)
2311 * and append the rest of the range as it comes in.
2312 */
2313 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2314 buf_ptr += bytes_read;
2315 bytes_wanted -= bytes_read;
2316 records_read = (bytes_read >>
2317 (vm_page_shift - ps->ps_record_shift));
2318 dev_offset += records_read;
2319 DP_DEBUG(DEBUG_VS_INTERNAL,
2320 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2321 dev_buffer, bytes_read));
2322 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2323 != KERN_SUCCESS)
2324 Panic("dealloc buf");
2325 } while (bytes_wanted);
2326
2327 *residualp = size - total_read;
2328 if((dev_buffer != *bufferp) && (total_read != 0)) {
2329 vm_offset_t temp_buffer;
2330 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2331 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2332 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2333 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2334 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2335 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2336 (vm_map_copy_t *)&device_data, FALSE))
2337 panic("ps_read_device: cannot copyin locally provided buffer\n");
2338 }
2339 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2340 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2341 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2342 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2343 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2344 (vm_map_copy_t *)&device_data, FALSE))
2345 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2346 }
2347 else {
2348 device_data = NULL;
2349 }
2350 *bufferp = (vm_offset_t)device_data;
2351
2352 if(dpt != NULL) {
2353 /* Free the receive buffer */
2354 dpt->checked_out = 0;
2355 thread_wakeup(&dpt_array);
2356 }
2357 return KERN_SUCCESS;
2358 }
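/*
 * ps_read_device() above copes with short reads by staging partial results
 * and re-issuing the request for the remainder at an advanced offset.  A
 * minimal user-space sketch of the same retry-and-append loop using POSIX
 * pread(); demo_read_fully() is a hypothetical helper, not kernel code.
 */
#if 0 /* illustration only -- never compiled */
#include <unistd.h>
#include <sys/types.h>

static ssize_t
demo_read_fully(int fd, void *buf, size_t size, off_t offset)
{
        size_t total = 0;

        while (total < size) {
                ssize_t n = pread(fd, (char *)buf + total, size - total,
                    offset + (off_t)total);
                if (n <= 0)             /* error or EOF: return what we have */
                        return (n < 0) ? n : (ssize_t)total;
                total += (size_t)n;     /* append and advance, like buf_ptr/dev_offset */
        }
        return (ssize_t)total;
}
#endif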
2359
2360 kern_return_t
2361 ps_write_device(
2362 paging_segment_t ps,
2363 vm_offset_t offset,
2364 vm_offset_t addr,
2365 unsigned int size,
2366 struct vs_async *vsa)
2367 {
2368 recnum_t dev_offset;
2369 io_buf_len_t bytes_to_write, bytes_written;
2370 recnum_t records_written;
2371 kern_return_t kr;
2372 MACH_PORT_FACE reply_port;
2373
2374
2375
2376 clustered_writes[atop_32(size)]++;
2377
2378 dev_offset = (ps->ps_offset +
2379 (offset >> (vm_page_shift - ps->ps_record_shift)));
2380 bytes_to_write = size;
2381
2382 if (vsa) {
2383 /*
2384 * Asynchronous write.
2385 */
2386 reply_port = vsa->reply_port;
2387 ip_lock(reply_port);
2388 reply_port->ip_sorights++;
2389 ip_reference(reply_port);
2390 ip_unlock(reply_port);
2391 {
2392 device_t device;
2393 device = dev_port_lookup(ps->ps_device);
2394
2395 vsa->vsa_addr = addr;
2396 kr=ds_device_write_common(device,
2397 reply_port,
2398 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2399 (dev_mode_t) 0,
2400 dev_offset,
2401 (io_buf_ptr_t) addr,
2402 size,
2403 (IO_WRITE | IO_CALL),
2404 &bytes_written);
2405 }
2406 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2407 if (verbose)
2408 dprintf(("%s0x%x, addr=0x%x,"
2409 "size=0x%x,offset=0x%x\n",
2410 "device_write_request returned ",
2411 kr, addr, size, offset));
2412 BS_STAT(ps->ps_bs,
2413 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2414 /* do the completion notification to free resources */
2415 device_write_reply(reply_port, kr, 0);
2416 return PAGER_ERROR;
2417 }
2418 } else do {
2419 /*
2420 * Synchronous write.
2421 */
2422 {
2423 device_t device;
2424 device = dev_port_lookup(ps->ps_device);
2425 kr=ds_device_write_common(device,
2426 IP_NULL, 0,
2427 (dev_mode_t) 0,
2428 dev_offset,
2429 (io_buf_ptr_t) addr,
2430 size,
2431 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2432 &bytes_written);
2433 }
2434 if (kr != KERN_SUCCESS) {
2435 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2436 "device_write returned ",
2437 kr, addr, size, offset));
2438 BS_STAT(ps->ps_bs,
2439 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2440 return PAGER_ERROR;
2441 }
2442 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2443 Panic("fragmented write");
2444 records_written = (bytes_written >>
2445 (vm_page_shift - ps->ps_record_shift));
2446 dev_offset += records_written;
2447 #if 1
2448 if (bytes_written != bytes_to_write) {
2449 dprintf(("wrote only %d bytes out of %d\n",
2450 bytes_written, bytes_to_write));
2451 }
2452 #endif
2453 bytes_to_write -= bytes_written;
2454 addr += bytes_written;
2455 } while (bytes_to_write > 0);
2456
2457 return PAGER_SUCCESS;
2458 }
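/*
 * Both device paths above convert byte offsets and byte counts into device
 * records with shifts: ps_record_shift is log2(page_size / record_size), so
 * "bytes >> (vm_page_shift - ps_record_shift)" is a division by the record
 * size.  A small sketch of that arithmetic with hypothetical numbers:
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

int
main(void)
{
        /* hypothetical geometry: 4 KB pages on a 512-byte-record device */
        unsigned int page_shift = 12;
        unsigned int record_size = 512;
        unsigned int record_shift = 0;

        while ((record_size << record_shift) < (1u << page_shift))
                record_shift++;                 /* log2(page/record) = 3 here */

        unsigned int byte_offset = 24576;       /* 6 pages into the segment */
        unsigned int bytes = 8192;              /* a 2-page transfer */

        /* byte -> record conversions used for dev_offset and records_written */
        unsigned int dev_record = byte_offset >> (page_shift - record_shift);
        unsigned int records = bytes >> (page_shift - record_shift);

        printf("byte offset %u -> record %u, %u bytes -> %u records\n",
            byte_offset, dev_record, bytes, records);
        return 0;
}
#endif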
2459
2460
2461 #else /* !DEVICE_PAGING */
2462
2463 kern_return_t
2464 ps_read_device(
2465 __unused paging_segment_t ps,
2466 __unused vm_offset_t offset,
2467 __unused vm_offset_t *bufferp,
2468 __unused unsigned int size,
2469 __unused unsigned int *residualp,
2470 __unused int flags)
2471 {
2472 panic("ps_read_device not supported");
2473 }
2474
2475 kern_return_t
2476 ps_write_device(
2477 __unused paging_segment_t ps,
2478 __unused vm_offset_t offset,
2479 __unused vm_offset_t addr,
2480 __unused unsigned int size,
2481 __unused struct vs_async *vsa)
2482 {
2483 panic("ps_write_device not supported");
2484 }
2485
2486 #endif /* DEVICE_PAGING */
2487 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2488
2489 void
2490 pvs_object_data_provided(
2491 __unused vstruct_t vs,
2492 __unused upl_t upl,
2493 __unused upl_offset_t offset,
2494 upl_size_t size)
2495 {
2496
2497 DP_DEBUG(DEBUG_VS_INTERNAL,
2498 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2499 upl, offset, size));
2500
2501 ASSERT(size > 0);
2502 GSTAT(global_stats.gs_pages_in += atop_32(size));
2503
2504
2505 #if USE_PRECIOUS
2506 ps_clunmap(vs, offset, size);
2507 #endif /* USE_PRECIOUS */
2508
2509 }
2510
2511 kern_return_t
2512 pvs_cluster_read(
2513 vstruct_t vs,
2514 vm_offset_t vs_offset,
2515 vm_size_t cnt)
2516 {
2517 upl_t upl;
2518 kern_return_t error = KERN_SUCCESS;
2519 int size;
2520 int residual;
2521 unsigned int request_flags;
2522 int seg_index;
2523 int pages_in_cl;
2524 int cl_size;
2525 int cl_mask;
2526 int cl_index;
2527 int xfer_size;
2528 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2529 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2530 struct clmap clmap;
2531
2532 pages_in_cl = 1 << vs->vs_clshift;
2533 cl_size = pages_in_cl * vm_page_size;
2534 cl_mask = cl_size - 1;
2535
2536 /*
2537 * This loop will be executed multiple times until the entire
2538 * request has been satisfied... if the request spans cluster
2539 * boundaries, the clusters will be checked for logical continuity,
2540 * if contiguous the I/O request will span multiple clusters, otherwise
2541 * it will be broken up into the minimal set of I/O's
2542 *
2543 * If there are holes in a request (either unallocated pages in a paging
2544 * segment or an unallocated paging segment), we stop
2545 * reading at the hole, inform the VM of any data read, inform
2546 * the VM of an unavailable range, then loop again, hoping to
2547 * find valid pages later in the requested range. This continues until
2548 * the entire range has been examined, and read, if present.
2549 */
2550
2551 #if USE_PRECIOUS
2552 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2553 #else
2554 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2555 #endif
2556
2557 assert(dp_encryption_inited);
2558 if (dp_encryption) {
2559 /*
2560 * ENCRYPTED SWAP:
2561 * request that the UPL be prepared for
2562 * decryption.
2563 */
2564 request_flags |= UPL_ENCRYPT;
2565 }
2566
2567 while (cnt && (error == KERN_SUCCESS)) {
2568 int ps_info_valid;
2569 int page_list_count;
2570
2571 if((vs_offset & cl_mask) &&
2572 (cnt > (VM_SUPER_CLUSTER -
2573 (vs_offset & cl_mask)))) {
2574 size = VM_SUPER_CLUSTER;
2575 size -= vs_offset & cl_mask;
2576 } else if (cnt > VM_SUPER_CLUSTER) {
2577 size = VM_SUPER_CLUSTER;
2578 } else {
2579 size = cnt;
2580 }
2581 cnt -= size;
2582
2583 ps_info_valid = 0;
2584 seg_index = 0;
2585
2586 while (size > 0 && error == KERN_SUCCESS) {
2587 int abort_size;
2588 int failed_size;
2589 int beg_pseg;
2590 int beg_indx;
2591 vm_offset_t cur_offset;
2592
2593
2594 if ( !ps_info_valid) {
2595 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2596 psp[seg_index] = CLMAP_PS(clmap);
2597 ps_info_valid = 1;
2598 }
2599 /*
2600 * skip over unallocated physical segments
2601 */
2602 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2603 abort_size = cl_size - (vs_offset & cl_mask);
2604 abort_size = MIN(abort_size, size);
2605
2606 page_list_count = 0;
2607 memory_object_super_upl_request(
2608 vs->vs_control,
2609 (memory_object_offset_t)vs_offset,
2610 abort_size, abort_size,
2611 &upl, NULL, &page_list_count,
2612 request_flags);
2613
2614 if (clmap.cl_error) {
2615 upl_abort(upl, UPL_ABORT_ERROR);
2616 } else {
2617 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2618 }
2619 upl_deallocate(upl);
2620
2621 size -= abort_size;
2622 vs_offset += abort_size;
2623
2624 seg_index++;
2625 ps_info_valid = 0;
2626 continue;
2627 }
2628 cl_index = (vs_offset & cl_mask) / vm_page_size;
2629
2630 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2631 /*
2632 * skip over unallocated pages
2633 */
2634 if (CLMAP_ISSET(clmap, cl_index))
2635 break;
2636 abort_size += vm_page_size;
2637 }
2638 if (abort_size) {
2639 /*
2640 * Let VM system know about holes in clusters.
2641 */
2642 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2643
2644 page_list_count = 0;
2645 memory_object_super_upl_request(
2646 vs->vs_control,
2647 (memory_object_offset_t)vs_offset,
2648 abort_size, abort_size,
2649 &upl, NULL, &page_list_count,
2650 request_flags);
2651
2652 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2653 upl_deallocate(upl);
2654
2655 size -= abort_size;
2656 vs_offset += abort_size;
2657
2658 if (cl_index == pages_in_cl) {
2659 /*
2660 * if we're at the end of this physical cluster
2661 * then bump to the next one and continue looking
2662 */
2663 seg_index++;
2664 ps_info_valid = 0;
2665 continue;
2666 }
2667 if (size == 0)
2668 break;
2669 }
2670 /*
2671 * remember the starting point of the first allocated page
2672 * for the I/O we're about to issue
2673 */
2674 beg_pseg = seg_index;
2675 beg_indx = cl_index;
2676 cur_offset = vs_offset;
2677
2678 /*
2679 * calculate the size of the I/O that we can do...
2680 * this may span multiple physical segments if
2681 * they are contiguous
2682 */
2683 for (xfer_size = 0; xfer_size < size; ) {
2684
2685 while (cl_index < pages_in_cl
2686 && xfer_size < size) {
2687 /*
2688 * accumulate allocated pages within
2689 * a physical segment
2690 */
2691 if (CLMAP_ISSET(clmap, cl_index)) {
2692 xfer_size += vm_page_size;
2693 cur_offset += vm_page_size;
2694 cl_index++;
2695
2696 BS_STAT(psp[seg_index]->ps_bs,
2697 psp[seg_index]->ps_bs->bs_pages_in++);
2698 } else
2699 break;
2700 }
2701 if (cl_index < pages_in_cl
2702 || xfer_size >= size) {
2703 /*
2704 * we've hit an unallocated page or
2705 * the end of this request... go fire
2706 * the I/O
2707 */
2708 break;
2709 }
2710 /*
2711 * we've hit the end of the current physical
2712 * segment and there's more to do, so try
2713 * moving to the next one
2714 */
2715 seg_index++;
2716
2717 ps_offset[seg_index] =
2718 ps_clmap(vs,
2719 cur_offset & ~cl_mask,
2720 &clmap, CL_FIND, 0, 0);
2721 psp[seg_index] = CLMAP_PS(clmap);
2722 ps_info_valid = 1;
2723
2724 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2725 /*
2726 * if the physical segment we're about
2727 * to step into is not contiguous to
2728 * the one we're currently in, or it's
2729 * in a different paging file, or
2730 * it hasn't been allocated....
2731 * we stop here and generate the I/O
2732 */
2733 break;
2734 }
2735 /*
2736 * start with first page of the next physical
2737 * segment
2738 */
2739 cl_index = 0;
2740 }
2741 if (xfer_size) {
2742 /*
2743 * we have a contiguous range of allocated pages
2744 * to read from
2745 */
2746 page_list_count = 0;
2747 memory_object_super_upl_request(vs->vs_control,
2748 (memory_object_offset_t)vs_offset,
2749 xfer_size, xfer_size,
2750 &upl, NULL, &page_list_count,
2751 request_flags | UPL_SET_INTERNAL);
2752
2753 error = ps_read_file(psp[beg_pseg],
2754 upl, (upl_offset_t) 0,
2755 ps_offset[beg_pseg] +
2756 (beg_indx * vm_page_size),
2757 xfer_size, &residual, 0);
2758 } else
2759 continue;
2760
2761 failed_size = 0;
2762
2763 /*
2764 * Adjust counts and send response to VM. Optimize
2765 * for the common case, i.e. no error and/or partial
2766 * data. If there was an error, then we need to error
2767 * the entire range, even if some data was successfully
2768 * read. If there was a partial read we may supply some
2769 * data and may error some as well. In all cases the
2770 * VM must receive some notification for every page
2771 * in the range.
2772 */
2773 if ((error == KERN_SUCCESS) && (residual == 0)) {
2774 /*
2775 * Got everything we asked for, supply the data
2776 * to the VM. Note that as a side effect of
2777 * supplying the data, the buffer holding the
2778 * supplied data is deallocated from the pager's
2779 * address space.
2780 */
2781 pvs_object_data_provided(
2782 vs, upl, vs_offset, xfer_size);
2783 } else {
2784 failed_size = xfer_size;
2785
2786 if (error == KERN_SUCCESS) {
2787 if (residual == xfer_size) {
2788 /*
2789 * If a read operation returns no error
2790 * and no data moved, we turn it into
2791 * an error, assuming we're reading at
2792 * or beyond EOF.
2793 * Fall through and error the entire
2794 * range.
2795 */
2796 error = KERN_FAILURE;
2797 } else {
2798 /*
2799 * Otherwise, we have a partial read. If
2800 * the part read is an integral number
2801 * of pages supply it. Otherwise round
2802 * it up to a page boundary, zero fill
2803 * the unread part, and supply it.
2804 * Fall through and error the remainder
2805 * of the range, if any.
2806 */
2807 int fill, lsize;
2808
2809 fill = residual
2810 & ~vm_page_size;
2811 lsize = (xfer_size - residual)
2812 + fill;
2813 pvs_object_data_provided(
2814 vs, upl,
2815 vs_offset, lsize);
2816
2817 if (lsize < xfer_size) {
2818 failed_size =
2819 xfer_size - lsize;
2820 error = KERN_FAILURE;
2821 }
2822 }
2823 }
2824 }
2825 /*
2826 * If there was an error in any part of the range, tell
2827 * the VM. Note that error is explicitly checked again
2828 * since it can be modified above.
2829 */
2830 if (error != KERN_SUCCESS) {
2831 BS_STAT(psp[beg_pseg]->ps_bs,
2832 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2833 += atop_32(failed_size));
2834 }
2835 size -= xfer_size;
2836 vs_offset += xfer_size;
2837 }
2838
2839 } /* END while (cnt && (error == 0)) */
2840 return error;
2841 }
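/*
 * The read loop above walks the per-cluster allocation bitmap, reporting
 * holes to the VM and issuing one I/O per maximal run of present pages.
 * A stripped-down sketch of that run/hole scan over a single bitmap; the
 * bitmap value and page count are made up for illustration.
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

int
main(void)
{
        unsigned int bmap = 0x0000039D; /* hypothetical: pages 0,2,3,4,7,8,9 present */
        unsigned int pages = 16;        /* pages per cluster in this example */
        unsigned int i = 0;

        while (i < pages) {
                unsigned int start = i;

                if (bmap & (1u << i)) {
                        while (i < pages && (bmap & (1u << i)))
                                i++;
                        printf("read  pages %u..%u in one I/O\n", start, i - 1);
                } else {
                        while (i < pages && !(bmap & (1u << i)))
                                i++;
                        printf("hole  pages %u..%u -> report unavailable\n",
                            start, i - 1);
                }
        }
        return 0;
}
#endif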
2842
2843 int vs_do_async_write = 1;
2844
2845 kern_return_t
2846 vs_cluster_write(
2847 vstruct_t vs,
2848 upl_t internal_upl,
2849 upl_offset_t offset,
2850 upl_size_t cnt,
2851 boolean_t dp_internal,
2852 int flags)
2853 {
2854 upl_size_t transfer_size;
2855 int error = 0;
2856 struct clmap clmap;
2857
2858 vm_offset_t actual_offset; /* Offset within paging segment */
2859 paging_segment_t ps;
2860 vm_offset_t mobj_base_addr;
2861 vm_offset_t mobj_target_addr;
2862
2863 upl_t upl;
2864 upl_page_info_t *pl;
2865 int page_index;
2866 int list_size;
2867 int pages_in_cl;
2868 unsigned int cl_size;
2869 int base_index;
2870 unsigned int seg_size;
2871
2872 pages_in_cl = 1 << vs->vs_clshift;
2873 cl_size = pages_in_cl * vm_page_size;
2874
2875 if (!dp_internal) {
2876 int page_list_count;
2877 int request_flags;
2878 unsigned int super_size;
2879 int first_dirty;
2880 int num_dirty;
2881 int num_of_pages;
2882 int seg_index;
2883 upl_offset_t upl_offset;
2884 vm_offset_t seg_offset;
2885 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2886 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2887
2888
2889 if (bs_low) {
2890 super_size = cl_size;
2891
2892 request_flags = UPL_NOBLOCK |
2893 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2894 UPL_NO_SYNC | UPL_SET_INTERNAL;
2895 } else {
2896 super_size = VM_SUPER_CLUSTER;
2897
2898 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2899 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2900 UPL_NO_SYNC | UPL_SET_INTERNAL;
2901 }
2902
2903 if (!dp_encryption_inited) {
2904 /*
2905 * ENCRYPTED SWAP:
2906 * Once we've started using swap, we
2907 * can't change our mind on whether
2908 * it needs to be encrypted or
2909 * not.
2910 */
2911 dp_encryption_inited = TRUE;
2912 }
2913 if (dp_encryption) {
2914 /*
2915 * ENCRYPTED SWAP:
2916 * request that the UPL be prepared for
2917 * encryption.
2918 */
2919 request_flags |= UPL_ENCRYPT;
2920 flags |= UPL_PAGING_ENCRYPTED;
2921 }
2922
2923 page_list_count = 0;
2924 memory_object_super_upl_request(vs->vs_control,
2925 (memory_object_offset_t)offset,
2926 cnt, super_size,
2927 &upl, NULL, &page_list_count,
2928 request_flags | UPL_FOR_PAGEOUT);
2929
2930 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2931
2932 seg_size = cl_size - (upl->offset % cl_size);
2933 upl_offset = upl->offset & ~(cl_size - 1);
2934
2935 for (seg_index = 0, transfer_size = upl->size;
2936 transfer_size > 0; ) {
2937 ps_offset[seg_index] =
2938 ps_clmap(vs,
2939 upl_offset,
2940 &clmap, CL_ALLOC,
2941 cl_size, 0);
2942
2943 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2944 upl_abort(upl, 0);
2945 upl_deallocate(upl);
2946
2947 return KERN_FAILURE;
2948
2949 }
2950 psp[seg_index] = CLMAP_PS(clmap);
2951
2952 if (transfer_size > seg_size) {
2953 transfer_size -= seg_size;
2954 upl_offset += cl_size;
2955 seg_size = cl_size;
2956 seg_index++;
2957 } else
2958 transfer_size = 0;
2959 }
2960 /*
2961 * Ignore any non-present pages at the end of the
2962 * UPL.
2963 */
2964 for (page_index = upl->size / vm_page_size; page_index > 0;)
2965 if (UPL_PAGE_PRESENT(pl, --page_index))
2966 break;
2967 num_of_pages = page_index + 1;
2968
2969 base_index = (upl->offset % cl_size) / PAGE_SIZE;
2970
2971 for (page_index = 0; page_index < num_of_pages; ) {
2972 /*
2973 * skip over non-dirty pages
2974 */
2975 for ( ; page_index < num_of_pages; page_index++) {
2976 if (UPL_DIRTY_PAGE(pl, page_index)
2977 || UPL_PRECIOUS_PAGE(pl, page_index))
2978 /*
2979 * this is a page we need to write
2980 * go see if we can buddy it up with
2981 * others that are contiguous to it
2982 */
2983 break;
2984 /*
2985 * if the page is not dirty, but present, we
2986 * need to commit it... This is an unusual
2987 * case since we only asked for dirty pages
2988 */
2989 if (UPL_PAGE_PRESENT(pl, page_index)) {
2990 boolean_t empty = FALSE;
2991 upl_commit_range(upl,
2992 page_index * vm_page_size,
2993 vm_page_size,
2994 UPL_COMMIT_NOTIFY_EMPTY,
2995 pl,
2996 page_list_count,
2997 &empty);
2998 if (empty) {
2999 assert(page_index ==
3000 num_of_pages - 1);
3001 upl_deallocate(upl);
3002 }
3003 }
3004 }
3005 if (page_index == num_of_pages)
3006 /*
3007 * no more pages to look at, we're out of here
3008 */
3009 break;
3010
3011 /*
3012 * gather up contiguous dirty pages... we have at
3013 * least 1, otherwise we would have bailed above
3014 * make sure that each physical segment that we step
3015 * into is contiguous to the one we're currently in
3016 * if it's not, we have to stop and write what we have
3017 */
3018 for (first_dirty = page_index;
3019 page_index < num_of_pages; ) {
3020 if ( !UPL_DIRTY_PAGE(pl, page_index)
3021 && !UPL_PRECIOUS_PAGE(pl, page_index))
3022 break;
3023 page_index++;
3024 /*
3025 * if we just looked at the last page in the UPL
3026 * we don't need to check for physical segment
3027 * continuity
3028 */
3029 if (page_index < num_of_pages) {
3030 int cur_seg;
3031 int nxt_seg;
3032
3033 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3034 nxt_seg = (base_index + page_index)/pages_in_cl;
3035
3036 if (cur_seg != nxt_seg) {
3037 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3038 /*
3039 * if the segment we're about
3040 * to step into is not
3041 * contiguous to the one we're
3042 * currently in, or it's in a
3043 * different paging file....
3044 * we stop here and generate
3045 * the I/O
3046 */
3047 break;
3048 }
3049 }
3050 }
3051 num_dirty = page_index - first_dirty;
3052
3053 if (num_dirty) {
3054 upl_offset = first_dirty * vm_page_size;
3055 transfer_size = num_dirty * vm_page_size;
3056
3057 while (transfer_size) {
3058
3059 if ((seg_size = cl_size -
3060 ((upl->offset + upl_offset) % cl_size))
3061 > transfer_size)
3062 seg_size = transfer_size;
3063
3064 ps_vs_write_complete(vs,
3065 upl->offset + upl_offset,
3066 seg_size, error);
3067
3068 transfer_size -= seg_size;
3069 upl_offset += seg_size;
3070 }
3071 upl_offset = first_dirty * vm_page_size;
3072 transfer_size = num_dirty * vm_page_size;
3073
3074 seg_index = (base_index + first_dirty) / pages_in_cl;
3075 seg_offset = (upl->offset + upl_offset) % cl_size;
3076
3077 error = ps_write_file(psp[seg_index],
3078 upl, upl_offset,
3079 ps_offset[seg_index]
3080 + seg_offset,
3081 transfer_size, flags);
3082 } else {
3083 boolean_t empty = FALSE;
3084 upl_abort_range(upl,
3085 first_dirty * vm_page_size,
3086 num_dirty * vm_page_size,
3087 UPL_ABORT_NOTIFY_EMPTY,
3088 &empty);
3089 if (empty) {
3090 assert(page_index == num_of_pages);
3091 upl_deallocate(upl);
3092 }
3093 }
3094 }
3095
3096 } else {
3097 assert(cnt <= (vm_page_size << vs->vs_clshift));
3098 list_size = cnt;
3099
3100 page_index = 0;
3101 /* The caller provides a mapped_data which is derived */
3102 /* from a temporary object. The targeted pages are */
3103 /* guaranteed to be set at offset 0 in the mapped_data */
3104 /* The actual offset however must still be derived */
3105 /* from the offset in the vs in question */
3106 mobj_base_addr = offset;
3107 mobj_target_addr = mobj_base_addr;
3108
3109 for (transfer_size = list_size; transfer_size != 0;) {
3110 actual_offset = ps_clmap(vs, mobj_target_addr,
3111 &clmap, CL_ALLOC,
3112 transfer_size < cl_size ?
3113 transfer_size : cl_size, 0);
3114 if(actual_offset == (vm_offset_t) -1) {
3115 error = 1;
3116 break;
3117 }
3118 cnt = MIN(transfer_size,
3119 CLMAP_NPGS(clmap) * vm_page_size);
3120 ps = CLMAP_PS(clmap);
3121 /* Assume that the caller has given us contiguous */
3122 /* pages */
3123 if(cnt) {
3124 ps_vs_write_complete(vs, mobj_target_addr,
3125 cnt, error);
3126 error = ps_write_file(ps, internal_upl,
3127 0, actual_offset,
3128 cnt, flags);
3129 if (error)
3130 break;
3131 }
3132 if (error)
3133 break;
3134 actual_offset += cnt;
3135 mobj_target_addr += cnt;
3136 transfer_size -= cnt;
3137 cnt = 0;
3138
3139 if (error)
3140 break;
3141 }
3142 }
3143 if(error)
3144 return KERN_FAILURE;
3145 else
3146 return KERN_SUCCESS;
3147 }
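/*
 * vs_cluster_write() above carves the outgoing range into cluster-sized
 * pieces: the first piece is whatever remains of the current cluster
 * (cl_size - (offset % cl_size)), and later pieces are whole clusters.
 * A small sketch of that chunking with hypothetical sizes:
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

int
main(void)
{
        /* hypothetical cluster geometry: 4 pages of 4 KB = 16 KB clusters */
        unsigned int cl_size = 4 * 4096;
        unsigned int offset = 0x6000;           /* start 8 KB into a cluster */
        unsigned int remaining = 40960;         /* 40 KB to push out */

        /* first chunk is whatever is left of the current cluster */
        unsigned int seg_size = cl_size - (offset % cl_size);

        while (remaining > 0) {
                if (seg_size > remaining)
                        seg_size = remaining;
                printf("chunk: offset 0x%x, size 0x%x\n", offset, seg_size);
                offset += seg_size;
                remaining -= seg_size;
                seg_size = cl_size;             /* later chunks are full clusters */
        }
        return 0;
}
#endif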
3148
3149 vm_size_t
3150 ps_vstruct_allocated_size(
3151 vstruct_t vs)
3152 {
3153 int num_pages;
3154 struct vs_map *vsmap;
3155 unsigned int i, j, k;
3156
3157 num_pages = 0;
3158 if (vs->vs_indirect) {
3159 /* loop on indirect maps */
3160 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3161 vsmap = vs->vs_imap[i];
3162 if (vsmap == NULL)
3163 continue;
3164 /* loop on clusters in this indirect map */
3165 for (j = 0; j < CLMAP_ENTRIES; j++) {
3166 if (VSM_ISCLR(vsmap[j]) ||
3167 VSM_ISERR(vsmap[j]))
3168 continue;
3169 /* loop on pages in this cluster */
3170 for (k = 0; k < VSCLSIZE(vs); k++) {
3171 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3172 num_pages++;
3173 }
3174 }
3175 }
3176 } else {
3177 vsmap = vs->vs_dmap;
3178 if (vsmap == NULL)
3179 return 0;
3180 /* loop on clusters in the direct map */
3181 for (j = 0; j < CLMAP_ENTRIES; j++) {
3182 if (VSM_ISCLR(vsmap[j]) ||
3183 VSM_ISERR(vsmap[j]))
3184 continue;
3185 /* loop on pages in this cluster */
3186 for (k = 0; k < VSCLSIZE(vs); k++) {
3187 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3188 num_pages++;
3189 }
3190 }
3191 }
3192
3193 return ptoa_32(num_pages);
3194 }
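/*
 * The accounting above is just a population count over the per-cluster
 * bitmaps, one bit per allocated page.  A compact sketch of the same
 * counting; demo_pages_in_bitmap() and the sample bitmaps are hypothetical.
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

static unsigned int
demo_pages_in_bitmap(unsigned int bmap, unsigned int pages_per_cluster)
{
        unsigned int k, n = 0;

        for (k = 0; k < pages_per_cluster; k++)
                if (bmap & (1u << k))
                        n++;
        return n;
}

int
main(void)
{
        /* hypothetical: 3 clusters of 8 pages each */
        unsigned int bmaps[3] = { 0xFF, 0x03, 0x00 };
        unsigned int total = 0, j;

        for (j = 0; j < 3; j++)
                total += demo_pages_in_bitmap(bmaps[j], 8);
        printf("%u pages allocated -> %u bytes\n", total, total * 4096);
        return 0;
}
#endif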
3195
3196 size_t
3197 ps_vstruct_allocated_pages(
3198 vstruct_t vs,
3199 default_pager_page_t *pages,
3200 size_t pages_size)
3201 {
3202 unsigned int num_pages;
3203 struct vs_map *vsmap;
3204 vm_offset_t offset;
3205 unsigned int i, j, k;
3206
3207 num_pages = 0;
3208 offset = 0;
3209 if (vs->vs_indirect) {
3210 /* loop on indirect maps */
3211 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3212 vsmap = vs->vs_imap[i];
3213 if (vsmap == NULL) {
3214 offset += (vm_page_size * CLMAP_ENTRIES *
3215 VSCLSIZE(vs));
3216 continue;
3217 }
3218 /* loop on clusters in this indirect map */
3219 for (j = 0; j < CLMAP_ENTRIES; j++) {
3220 if (VSM_ISCLR(vsmap[j]) ||
3221 VSM_ISERR(vsmap[j])) {
3222 offset += vm_page_size * VSCLSIZE(vs);
3223 continue;
3224 }
3225 /* loop on pages in this cluster */
3226 for (k = 0; k < VSCLSIZE(vs); k++) {
3227 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3228 num_pages++;
3229 if (num_pages < pages_size)
3230 pages++->dpp_offset =
3231 offset;
3232 }
3233 offset += vm_page_size;
3234 }
3235 }
3236 }
3237 } else {
3238 vsmap = vs->vs_dmap;
3239 if (vsmap == NULL)
3240 return 0;
3241 /* loop on clusters in the direct map */
3242 for (j = 0; j < CLMAP_ENTRIES; j++) {
3243 if (VSM_ISCLR(vsmap[j]) ||
3244 VSM_ISERR(vsmap[j])) {
3245 offset += vm_page_size * VSCLSIZE(vs);
3246 continue;
3247 }
3248 /* loop on pages in this cluster */
3249 for (k = 0; k < VSCLSIZE(vs); k++) {
3250 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3251 num_pages++;
3252 if (num_pages < pages_size)
3253 pages++->dpp_offset = offset;
3254 }
3255 offset += vm_page_size;
3256 }
3257 }
3258 }
3259
3260 return num_pages;
3261 }
3262
3263
3264 kern_return_t
3265 ps_vstruct_transfer_from_segment(
3266 vstruct_t vs,
3267 paging_segment_t segment,
3268 upl_t upl)
3269 {
3270 struct vs_map *vsmap;
3271 // struct vs_map old_vsmap;
3272 // struct vs_map new_vsmap;
3273 unsigned int i, j;
3274
3275 VS_LOCK(vs); /* block all work on this vstruct */
3276 /* can't allow the normal multiple write */
3277 /* semantic because writes may conflict */
3278 vs->vs_xfer_pending = TRUE;
3279 vs_wait_for_sync_writers(vs);
3280 vs_start_write(vs);
3281 vs_wait_for_readers(vs);
3282 /* we will unlock the vs to allow other writes while transferring */
3283 /* and will be guaranteed of the persistence of the vs struct */
3284 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3285 /* vs_async_pending */
3286 /* OK we now have guaranteed no other parties are accessing this */
3287 /* vs. Now that we are also supporting simple lock versions of */
3288 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3289 /* our purpose in holding it before was the multiple write case */
3290 /* we now use the boolean xfer_pending to do that. We can use */
3291 /* a boolean instead of a count because we have guaranteed single */
3292 /* file access to this code in its caller */
3293 VS_UNLOCK(vs);
3294 vs_changed:
3295 if (vs->vs_indirect) {
3296 unsigned int vsmap_size;
3297 int clmap_off;
3298 /* loop on indirect maps */
3299 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3300 vsmap = vs->vs_imap[i];
3301 if (vsmap == NULL)
3302 continue;
3303 /* loop on clusters in this indirect map */
3304 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3305 VSCLSIZE(vs) * i);
3306 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3307 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3308 else
3309 vsmap_size = CLMAP_ENTRIES;
3310 for (j = 0; j < vsmap_size; j++) {
3311 if (VSM_ISCLR(vsmap[j]) ||
3312 VSM_ISERR(vsmap[j]) ||
3313 (VSM_PS(vsmap[j]) != segment))
3314 continue;
3315 if(vs_cluster_transfer(vs,
3316 (vm_page_size * (j << vs->vs_clshift))
3317 + clmap_off,
3318 vm_page_size << vs->vs_clshift,
3319 upl)
3320 != KERN_SUCCESS) {
3321 VS_LOCK(vs);
3322 vs->vs_xfer_pending = FALSE;
3323 VS_UNLOCK(vs);
3324 vs_finish_write(vs);
3325 return KERN_FAILURE;
3326 }
3327 /* allow other readers/writers during transfer*/
3328 VS_LOCK(vs);
3329 vs->vs_xfer_pending = FALSE;
3330 VS_UNLOCK(vs);
3331 vs_finish_write(vs);
3332 VS_LOCK(vs);
3333 vs->vs_xfer_pending = TRUE;
3334 vs_wait_for_sync_writers(vs);
3335 vs_start_write(vs);
3336 vs_wait_for_readers(vs);
3337 VS_UNLOCK(vs);
3338 if (!(vs->vs_indirect)) {
3339 goto vs_changed;
3340 }
3341 }
3342 }
3343 } else {
3344 vsmap = vs->vs_dmap;
3345 if (vsmap == NULL) {
3346 VS_LOCK(vs);
3347 vs->vs_xfer_pending = FALSE;
3348 VS_UNLOCK(vs);
3349 vs_finish_write(vs);
3350 return KERN_SUCCESS;
3351 }
3352 /* loop on clusters in the direct map */
3353 for (j = 0; j < vs->vs_size; j++) {
3354 if (VSM_ISCLR(vsmap[j]) ||
3355 VSM_ISERR(vsmap[j]) ||
3356 (VSM_PS(vsmap[j]) != segment))
3357 continue;
3358 if(vs_cluster_transfer(vs,
3359 vm_page_size * (j << vs->vs_clshift),
3360 vm_page_size << vs->vs_clshift,
3361 upl) != KERN_SUCCESS) {
3362 VS_LOCK(vs);
3363 vs->vs_xfer_pending = FALSE;
3364 VS_UNLOCK(vs);
3365 vs_finish_write(vs);
3366 return KERN_FAILURE;
3367 }
3368 /* allow other readers/writers during transfer*/
3369 VS_LOCK(vs);
3370 vs->vs_xfer_pending = FALSE;
3371 VS_UNLOCK(vs);
3372 vs_finish_write(vs);
3373 VS_LOCK(vs);
3374 vs->vs_xfer_pending = TRUE;
3375 VS_UNLOCK(vs);
3376 vs_wait_for_sync_writers(vs);
3377 vs_start_write(vs);
3378 vs_wait_for_readers(vs);
3379 if (vs->vs_indirect) {
3380 goto vs_changed;
3381 }
3382 }
3383 }
3384
3385 VS_LOCK(vs);
3386 vs->vs_xfer_pending = FALSE;
3387 VS_UNLOCK(vs);
3388 vs_finish_write(vs);
3389 return KERN_SUCCESS;
3390 }
3391
3392
3393
3394 vs_map_t
3395 vs_get_map_entry(
3396 vstruct_t vs,
3397 vm_offset_t offset)
3398 {
3399 struct vs_map *vsmap;
3400 vm_offset_t cluster;
3401
3402 cluster = atop_32(offset) >> vs->vs_clshift;
3403 if (vs->vs_indirect) {
3404 long ind_block = cluster/CLMAP_ENTRIES;
3405
3406 /* Is the indirect block allocated? */
3407 vsmap = vs->vs_imap[ind_block];
3408 if(vsmap == (vs_map_t) NULL)
3409 return vsmap;
3410 } else
3411 vsmap = vs->vs_dmap;
3412 vsmap += cluster%CLMAP_ENTRIES;
3413 return vsmap;
3414 }
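/*
 * vs_get_map_entry() above is a two-level table lookup: the cluster number
 * selects an indirect block (which may be unallocated) and then an entry
 * within it.  A minimal sketch of that indexing; struct demo_entry and the
 * DEMO_* geometry are hypothetical, not the pager's types.
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>
#include <stddef.h>

#define DEMO_PAGE_SHIFT 12u
#define DEMO_CL_SHIFT   2u
#define DEMO_ENTRIES    128u    /* clusters per indirect block (CLMAP_ENTRIES role) */

struct demo_entry { unsigned int bits; };

static struct demo_entry *
demo_lookup(struct demo_entry **imap, unsigned long long offset)
{
        unsigned long long cluster =
            (offset >> DEMO_PAGE_SHIFT) >> DEMO_CL_SHIFT;
        struct demo_entry *block = imap[cluster / DEMO_ENTRIES];

        if (block == NULL)              /* indirect block never allocated */
                return NULL;
        return &block[cluster % DEMO_ENTRIES];
}

int
main(void)
{
        struct demo_entry block0[DEMO_ENTRIES] = { { 0 } };
        struct demo_entry *imap[2] = { block0, NULL };
        struct demo_entry *e = demo_lookup(imap, 0x5000ULL);

        printf("cluster entry %s\n", e ? "found" : "not mapped");
        return 0;
}
#endif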
3415
3416 kern_return_t
3417 vs_cluster_transfer(
3418 vstruct_t vs,
3419 vm_offset_t offset,
3420 vm_size_t cnt,
3421 upl_t upl)
3422 {
3423 vm_offset_t actual_offset;
3424 paging_segment_t ps;
3425 struct clmap clmap;
3426 kern_return_t error = KERN_SUCCESS;
3427 unsigned int size, size_wanted;
3428 int i;
3429 unsigned int residual;
3430 unsigned int unavail_size;
3431 // default_pager_thread_t *dpt;
3432 // boolean_t dealloc;
3433 struct vs_map *vsmap_ptr = NULL;
3434 struct vs_map read_vsmap;
3435 struct vs_map original_read_vsmap;
3436 struct vs_map write_vsmap;
3437 // upl_t sync_upl;
3438 // vm_offset_t ioaddr;
3439
3440 /* vs_cluster_transfer reads in the pages of a cluster and
3441 * then writes these pages back to new backing store. The
3442 * segment the pages are being read from is assumed to have
3443 * been taken off-line and is no longer considered for new
3444 * space requests.
3445 */
3446
3447 /*
3448 * This loop will be executed once per cluster referenced.
3449 * Typically this means once, since it's unlikely that the
3450 * VM system will ask for anything spanning cluster boundaries.
3451 *
3452 * If there are holes in a cluster (in a paging segment), we stop
3453 * reading at the hole, then loop again, hoping to
3454 * find valid pages later in the cluster. This continues until
3455 * the entire range has been examined, and read, if present. The
3456 * pages are written as they are read. If a failure occurs after
3457 * some pages are written, the unmap call at the bottom of the loop
3458 * recovers the backing store and the old backing store remains
3459 * in effect.
3460 */
3461
3462 VSM_CLR(write_vsmap);
3463 VSM_CLR(original_read_vsmap);
3464 /* grab the actual object's pages to sync with I/O */
3465 while (cnt && (error == KERN_SUCCESS)) {
3466 vsmap_ptr = vs_get_map_entry(vs, offset);
3467 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3468
3469 if (actual_offset == (vm_offset_t) -1) {
3470
3471 /*
3472 * Nothing left to write in this cluster at least
3473 * set write cluster information for any previous
3474 * write, clear for next cluster, if there is one
3475 */
3476 unsigned int local_size, clmask, clsize;
3477
3478 clsize = vm_page_size << vs->vs_clshift;
3479 clmask = clsize - 1;
3480 local_size = clsize - (offset & clmask);
3481 ASSERT(local_size);
3482 local_size = MIN(local_size, cnt);
3483
3484 /* This cluster has no data in it beyond what may */
3485 /* have been found on a previous iteration through */
3486 /* the loop "write_vsmap" */
3487 *vsmap_ptr = write_vsmap;
3488 VSM_CLR(write_vsmap);
3489 VSM_CLR(original_read_vsmap);
3490
3491 cnt -= local_size;
3492 offset += local_size;
3493 continue;
3494 }
3495
3496 /*
3497 * Count up contiguous available or unavailable
3498 * pages.
3499 */
3500 ps = CLMAP_PS(clmap);
3501 ASSERT(ps);
3502 size = 0;
3503 unavail_size = 0;
3504 for (i = 0;
3505 (size < cnt) && (unavail_size < cnt) &&
3506 (i < CLMAP_NPGS(clmap)); i++) {
3507 if (CLMAP_ISSET(clmap, i)) {
3508 if (unavail_size != 0)
3509 break;
3510 size += vm_page_size;
3511 BS_STAT(ps->ps_bs,
3512 ps->ps_bs->bs_pages_in++);
3513 } else {
3514 if (size != 0)
3515 break;
3516 unavail_size += vm_page_size;
3517 }
3518 }
3519
3520 if (size == 0) {
3521 ASSERT(unavail_size);
3522 cnt -= unavail_size;
3523 offset += unavail_size;
3524 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3525 == 0) {
3526 /* There is no more to transfer in this
3527 cluster
3528 */
3529 *vsmap_ptr = write_vsmap;
3530 VSM_CLR(write_vsmap);
3531 VSM_CLR(original_read_vsmap);
3532 }
3533 continue;
3534 }
3535
3536 if(VSM_ISCLR(original_read_vsmap))
3537 original_read_vsmap = *vsmap_ptr;
3538
3539 if(ps->ps_segtype == PS_PARTITION) {
3540 /*
3541 NEED TO ISSUE WITH SYNC & NO COMMIT
3542 error = ps_read_device(ps, actual_offset, &buffer,
3543 size, &residual, flags);
3544 */
3545 } else {
3546 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3547 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3548 size, &residual,
3549 (UPL_IOSYNC | UPL_NOCOMMIT));
3550 }
3551
3552 read_vsmap = *vsmap_ptr;
3553
3554
3555 /*
3556 * Adjust counts and put data in new BS. Optimize for the
3557 * common case, i.e. no error and/or partial data.
3558 * If there was an error, then we need to error the entire
3559 * range, even if some data was successfully read.
3560 *
3561 */
3562 if ((error == KERN_SUCCESS) && (residual == 0)) {
3563
3564 /*
3565 * Got everything we asked for, supply the data to
3566 * the new BS. Note that as a side effect of supplying
3567 * the data, the buffer holding the supplied data is
3568 * deallocated from the pager's address space unless
3569 * the write is unsuccessful.
3570 */
3571
3572 /* note buffer will be cleaned up in all cases by */
3573 /* internal_cluster_write or if an error on write */
3574 /* the vm_map_copy_page_discard call */
3575 *vsmap_ptr = write_vsmap;
3576
3577 if(vs_cluster_write(vs, upl, offset,
3578 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3579 error = KERN_FAILURE;
3580 if(!(VSM_ISCLR(*vsmap_ptr))) {
3581 /* unmap the new backing store object */
3582 ps_clunmap(vs, offset, size);
3583 }
3584 /* original vsmap */
3585 *vsmap_ptr = original_read_vsmap;
3586 VSM_CLR(write_vsmap);
3587 } else {
3588 if((offset + size) &
3589 ((vm_page_size << vs->vs_clshift)
3590 - 1)) {
3591 /* There is more to transfer in this
3592 cluster
3593 */
3594 write_vsmap = *vsmap_ptr;
3595 *vsmap_ptr = read_vsmap;
3596 } else {
3597 /* discard the old backing object */
3598 write_vsmap = *vsmap_ptr;
3599 *vsmap_ptr = read_vsmap;
3600 ps_clunmap(vs, offset, size);
3601 *vsmap_ptr = write_vsmap;
3602 VSM_CLR(write_vsmap);
3603 VSM_CLR(original_read_vsmap);
3604 }
3605 }
3606 } else {
3607 size_wanted = size;
3608 if (error == KERN_SUCCESS) {
3609 if (residual == size) {
3610 /*
3611 * If a read operation returns no error
3612 * and no data moved, we turn it into
3613 * an error, assuming we're reading at
3614 * or beyond EOF.
3615 * Fall through and error the entire
3616 * range.
3617 */
3618 error = KERN_FAILURE;
3619 *vsmap_ptr = write_vsmap;
3620 if(!(VSM_ISCLR(*vsmap_ptr))) {
3621 /* unmap the new backing store object */
3622 ps_clunmap(vs, offset, size);
3623 }
3624 *vsmap_ptr = original_read_vsmap;
3625 VSM_CLR(write_vsmap);
3626 continue;
3627 } else {
3628 /*
3629 * Otherwise, we have partial read.
3630 * This is also considered an error
3631 * for the purposes of cluster transfer
3632 */
3633 error = KERN_FAILURE;
3634 *vsmap_ptr = write_vsmap;
3635 if(!(VSM_ISCLR(*vsmap_ptr))) {
3636 /* unmap the new backing store object */
3637 ps_clunmap(vs, offset, size);
3638 }
3639 *vsmap_ptr = original_read_vsmap;
3640 VSM_CLR(write_vsmap);
3641 continue;
3642 }
3643 }
3644
3645 }
3646 cnt -= size;
3647 offset += size;
3648
3649 } /* END while (cnt && (error == 0)) */
3650 if(!VSM_ISCLR(write_vsmap))
3651 *vsmap_ptr = write_vsmap;
3652
3653 return error;
3654 }
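/*
 * The transfer loop above keeps the original map entry around so that a
 * failed write to the new backing store leaves the old cluster mapping in
 * effect.  A very small sketch of that save/attempt/restore shape;
 * struct demo_map and demo_migrate_cluster() are hypothetical.
 */
#if 0 /* illustration only -- never compiled */
#include <stdbool.h>

struct demo_map { unsigned int ps; unsigned int cloff; unsigned int bits; };

static bool
demo_migrate_cluster(struct demo_map *entry,
    bool (*write_new_copy)(const struct demo_map *))
{
        struct demo_map original = *entry;      /* original_read_vsmap role */

        if (!write_new_copy(entry)) {
                *entry = original;              /* failure: old store stays live */
                return false;
        }
        return true;                            /* success: old cluster can be freed */
}
#endif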
3655
3656 kern_return_t
3657 default_pager_add_file(
3658 MACH_PORT_FACE backing_store,
3659 vnode_ptr_t vp,
3660 int record_size,
3661 vm_size_t size)
3662 {
3663 backing_store_t bs;
3664 paging_segment_t ps;
3665 int i;
3666 unsigned int j;
3667 int error;
3668
3669 if ((bs = backing_store_lookup(backing_store))
3670 == BACKING_STORE_NULL)
3671 return KERN_INVALID_ARGUMENT;
3672
3673 PSL_LOCK();
3674 for (i = 0; i <= paging_segment_max; i++) {
3675 ps = paging_segments[i];
3676 if (ps == PAGING_SEGMENT_NULL)
3677 continue;
3678 if (ps->ps_segtype != PS_FILE)
3679 continue;
3680
3681 /*
3682 * Check for overlap on same device.
3683 */
3684 if (ps->ps_vnode == (struct vnode *)vp) {
3685 PSL_UNLOCK();
3686 BS_UNLOCK(bs);
3687 return KERN_INVALID_ARGUMENT;
3688 }
3689 }
3690 PSL_UNLOCK();
3691
3692 /*
3693 * Set up the paging segment
3694 */
3695 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3696 if (ps == PAGING_SEGMENT_NULL) {
3697 BS_UNLOCK(bs);
3698 return KERN_RESOURCE_SHORTAGE;
3699 }
3700
3701 ps->ps_segtype = PS_FILE;
3702 ps->ps_vnode = (struct vnode *)vp;
3703 ps->ps_offset = 0;
3704 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3705 ps->ps_recnum = size;
3706 ps->ps_pgnum = size >> ps->ps_record_shift;
3707
3708 ps->ps_pgcount = ps->ps_pgnum;
3709 ps->ps_clshift = local_log2(bs->bs_clsize);
3710 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3711 ps->ps_hint = 0;
3712
3713 PS_LOCK_INIT(ps);
3714 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3715 if (!ps->ps_bmap) {
3716 kfree(ps, sizeof *ps);
3717 BS_UNLOCK(bs);
3718 return KERN_RESOURCE_SHORTAGE;
3719 }
3720 for (j = 0; j < ps->ps_ncls; j++) {
3721 clrbit(ps->ps_bmap, j);
3722 }
3723
3724 ps->ps_going_away = FALSE;
3725 ps->ps_bs = bs;
3726
3727 if ((error = ps_enter(ps)) != 0) {
3728 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3729 kfree(ps, sizeof *ps);
3730 BS_UNLOCK(bs);
3731 return KERN_RESOURCE_SHORTAGE;
3732 }
3733
3734 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3735 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3736 PSL_LOCK();
3737 dp_pages_free += ps->ps_pgcount;
3738 PSL_UNLOCK();
3739
3740 BS_UNLOCK(bs);
3741
3742 bs_more_space(ps->ps_clcount);
3743
3744 DP_DEBUG(DEBUG_BS_INTERNAL,
3745 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3746 device, offset, size, record_size,
3747 ps->ps_record_shift, ps->ps_pgnum));
3748
3749 return KERN_SUCCESS;
3750 }
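/*
 * default_pager_add_file() above derives the paging-segment geometry from
 * the record size and the cluster size: a record shift, a page count, and
 * a cluster count.  A small sketch of the same derivation for a
 * hypothetical 1 GB swap file; demo_log2() plays the role of local_log2().
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

static unsigned int
demo_log2(unsigned int n)
{
        unsigned int s = 0;

        while ((1u << s) < n)
                s++;
        return s;
}

int
main(void)
{
        /* hypothetical: 1 GB of 512-byte records, 4 KB pages, 8-page clusters */
        unsigned int record_size = 512;
        unsigned int page_size = 4096;
        unsigned int recnum = (1u << 30) / record_size;

        unsigned int record_shift = demo_log2(page_size / record_size);
        unsigned int pgnum = recnum >> record_shift;    /* pages in the segment */
        unsigned int clshift = demo_log2(8);
        unsigned int clcount = pgnum >> clshift;        /* clusters in the segment */

        printf("records=%u shift=%u pages=%u clusters=%u\n",
            recnum, record_shift, pgnum, clcount);
        return 0;
}
#endif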
3751
3752
3753
3754 kern_return_t
3755 ps_read_file(
3756 paging_segment_t ps,
3757 upl_t upl,
3758 upl_offset_t upl_offset,
3759 vm_offset_t offset,
3760 upl_size_t size,
3761 unsigned int *residualp,
3762 int flags)
3763 {
3764 vm_object_offset_t f_offset;
3765 int error = 0;
3766 int result;
3767
3768 assert(dp_encryption_inited);
3769
3770 clustered_reads[atop_32(size)]++;
3771
3772 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3773
3774 /* for transfer case we need to pass uploffset and flags */
3775 error = vnode_pagein(ps->ps_vnode,
3776 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3777
3778 /* The vnode_pagein semantic is somewhat at odds with the existing */
3779 /* device_read semantic. Partial reads are not experienced at this */
3780 /* level. It is up to the bit map code and cluster read code to */
3781 /* check that requested data locations are actually backed, and the */
3782 /* pagein code to either read all of the requested data or return an */
3783 /* error. */
3784
3785 if (error)
3786 result = KERN_FAILURE;
3787 else {
3788 *residualp = 0;
3789 result = KERN_SUCCESS;
3790 }
3791 return result;
3792 }
3793
3794 kern_return_t
3795 ps_write_file(
3796 paging_segment_t ps,
3797 upl_t upl,
3798 upl_offset_t upl_offset,
3799 vm_offset_t offset,
3800 unsigned int size,
3801 int flags)
3802 {
3803 vm_object_offset_t f_offset;
3804 kern_return_t result;
3805
3806 assert(dp_encryption_inited);
3807
3808 clustered_writes[atop_32(size)]++;
3809 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3810
3811 if (flags & UPL_PAGING_ENCRYPTED) {
3812 /*
3813 * ENCRYPTED SWAP:
3814 * encrypt all the pages that we're going
3815 * to pageout.
3816 */
3817 upl_encrypt(upl, upl_offset, size);
3818 }
3819
3820 if (vnode_pageout(ps->ps_vnode,
3821 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3822 result = KERN_FAILURE;
3823 else
3824 result = KERN_SUCCESS;
3825
3826 return result;
3827 }
3828
3829 kern_return_t
3830 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
3831 int hi_wat,
3832 int lo_wat,
3833 int flags,
3834 MACH_PORT_FACE trigger_port)
3835 {
3836 MACH_PORT_FACE release;
3837 kern_return_t kr;
3838
3839 PSL_LOCK();
3840 if (flags == SWAP_ENCRYPT_ON) {
3841 /* ENCRYPTED SWAP: turn encryption on */
3842 release = trigger_port;
3843 if (!dp_encryption_inited) {
3844 dp_encryption_inited = TRUE;
3845 dp_encryption = TRUE;
3846 kr = KERN_SUCCESS;
3847 } else {
3848 kr = KERN_FAILURE;
3849 }
3850 } else if (flags == SWAP_ENCRYPT_OFF) {
3851 /* ENCRYPTED SWAP: turn encryption off */
3852 release = trigger_port;
3853 if (!dp_encryption_inited) {
3854 dp_encryption_inited = TRUE;
3855 dp_encryption = FALSE;
3856 kr = KERN_SUCCESS;
3857 } else {
3858 kr = KERN_FAILURE;
3859 }
3860 } else if (flags == HI_WAT_ALERT) {
3861 release = min_pages_trigger_port;
3862 min_pages_trigger_port = trigger_port;
3863 minimum_pages_remaining = hi_wat/vm_page_size;
3864 bs_low = FALSE;
3865 kr = KERN_SUCCESS;
3866 } else if (flags == LO_WAT_ALERT) {
3867 release = max_pages_trigger_port;
3868 max_pages_trigger_port = trigger_port;
3869 maximum_pages_free = lo_wat/vm_page_size;
3870 kr = KERN_SUCCESS;
3871 } else {
3872 release = trigger_port;
3873 kr = KERN_INVALID_ARGUMENT;
3874 }
3875 PSL_UNLOCK();
3876
3877 if (IP_VALID(release))
3878 ipc_port_release_send(release);
3879
3880 return kr;
3881 }
3882
3883 /*
3884 * Monitor the amount of available backing store vs. the amount of
3885 * required backing store, notify a listener (if present) when
3886 * backing store may safely be removed.
3887 *
3888 * We attempt to avoid the situation where backing store is
3889 * discarded en masse, as this can lead to thrashing as the
3890 * backing store is compacted.
3891 */
3892
3893 #define PF_INTERVAL 3 /* time between free level checks */
3894 #define PF_LATENCY 10 /* number of intervals before release */
3895
3896 static int dp_pages_free_low_count = 0;
3897 thread_call_t default_pager_backing_store_monitor_callout;
3898
3899 void
3900 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
3901 __unused thread_call_param_t p2)
3902 {
3903 // unsigned long long average;
3904 ipc_port_t trigger;
3905 uint64_t deadline;
3906
3907 /*
3908 * We determine whether it will be safe to release some
3909 * backing store by watching the free page level. If
3910 * it remains below the maximum_pages_free threshold for
3911 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3912 * then we deem it safe.
3913 *
3914 * Note that this establishes a maximum rate at which backing
3915 * store will be released, as each notification (currently)
3916 * only results in a single backing store object being
3917 * released.
3918 */
3919 if (dp_pages_free > maximum_pages_free) {
3920 dp_pages_free_low_count++;
3921 } else {
3922 dp_pages_free_low_count = 0;
3923 }
3924
3925 /* decide whether to send notification */
3926 trigger = IP_NULL;
3927 if (max_pages_trigger_port &&
3928 (backing_store_release_trigger_disable == 0) &&
3929 (dp_pages_free_low_count > PF_LATENCY)) {
3930 trigger = max_pages_trigger_port;
3931 max_pages_trigger_port = NULL;
3932 }
3933
3934 /* send notification */
3935 if (trigger != IP_NULL) {
3936 VSL_LOCK();
3937 if(backing_store_release_trigger_disable != 0) {
3938 assert_wait((event_t)
3939 &backing_store_release_trigger_disable,
3940 THREAD_UNINT);
3941 VSL_UNLOCK();
3942 thread_block(THREAD_CONTINUE_NULL);
3943 } else {
3944 VSL_UNLOCK();
3945 }
3946 default_pager_space_alert(trigger, LO_WAT_ALERT);
3947 ipc_port_release_send(trigger);
3948 dp_pages_free_low_count = 0;
3949 }
3950
3951 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
3952 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
3953 }
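/*
 * The monitor above applies simple hysteresis: the free-page level must
 * stay above the threshold for more than PF_LATENCY consecutive samples
 * before a LO_WAT_ALERT is sent, and any dip resets the count.  A tiny
 * sketch of that debounce with made-up sample values:
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

#define DEMO_LATENCY 10 /* consecutive good samples required (PF_LATENCY role) */

int
main(void)
{
        unsigned int free_samples[] = { 900, 950, 980, 400, 900, 910, 920, 930,
                                        940, 950, 960, 970, 980, 990, 1000 };
        unsigned int threshold = 800;
        unsigned int good_count = 0;
        unsigned int i;

        for (i = 0; i < sizeof(free_samples) / sizeof(free_samples[0]); i++) {
                if (free_samples[i] > threshold)
                        good_count++;
                else
                        good_count = 0;         /* dip below: start over */
                if (good_count > DEMO_LATENCY) {
                        printf("sample %u: safe to release backing store\n", i);
                        good_count = 0;         /* one notification per episode */
                }
        }
        return 0;
}
#endif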