1 /*
2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 /*
58 * Default Pager.
59 * Paging File Management.
60 */
61
62 #include <mach/host_priv.h>
63 #include <mach/memory_object_control.h>
64 #include <mach/memory_object_server.h>
65 #include <mach/upl.h>
66 #include <default_pager/default_pager_internal.h>
67 #include <default_pager/default_pager_alerts.h>
68 #include <default_pager/default_pager_object_server.h>
69
70 #include <ipc/ipc_types.h>
71 #include <ipc/ipc_port.h>
72 #include <ipc/ipc_space.h>
73
74 #include <kern/kern_types.h>
75 #include <kern/host.h>
76 #include <kern/queue.h>
77 #include <kern/counters.h>
78 #include <kern/sched_prim.h>
79
80 #include <vm/vm_kern.h>
81 #include <vm/vm_pageout.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_protos.h>
85
86
87 /* LP64todo - need large internal object support */
88
89 /*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, then this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
99
100 #define ALLOC_STRIDE (1024 * 1024 * 1024)
101 int physical_transfer_cluster_count = 0;
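/*
 * Worked example (assuming 4 KB pages, i.e. vm_page_shift == 12):
 * ps_select_segment() below moves on to the next equal-priority segment
 * once (ALLOC_STRIDE >> (ps_clshift + vm_page_shift)) clusters have been
 * handed out from the current one.  With the default cluster shift of 2
 * that is (1 << 30) >> (2 + 12) == 65536 clusters, i.e. a full gigabyte
 * is written to one swap file before the allocator rotates.
 */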
102
103 #define VM_SUPER_CLUSTER 0x40000
104 #define VM_SUPER_PAGES 64
105
106 /*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
110 #define VSTRUCT_DEF_CLSHIFT 2
111 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
112 int default_pager_clsize = 0;
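/*
 * Worked example: a cluster shift of VSTRUCT_DEF_CLSHIFT == 2 gives
 * 1 << 2 == 4 pages per cluster, i.e. 16 KB clusters on a 4 KB-page
 * system.  bs_set_default_clsize() below accepts only 1, 2, 4 or 8
 * pages per cluster, i.e. shifts of 0 through 3.
 */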
113
114 /* statistics */
115 unsigned int clustered_writes[VM_SUPER_PAGES+1];
116 unsigned int clustered_reads[VM_SUPER_PAGES+1];
117
118 /*
119 * Globals used for asynchronous paging operations:
120 * vs_async_list: head of list of to-be-completed I/O ops
121 * async_num_queued: number of pages completed, but not yet
122 * processed by async thread.
123 * async_requests_out: number of pages of requests not completed.
124 */
125
126 #if 0
127 struct vs_async *vs_async_list;
128 int async_num_queued;
129 int async_requests_out;
130 #endif
131
132
133 #define VS_ASYNC_REUSE 1
134 struct vs_async *vs_async_free_list;
135
136 mutex_t default_pager_async_lock; /* Protects globals above */
137
138
139 int vs_alloc_async_failed = 0; /* statistics */
140 int vs_alloc_async_count = 0; /* statistics */
141 struct vs_async *vs_alloc_async(void); /* forward */
142 void vs_free_async(struct vs_async *vsa); /* forward */
143
144
145 #define VS_ALLOC_ASYNC() vs_alloc_async()
146 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
147
148 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
149 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
150 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, 0)
151 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
152 /*
153 * Paging Space Hysteresis triggers and the target notification port
154 *
155 */
156
157 unsigned int minimum_pages_remaining = 0;
158 unsigned int maximum_pages_free = 0;
159 ipc_port_t min_pages_trigger_port = NULL;
160 ipc_port_t max_pages_trigger_port = NULL;
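/*
 * Summary of how the low-space trigger is used below (see
 * ps_select_segment() and ps_allocate_cluster()): when an allocation
 * drops dp_pages_free below minimum_pages_remaining, the allocator
 * consumes min_pages_trigger_port and, once the locks are dropped,
 * calls
 *	default_pager_space_alert(trigger, HI_WAT_ALERT);
 *	ipc_port_release_send(trigger);
 * bs_low may also be set to flag the shortage.  The path that registers
 * the trigger port is not shown here.
 */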
161
162 boolean_t bs_low = FALSE;
163 int backing_store_release_trigger_disable = 0;
164
165
166 /* Have we decided if swap needs to be encrypted yet ? */
167 boolean_t dp_encryption_inited = FALSE;
168 /* Should we encrypt swap ? */
169 boolean_t dp_encryption = FALSE;
170
171
172 /*
173 * Object sizes are rounded up to the next power of 2,
174 * unless they are bigger than a given maximum size.
175 */
176 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
177
178 /*
179 * List of all backing store and segments.
180 */
181 struct backing_store_list_head backing_store_list;
182 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
183 mutex_t paging_segments_lock;
184 int paging_segment_max = 0;
185 int paging_segment_count = 0;
186 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
187
188
189 /*
190 * Total pages free in system
191 * This differs from clusters committed/avail, which is a measure of the
192 * over-commitment of paging segments to backing store, an idea which is
193 * likely to be deprecated.
194 */
195 unsigned int dp_pages_free = 0;
196 unsigned int cluster_transfer_minimum = 100;
197
198 /* forward declarations */
199 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int); /* forward */
200 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
201 default_pager_thread_t *get_read_buffer( void );
202 kern_return_t ps_vstruct_transfer_from_segment(
203 vstruct_t vs,
204 paging_segment_t segment,
205 upl_t upl);
206 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
207 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
208 kern_return_t vs_cluster_transfer(
209 vstruct_t vs,
210 upl_offset_t offset,
211 upl_size_t cnt,
212 upl_t upl);
213 vs_map_t vs_get_map_entry(
214 vstruct_t vs,
215 vm_offset_t offset);
216
217
218 default_pager_thread_t *
219 get_read_buffer( void )
220 {
221 int i;
222
223 DPT_LOCK(dpt_lock);
224 while(TRUE) {
225 for (i=0; i<default_pager_internal_count; i++) {
226 if(dpt_array[i]->checked_out == FALSE) {
227 dpt_array[i]->checked_out = TRUE;
228 DPT_UNLOCK(dpt_lock);
229 return dpt_array[i];
230 }
231 }
232 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
233 }
234 }
235
236 void
237 bs_initialize(void)
238 {
239 int i;
240
241 /*
242 * List of all backing store.
243 */
244 BSL_LOCK_INIT();
245 queue_init(&backing_store_list.bsl_queue);
246 PSL_LOCK_INIT();
247
248 VS_ASYNC_LOCK_INIT();
249 #if VS_ASYNC_REUSE
250 vs_async_free_list = NULL;
251 #endif /* VS_ASYNC_REUSE */
252
253 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
254 clustered_writes[i] = 0;
255 clustered_reads[i] = 0;
256 }
257
258 }
259
260 /*
261 * When things do not quite work out...
262 */
263 void bs_no_paging_space(boolean_t); /* forward */
264
265 void
266 bs_no_paging_space(
267 boolean_t out_of_memory)
268 {
269
270 if (out_of_memory)
271 dprintf(("*** OUT OF MEMORY ***\n"));
272 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
273 }
274
275 void bs_more_space(int); /* forward */
276 void bs_commit(int); /* forward */
277
278 boolean_t user_warned = FALSE;
279 unsigned int clusters_committed = 0;
280 unsigned int clusters_available = 0;
281 unsigned int clusters_committed_peak = 0;
282
283 void
284 bs_more_space(
285 int nclusters)
286 {
287 BSL_LOCK();
288 /*
289 * Account for new paging space.
290 */
291 clusters_available += nclusters;
292
293 if (clusters_available >= clusters_committed) {
294 if (verbose && user_warned) {
295 printf("%s%s - %d excess clusters now.\n",
296 my_name,
297 "paging space is OK now",
298 clusters_available - clusters_committed);
299 user_warned = FALSE;
300 clusters_committed_peak = 0;
301 }
302 } else {
303 if (verbose && user_warned) {
304 printf("%s%s - still short of %d clusters.\n",
305 my_name,
306 "WARNING: paging space over-committed",
307 clusters_committed - clusters_available);
308 clusters_committed_peak -= nclusters;
309 }
310 }
311 BSL_UNLOCK();
312
313 return;
314 }
315
316 void
317 bs_commit(
318 int nclusters)
319 {
320 BSL_LOCK();
321 clusters_committed += nclusters;
322 if (clusters_committed > clusters_available) {
323 if (verbose && !user_warned) {
324 user_warned = TRUE;
325 printf("%s%s - short of %d clusters.\n",
326 my_name,
327 "WARNING: paging space over-committed",
328 clusters_committed - clusters_available);
329 }
330 if (clusters_committed > clusters_committed_peak) {
331 clusters_committed_peak = clusters_committed;
332 }
333 } else {
334 if (verbose && user_warned) {
335 printf("%s%s - was short of up to %d clusters.\n",
336 my_name,
337 "paging space is OK now",
338 clusters_committed_peak - clusters_available);
339 user_warned = FALSE;
340 clusters_committed_peak = 0;
341 }
342 }
343 BSL_UNLOCK();
344
345 return;
346 }
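/*
 * Worked example of the accounting above: with clusters_available at
 * 100, creating two 64-cluster objects calls bs_commit(64) twice; the
 * second call pushes clusters_committed to 128, records the new
 * 128-cluster peak and, if verbose and the user has not yet been
 * warned, prints "WARNING: paging space over-committed - short of 28
 * clusters."
 */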
347
348 int default_pager_info_verbose = 1;
349
350 void
351 bs_global_info(
352 vm_size_t *totalp,
353 vm_size_t *freep)
354 {
355 vm_size_t pages_total, pages_free;
356 paging_segment_t ps;
357 int i;
358
359 PSL_LOCK();
360 pages_total = pages_free = 0;
361 for (i = 0; i <= paging_segment_max; i++) {
362 ps = paging_segments[i];
363 if (ps == PAGING_SEGMENT_NULL)
364 continue;
365
366 /*
367 * no need to lock: by the time this data
368 * gets back to any remote requestor it
369 * will be obsolete anyway
370 */
371 pages_total += ps->ps_pgnum;
372 pages_free += ps->ps_clcount << ps->ps_clshift;
373 DP_DEBUG(DEBUG_BS_INTERNAL,
374 ("segment #%d: %d total, %d free\n",
375 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
376 }
377 *totalp = pages_total;
378 *freep = pages_free;
379 if (verbose && user_warned && default_pager_info_verbose) {
380 if (clusters_available < clusters_committed) {
381 printf("%s %d clusters committed, %d available.\n",
382 my_name,
383 clusters_committed,
384 clusters_available);
385 }
386 }
387 PSL_UNLOCK();
388 }
389
390 backing_store_t backing_store_alloc(void); /* forward */
391
392 backing_store_t
393 backing_store_alloc(void)
394 {
395 backing_store_t bs;
396
397 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
398 if (bs == BACKING_STORE_NULL)
399 panic("backing_store_alloc: no memory");
400
401 BS_LOCK_INIT(bs);
402 bs->bs_port = MACH_PORT_NULL;
403 bs->bs_priority = 0;
404 bs->bs_clsize = 0;
405 bs->bs_pages_total = 0;
406 bs->bs_pages_in = 0;
407 bs->bs_pages_in_fail = 0;
408 bs->bs_pages_out = 0;
409 bs->bs_pages_out_fail = 0;
410
411 return bs;
412 }
413
414 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
415
416 /* In both the component space and external versions of this pager, */
417 /* backing_store_lookup will be called from tasks in the application space. */
418 backing_store_t
419 backing_store_lookup(
420 MACH_PORT_FACE port)
421 {
422 backing_store_t bs;
423
424 /*
425 The port is currently backed with a vs structure in the alias field.
426 We could create an ISBS alias and a port_is_bs call, but frankly
427 I see no reason for the test; the bs->bs_port == port check below
428 will work properly on junk entries.
429
430 if ((port == MACH_PORT_NULL) || port_is_vs(port))
431 */
432 if ((port == MACH_PORT_NULL))
433 return BACKING_STORE_NULL;
434
435 BSL_LOCK();
436 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
437 bs_links) {
438 BS_LOCK(bs);
439 if (bs->bs_port == port) {
440 BSL_UNLOCK();
441 /* Success, return it locked. */
442 return bs;
443 }
444 BS_UNLOCK(bs);
445 }
446 BSL_UNLOCK();
447 return BACKING_STORE_NULL;
448 }
449
450 void backing_store_add(backing_store_t); /* forward */
451
452 void
453 backing_store_add(
454 __unused backing_store_t bs)
455 {
456 // MACH_PORT_FACE port = bs->bs_port;
457 // MACH_PORT_FACE pset = default_pager_default_set;
458 kern_return_t kr = KERN_SUCCESS;
459
460 if (kr != KERN_SUCCESS)
461 panic("backing_store_add: add to set");
462
463 }
464
465 /*
466 * Set up default page shift, but only if not already
467 * set and argument is within range.
468 */
469 boolean_t
470 bs_set_default_clsize(unsigned int npages)
471 {
472 switch(npages){
473 case 1:
474 case 2:
475 case 4:
476 case 8:
477 if (default_pager_clsize == 0) /* if not yet set */
478 vstruct_def_clshift = local_log2(npages);
479 return(TRUE);
480 }
481 return(FALSE);
482 }
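/*
 * Hypothetical usage (for illustration only): an early initialization
 * path that wanted 4-page clusters could call
 *
 *	if (!bs_set_default_clsize(4))
 *		printf("%sunsupported cluster size\n", my_name);
 *
 * before the first paging file is added.  Once default_pager_clsize has
 * been fixed by bs_get_global_clsize(), later calls no longer change
 * the shift.
 */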
483
484 int bs_get_global_clsize(int clsize); /* forward */
485
486 int
487 bs_get_global_clsize(
488 int clsize)
489 {
490 int i;
491 memory_object_default_t dmm;
492 kern_return_t kr;
493
494 /*
495 * Only allow setting of cluster size once. If called
496 * with no cluster size (default), we use the compiled-in default
497 * for the duration. The same cluster size is used for all
498 * paging segments.
499 */
500 if (default_pager_clsize == 0) {
501 /*
502 * Keep the cluster size as a bit shift because the arithmetic
503 * is quicker and it's easier to keep at a power of 2.
504 */
505 if (clsize != NO_CLSIZE) {
506 for (i = 0; (1 << i) < clsize; i++);
507 if (i > MAX_CLUSTER_SHIFT)
508 i = MAX_CLUSTER_SHIFT;
509 vstruct_def_clshift = i;
510 }
511 default_pager_clsize = (1 << vstruct_def_clshift);
512
513 /*
514 * Let the user know the new (and definitive) cluster size.
515 */
516 if (verbose)
517 printf("%scluster size = %d page%s\n",
518 my_name, default_pager_clsize,
519 (default_pager_clsize == 1) ? "" : "s");
520
521 /*
522 * Let the kernel know too, in case it hasn't used the
523 * default value provided in main() yet.
524 */
525 dmm = default_pager_object;
526 clsize = default_pager_clsize * vm_page_size; /* in bytes */
527 kr = host_default_memory_manager(host_priv_self(),
528 &dmm,
529 clsize);
530 memory_object_default_deallocate(dmm);
531
532 if (kr != KERN_SUCCESS) {
533 panic("bs_get_global_cl_size:host_default_memory_manager");
534 }
535 if (dmm != default_pager_object) {
536 panic("bs_get_global_cl_size:there is another default pager");
537 }
538 }
539 ASSERT(default_pager_clsize > 0 &&
540 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
541
542 return default_pager_clsize;
543 }
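/*
 * Worked example for the shift computation above: a request for
 * clsize == 6 pages leaves the loop at i == 3 (the first shift with
 * 1 << i >= 6), so the cluster size is rounded up to 8 pages, subject
 * to the MAX_CLUSTER_SHIFT cap.  Passing NO_CLSIZE keeps the
 * compiled-in VSTRUCT_DEF_CLSHIFT.
 */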
544
545 kern_return_t
546 default_pager_backing_store_create(
547 memory_object_default_t pager,
548 int priority,
549 int clsize, /* in bytes */
550 MACH_PORT_FACE *backing_store)
551 {
552 backing_store_t bs;
553 MACH_PORT_FACE port;
554 // kern_return_t kr;
555 struct vstruct_alias *alias_struct;
556
557 if (pager != default_pager_object)
558 return KERN_INVALID_ARGUMENT;
559
560 bs = backing_store_alloc();
561 port = ipc_port_alloc_kernel();
562 ipc_port_make_send(port);
563 assert (port != IP_NULL);
564
565 DP_DEBUG(DEBUG_BS_EXTERNAL,
566 ("priority=%d clsize=%d bs_port=0x%x\n",
567 priority, clsize, (int) backing_store));
568
569 alias_struct = (struct vstruct_alias *)
570 kalloc(sizeof (struct vstruct_alias));
571 if(alias_struct != NULL) {
572 alias_struct->vs = (struct vstruct *)bs;
573 alias_struct->name = &default_pager_ops;
574 port->alias = (int) alias_struct;
575 }
576 else {
577 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
578 kfree(bs, sizeof (struct backing_store));
579 return KERN_RESOURCE_SHORTAGE;
580 }
581
582 bs->bs_port = port;
583 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
584 priority = BS_MAXPRI;
585 else if (priority == BS_NOPRI)
586 priority = BS_MAXPRI;
587 else
588 priority = BS_MINPRI;
589 bs->bs_priority = priority;
590
591 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
592
593 BSL_LOCK();
594 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
595 bs_links);
596 BSL_UNLOCK();
597
598 backing_store_add(bs);
599
600 *backing_store = port;
601 return KERN_SUCCESS;
602 }
603
604 kern_return_t
605 default_pager_backing_store_info(
606 MACH_PORT_FACE backing_store,
607 backing_store_flavor_t flavour,
608 backing_store_info_t info,
609 mach_msg_type_number_t *size)
610 {
611 backing_store_t bs;
612 backing_store_basic_info_t basic;
613 int i;
614 paging_segment_t ps;
615
616 if (flavour != BACKING_STORE_BASIC_INFO ||
617 *size < BACKING_STORE_BASIC_INFO_COUNT)
618 return KERN_INVALID_ARGUMENT;
619
620 basic = (backing_store_basic_info_t)info;
621 *size = BACKING_STORE_BASIC_INFO_COUNT;
622
623 VSTATS_LOCK(&global_stats.gs_lock);
624 basic->pageout_calls = global_stats.gs_pageout_calls;
625 basic->pagein_calls = global_stats.gs_pagein_calls;
626 basic->pages_in = global_stats.gs_pages_in;
627 basic->pages_out = global_stats.gs_pages_out;
628 basic->pages_unavail = global_stats.gs_pages_unavail;
629 basic->pages_init = global_stats.gs_pages_init;
630 basic->pages_init_writes= global_stats.gs_pages_init_writes;
631 VSTATS_UNLOCK(&global_stats.gs_lock);
632
633 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
634 return KERN_INVALID_ARGUMENT;
635
636 basic->bs_pages_total = bs->bs_pages_total;
637 PSL_LOCK();
638 bs->bs_pages_free = 0;
639 for (i = 0; i <= paging_segment_max; i++) {
640 ps = paging_segments[i];
641 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
642 PS_LOCK(ps);
643 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
644 PS_UNLOCK(ps);
645 }
646 }
647 PSL_UNLOCK();
648 basic->bs_pages_free = bs->bs_pages_free;
649 basic->bs_pages_in = bs->bs_pages_in;
650 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
651 basic->bs_pages_out = bs->bs_pages_out;
652 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
653
654 basic->bs_priority = bs->bs_priority;
655 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
656
657 BS_UNLOCK(bs);
658
659 return KERN_SUCCESS;
660 }
661
662 int ps_delete(paging_segment_t); /* forward */
663
664 int
665 ps_delete(
666 paging_segment_t ps)
667 {
668 vstruct_t vs;
669 kern_return_t error = KERN_SUCCESS;
670 int vs_count;
671
672 VSL_LOCK(); /* get the lock on the list of vs's */
673
674 /* The lock relationship and sequence are fairly complicated: */
675 /* this code looks at a live list, locking and unlocking the list */
676 /* as it traverses it. It depends on the locking behavior of */
677 /* default_pager_no_senders. no_senders always locks the vstruct */
678 /* targeted for removal before locking the vstruct list. However */
679 /* it will remove that member of the list without locking its */
680 /* neighbors. We can be sure when we hold a lock on a vstruct */
681 /* it cannot be removed from the list but we must hold the list */
682 /* lock to be sure that its pointers to its neighbors are valid. */
683 /* Also, we can hold off destruction of a vstruct when the list */
684 /* lock and the vs locks are not being held by bumping the */
685 /* vs_async_pending count. */
686
687
688 while(backing_store_release_trigger_disable != 0) {
689 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
690 }
691
692 /* we will choose instead to hold a send right */
693 vs_count = vstruct_list.vsl_count;
694 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
695 if(vs == (vstruct_t)&vstruct_list) {
696 VSL_UNLOCK();
697 return KERN_SUCCESS;
698 }
699 VS_LOCK(vs);
700 vs_async_wait(vs); /* wait for any pending async writes */
701 if ((vs_count != 0) && (vs != NULL))
702 vs->vs_async_pending += 1; /* hold parties calling */
703 /* vs_async_wait */
704 VS_UNLOCK(vs);
705 VSL_UNLOCK();
706 while((vs_count != 0) && (vs != NULL)) {
707 /* We take the count of AMO's before beginning the */
708 /* transfer of the target segment. */
709 /* We are guaranteed that the target segment cannot get */
710 /* more users. We also know that queue entries are */
711 /* made at the back of the list. If some of the entries */
712 /* we would check disappear while we are traversing the */
713 /* list then we will either check new entries which */
714 /* do not have any backing store in the target segment */
715 /* or re-check old entries. This might not be optimal */
716 /* but it will always be correct. The alternative is to */
717 /* take a snapshot of the list. */
718 vstruct_t next_vs;
719
720 if(dp_pages_free < cluster_transfer_minimum)
721 error = KERN_FAILURE;
722 else {
723 vm_object_t transfer_object;
724 unsigned int count;
725 upl_t upl;
726
727 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
728 count = 0;
729 error = vm_object_upl_request(transfer_object,
730 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
731 &upl, NULL, &count,
732 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_LITE | UPL_SET_INTERNAL);
733
734 if(error == KERN_SUCCESS) {
735 error = ps_vstruct_transfer_from_segment(
736 vs, ps, upl);
737 upl_commit(upl, NULL, 0);
738 upl_deallocate(upl);
739 } else {
740 error = KERN_FAILURE;
741 }
742 vm_object_deallocate(transfer_object);
743 }
744 if(error) {
745 VS_LOCK(vs);
746 vs->vs_async_pending -= 1; /* release vs_async_wait */
747 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
748 vs->vs_waiting_async = FALSE;
749 VS_UNLOCK(vs);
750 thread_wakeup(&vs->vs_async_pending);
751 } else {
752 VS_UNLOCK(vs);
753 }
754 return KERN_FAILURE;
755 }
756
757 VSL_LOCK();
758
759 while(backing_store_release_trigger_disable != 0) {
760 VSL_SLEEP(&backing_store_release_trigger_disable,
761 THREAD_UNINT);
762 }
763
764 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
765 if((next_vs != (vstruct_t)&vstruct_list) &&
766 (vs != next_vs) && (vs_count != 1)) {
767 VS_LOCK(next_vs);
768 vs_async_wait(next_vs); /* wait for any */
769 /* pending async writes */
770 next_vs->vs_async_pending += 1; /* hold parties */
771 /* calling vs_async_wait */
772 VS_UNLOCK(next_vs);
773 }
774 VSL_UNLOCK();
775 VS_LOCK(vs);
776 vs->vs_async_pending -= 1;
777 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
778 vs->vs_waiting_async = FALSE;
779 VS_UNLOCK(vs);
780 thread_wakeup(&vs->vs_async_pending);
781 } else {
782 VS_UNLOCK(vs);
783 }
784 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
785 vs = NULL;
786 else
787 vs = next_vs;
788 vs_count--;
789 }
790 return KERN_SUCCESS;
791 }
792
793
794 kern_return_t
795 default_pager_backing_store_delete(
796 MACH_PORT_FACE backing_store)
797 {
798 backing_store_t bs;
799 int i;
800 paging_segment_t ps;
801 int error;
802 int interim_pages_removed = 0;
803 // kern_return_t kr;
804
805 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
806 return KERN_INVALID_ARGUMENT;
807
808 #if 0
809 /* not implemented */
810 BS_UNLOCK(bs);
811 return KERN_FAILURE;
812 #endif
813
814 restart:
815 PSL_LOCK();
816 error = KERN_SUCCESS;
817 for (i = 0; i <= paging_segment_max; i++) {
818 ps = paging_segments[i];
819 if (ps != PAGING_SEGMENT_NULL &&
820 ps->ps_bs == bs &&
821 ! ps->ps_going_away) {
822 PS_LOCK(ps);
823 /* disable access to this segment */
824 ps->ps_going_away = TRUE;
825 PS_UNLOCK(ps);
826 /*
827 * The "ps" segment is "off-line" now,
828 * we can try and delete it...
829 */
830 if(dp_pages_free < (cluster_transfer_minimum
831 + ps->ps_pgcount)) {
832 error = KERN_FAILURE;
833 PSL_UNLOCK();
834 }
835 else {
836 /* remove all pages associated with the */
837 /* segment from the list of free pages */
838 /* when transfer is through, all target */
839 /* segment pages will appear to be free */
840
841 dp_pages_free -= ps->ps_pgcount;
842 interim_pages_removed += ps->ps_pgcount;
843 PSL_UNLOCK();
844 error = ps_delete(ps);
845 }
846 if (error != KERN_SUCCESS) {
847 /*
848 * We couldn't delete the segment,
849 * probably because there's not enough
850 * virtual memory left.
851 * Re-enable all the segments.
852 */
853 PSL_LOCK();
854 break;
855 }
856 goto restart;
857 }
858 }
859
860 if (error != KERN_SUCCESS) {
861 for (i = 0; i <= paging_segment_max; i++) {
862 ps = paging_segments[i];
863 if (ps != PAGING_SEGMENT_NULL &&
864 ps->ps_bs == bs &&
865 ps->ps_going_away) {
866 PS_LOCK(ps);
867 /* re-enable access to this segment */
868 ps->ps_going_away = FALSE;
869 PS_UNLOCK(ps);
870 }
871 }
872 dp_pages_free += interim_pages_removed;
873 PSL_UNLOCK();
874 BS_UNLOCK(bs);
875 return error;
876 }
877
878 for (i = 0; i <= paging_segment_max; i++) {
879 ps = paging_segments[i];
880 if (ps != PAGING_SEGMENT_NULL &&
881 ps->ps_bs == bs) {
882 if(ps->ps_going_away) {
883 paging_segments[i] = PAGING_SEGMENT_NULL;
884 paging_segment_count--;
885 PS_LOCK(ps);
886 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
887 kfree(ps, sizeof *ps);
888 }
889 }
890 }
891
892 /* Scan the entire ps array separately to make certain we find the */
893 /* proper paging_segment_max */
894 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
895 if(paging_segments[i] != PAGING_SEGMENT_NULL)
896 paging_segment_max = i;
897 }
898
899 PSL_UNLOCK();
900
901 /*
902 * All the segments have been deleted.
903 * We can remove the backing store.
904 */
905
906 /*
907 * Disable lookups of this backing store.
908 */
909 if((void *)bs->bs_port->alias != NULL)
910 kfree((void *) bs->bs_port->alias,
911 sizeof (struct vstruct_alias));
912 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
913 bs->bs_port = MACH_PORT_NULL;
914 BS_UNLOCK(bs);
915
916 /*
917 * Remove backing store from backing_store list.
918 */
919 BSL_LOCK();
920 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
921 bs_links);
922 BSL_UNLOCK();
923
924 /*
925 * Free the backing store structure.
926 */
927 kfree(bs, sizeof *bs);
928
929 return KERN_SUCCESS;
930 }
931
932 int ps_enter(paging_segment_t); /* forward */
933
934 int
935 ps_enter(
936 paging_segment_t ps)
937 {
938 int i;
939
940 PSL_LOCK();
941
942 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
943 if (paging_segments[i] == PAGING_SEGMENT_NULL)
944 break;
945 }
946
947 if (i < MAX_NUM_PAGING_SEGMENTS) {
948 paging_segments[i] = ps;
949 if (i > paging_segment_max)
950 paging_segment_max = i;
951 paging_segment_count++;
952 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
953 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
954 ps_select_array[ps->ps_bs->bs_priority] = 0;
955 i = 0;
956 } else {
957 PSL_UNLOCK();
958 return KERN_RESOURCE_SHORTAGE;
959 }
960
961 PSL_UNLOCK();
962 return i;
963 }
964
965 #ifdef DEVICE_PAGING
966 kern_return_t
967 default_pager_add_segment(
968 MACH_PORT_FACE backing_store,
969 MACH_PORT_FACE device,
970 recnum_t offset,
971 recnum_t count,
972 int record_size)
973 {
974 backing_store_t bs;
975 paging_segment_t ps;
976 int i;
977 int error;
978
979 if ((bs = backing_store_lookup(backing_store))
980 == BACKING_STORE_NULL)
981 return KERN_INVALID_ARGUMENT;
982
983 PSL_LOCK();
984 for (i = 0; i <= paging_segment_max; i++) {
985 ps = paging_segments[i];
986 if (ps == PAGING_SEGMENT_NULL)
987 continue;
988
989 /*
990 * Check for overlap on same device.
991 */
992 if (!(ps->ps_device != device
993 || offset >= ps->ps_offset + ps->ps_recnum
994 || offset + count <= ps->ps_offset)) {
995 PSL_UNLOCK();
996 BS_UNLOCK(bs);
997 return KERN_INVALID_ARGUMENT;
998 }
999 }
1000 PSL_UNLOCK();
1001
1002 /*
1003 * Set up the paging segment
1004 */
1005 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1006 if (ps == PAGING_SEGMENT_NULL) {
1007 BS_UNLOCK(bs);
1008 return KERN_RESOURCE_SHORTAGE;
1009 }
1010
1011 ps->ps_segtype = PS_PARTITION;
1012 ps->ps_device = device;
1013 ps->ps_offset = offset;
1014 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1015 ps->ps_recnum = count;
1016 ps->ps_pgnum = count >> ps->ps_record_shift;
1017
1018 ps->ps_pgcount = ps->ps_pgnum;
1019 ps->ps_clshift = local_log2(bs->bs_clsize);
1020 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1021 ps->ps_hint = 0;
1022
1023 PS_LOCK_INIT(ps);
1024 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1025 if (!ps->ps_bmap) {
1026 kfree(ps, sizeof *ps);
1027 BS_UNLOCK(bs);
1028 return KERN_RESOURCE_SHORTAGE;
1029 }
1030 for (i = 0; i < ps->ps_ncls; i++) {
1031 clrbit(ps->ps_bmap, i);
1032 }
1033
1034 ps->ps_going_away = FALSE;
1035 ps->ps_bs = bs;
1036
1037 if ((error = ps_enter(ps)) != 0) {
1038 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1039 kfree(ps, sizeof *ps);
1040 BS_UNLOCK(bs);
1041 return KERN_RESOURCE_SHORTAGE;
1042 }
1043
1044 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1045 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1046 BS_UNLOCK(bs);
1047
1048 PSL_LOCK();
1049 dp_pages_free += ps->ps_pgcount;
1050 PSL_UNLOCK();
1051
1052 bs_more_space(ps->ps_clcount);
1053
1054 DP_DEBUG(DEBUG_BS_INTERNAL,
1055 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1056 device, offset, count, record_size,
1057 ps->ps_record_shift, ps->ps_pgnum));
1058
1059 return KERN_SUCCESS;
1060 }
1061
1062 boolean_t
1063 bs_add_device(
1064 char *dev_name,
1065 MACH_PORT_FACE master)
1066 {
1067 security_token_t null_security_token = {
1068 { 0, 0 }
1069 };
1070 MACH_PORT_FACE device;
1071 int info[DEV_GET_SIZE_COUNT];
1072 mach_msg_type_number_t info_count;
1073 MACH_PORT_FACE bs = MACH_PORT_NULL;
1074 unsigned int rec_size;
1075 recnum_t count;
1076 int clsize;
1077 MACH_PORT_FACE reply_port;
1078
1079 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1080 null_security_token, dev_name, &device))
1081 return FALSE;
1082
1083 info_count = DEV_GET_SIZE_COUNT;
1084 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1085 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1086 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1087 clsize = bs_get_global_clsize(0);
1088 if (!default_pager_backing_store_create(
1089 default_pager_object,
1090 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1091 (clsize * vm_page_size),
1092 &bs)) {
1093 if (!default_pager_add_segment(bs, device,
1094 0, count, rec_size)) {
1095 return TRUE;
1096 }
1097 ipc_port_release_receive(bs);
1098 }
1099 }
1100
1101 ipc_port_release_send(device);
1102 return FALSE;
1103 }
1104 #endif /* DEVICE_PAGING */
1105
1106 #if VS_ASYNC_REUSE
1107
1108 struct vs_async *
1109 vs_alloc_async(void)
1110 {
1111 struct vs_async *vsa;
1112 MACH_PORT_FACE reply_port;
1113 // kern_return_t kr;
1114
1115 VS_ASYNC_LOCK();
1116 if (vs_async_free_list == NULL) {
1117 VS_ASYNC_UNLOCK();
1118 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1119 if (vsa != NULL) {
1120 /*
1121 * Try allocating a reply port named after the
1122 * address of the vs_async structure.
1123 */
1124 struct vstruct_alias *alias_struct;
1125
1126 reply_port = ipc_port_alloc_kernel();
1127 alias_struct = (struct vstruct_alias *)
1128 kalloc(sizeof (struct vstruct_alias));
1129 if(alias_struct != NULL) {
1130 alias_struct->vs = (struct vstruct *)vsa;
1131 alias_struct->name = &default_pager_ops;
1132 reply_port->alias = (int) alias_struct;
1133 vsa->reply_port = reply_port;
1134 vs_alloc_async_count++;
1135 }
1136 else {
1137 vs_alloc_async_failed++;
1138 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1139 (reply_port));
1140 kfree(vsa, sizeof (struct vs_async));
1141 vsa = NULL;
1142 }
1143 }
1144 } else {
1145 vsa = vs_async_free_list;
1146 vs_async_free_list = vs_async_free_list->vsa_next;
1147 VS_ASYNC_UNLOCK();
1148 }
1149
1150 return vsa;
1151 }
1152
1153 void
1154 vs_free_async(
1155 struct vs_async *vsa)
1156 {
1157 VS_ASYNC_LOCK();
1158 vsa->vsa_next = vs_async_free_list;
1159 vs_async_free_list = vsa;
1160 VS_ASYNC_UNLOCK();
1161 }
1162
1163 #else /* VS_ASYNC_REUSE */
1164
1165 struct vs_async *
1166 vs_alloc_async(void)
1167 {
1168 struct vs_async *vsa;
1169 MACH_PORT_FACE reply_port;
1170 kern_return_t kr;
struct vstruct_alias *alias_struct;
1171
1172 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1173 if (vsa != NULL) {
1174 /*
1175 * Try allocating a reply port named after the
1176 * address of the vs_async structure.
1177 */
1178 reply_port = ipc_port_alloc_kernel();
1179 alias_struct = (struct vstruct_alias *)
1180 kalloc(sizeof (struct vstruct_alias));
1181 if(alias_struct != NULL) {
1182 alias_struct->vs = (struct vstruct *)vsa;
1183 alias_struct->name = &default_pager_ops;
1184 reply_port->alias = (int) alias_struct;
1185 vsa->reply_port = reply_port;
1186 vs_alloc_async_count++;
1187 }
1188 else {
1189 vs_alloc_async_failed++;
1190 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1191 (reply_port));
1192 kfree(vsa, sizeof (struct vs_async));
1193 vsa = NULL;
1194 }
1195 }
1196
1197 return vsa;
1198 }
1199
1200 void
1201 vs_free_async(
1202 struct vs_async *vsa)
1203 {
1204 MACH_PORT_FACE reply_port;
1205 kern_return_t kr;
1206
1207 reply_port = vsa->reply_port;
1208 kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1209 kfree(vsa, sizeof (struct vs_async));
1210 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1211 #if 0
1212 VS_ASYNC_LOCK();
1213 vs_alloc_async_count--;
1214 VS_ASYNC_UNLOCK();
1215 #endif
1216 }
1217
1218 #endif /* VS_ASYNC_REUSE */
1219
1220 zone_t vstruct_zone;
1221
1222 vstruct_t
1223 ps_vstruct_create(
1224 vm_size_t size)
1225 {
1226 vstruct_t vs;
1227 unsigned int i;
1228
1229 vs = (vstruct_t) zalloc(vstruct_zone);
1230 if (vs == VSTRUCT_NULL) {
1231 return VSTRUCT_NULL;
1232 }
1233
1234 VS_LOCK_INIT(vs);
1235
1236 /*
1237 * The following fields will be provided later.
1238 */
1239 vs->vs_pager_ops = NULL;
1240 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1241 vs->vs_references = 1;
1242 vs->vs_seqno = 0;
1243
1244 #ifdef MACH_KERNEL
1245 vs->vs_waiting_seqno = FALSE;
1246 vs->vs_waiting_read = FALSE;
1247 vs->vs_waiting_write = FALSE;
1248 vs->vs_waiting_async = FALSE;
1249 #else
1250 mutex_init(&vs->vs_waiting_seqno, 0);
1251 mutex_init(&vs->vs_waiting_read, 0);
1252 mutex_init(&vs->vs_waiting_write, 0);
1253 mutex_init(&vs->vs_waiting_refs, 0);
1254 mutex_init(&vs->vs_waiting_async, 0);
1255 #endif
1256
1257 vs->vs_readers = 0;
1258 vs->vs_writers = 0;
1259
1260 vs->vs_errors = 0;
1261
1262 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1263 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1264 vs->vs_async_pending = 0;
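/*
 * Worked example for the vs_size computation above (assuming 4 KB
 * pages): a 1 MB object is 256 pages; with a cluster shift of 2 that
 * is ((256 - 1) >> 2) + 1 == 64 clusters, i.e. the page count rounded
 * up to whole clusters.
 */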
1265
1266 /*
1267 * Allocate the cluster map: either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1268 * bytes, depending on the size of the memory object.
1269 */
1270 if (INDIRECT_CLMAP(vs->vs_size)) {
1271 vs->vs_imap = (struct vs_map **)
1272 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1273 vs->vs_indirect = TRUE;
1274 } else {
1275 vs->vs_dmap = (struct vs_map *)
1276 kalloc(CLMAP_SIZE(vs->vs_size));
1277 vs->vs_indirect = FALSE;
1278 }
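/*
 * Sketch of the resulting lookup (see ps_clmap() below): with an
 * indirect map, the entry for a given cluster lives at
 *	vs->vs_imap[cluster / CLMAP_ENTRIES][cluster % CLMAP_ENTRIES]
 * whereas a direct map is simply vs->vs_dmap[cluster].
 */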
1279 vs->vs_xfer_pending = FALSE;
1280 DP_DEBUG(DEBUG_VS_INTERNAL,
1281 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1282
1283 /*
1284 * Check to see that we got the space.
1285 */
1286 if (!vs->vs_dmap) {
1287 kfree(vs, sizeof *vs);
1288 return VSTRUCT_NULL;
1289 }
1290
1291 /*
1292 * Zero the indirect pointers, or clear the direct pointers.
1293 */
1294 if (vs->vs_indirect)
1295 memset(vs->vs_imap, 0,
1296 INDIRECT_CLMAP_SIZE(vs->vs_size));
1297 else
1298 for (i = 0; i < vs->vs_size; i++)
1299 VSM_CLR(vs->vs_dmap[i]);
1300
1301 VS_MAP_LOCK_INIT(vs);
1302
1303 bs_commit(vs->vs_size);
1304
1305 return vs;
1306 }
1307
1308 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1309
1310 paging_segment_t
1311 ps_select_segment(
1312 unsigned int shift,
1313 int *psindex)
1314 {
1315 paging_segment_t ps;
1316 int i;
1317 int j;
1318
1319 /*
1320 * Optimize case where there's only one segment.
1321 * paging_segment_max will index the one and only segment.
1322 */
1323
1324 PSL_LOCK();
1325 if (paging_segment_count == 1) {
1326 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1327 ipc_port_t trigger = IP_NULL;
1328
1329 ps = paging_segments[paging_segment_max];
1330 *psindex = paging_segment_max;
1331 PS_LOCK(ps);
1332 if (ps->ps_going_away) {
1333 /* this segment is being turned off */
1334 lps = PAGING_SEGMENT_NULL;
1335 } else {
1336 ASSERT(ps->ps_clshift >= shift);
1337 if (ps->ps_clcount) {
1338 ps->ps_clcount--;
1339 dp_pages_free -= 1 << ps->ps_clshift;
1340 if(min_pages_trigger_port &&
1341 (dp_pages_free < minimum_pages_remaining)) {
1342 trigger = min_pages_trigger_port;
1343 min_pages_trigger_port = NULL;
1344 bs_low = TRUE;
1345 }
1346 lps = ps;
1347 } else
1348 lps = PAGING_SEGMENT_NULL;
1349 }
1350 PS_UNLOCK(ps);
1351 PSL_UNLOCK();
1352
1353 if (trigger != IP_NULL) {
1354 default_pager_space_alert(trigger, HI_WAT_ALERT);
1355 ipc_port_release_send(trigger);
1356 }
1357 return lps;
1358 }
1359
1360 if (paging_segment_count == 0) {
1361 PSL_UNLOCK();
1362 return PAGING_SEGMENT_NULL;
1363 }
1364
1365 for (i = BS_MAXPRI;
1366 i >= BS_MINPRI; i--) {
1367 int start_index;
1368
1369 if ((ps_select_array[i] == BS_NOPRI) ||
1370 (ps_select_array[i] == BS_FULLPRI))
1371 continue;
1372 start_index = ps_select_array[i];
1373
1374 if(!(paging_segments[start_index])) {
1375 j = start_index+1;
1376 physical_transfer_cluster_count = 0;
1377 }
1378 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1379 (((paging_segments[start_index])->ps_clshift)
1380 + vm_page_shift))) {
1381 physical_transfer_cluster_count = 0;
1382 j = start_index + 1;
1383 } else {
1384 physical_transfer_cluster_count+=1;
1385 j = start_index;
1386 if(start_index == 0)
1387 start_index = paging_segment_max;
1388 else
1389 start_index = start_index - 1;
1390 }
1391
1392 while (1) {
1393 if (j > paging_segment_max)
1394 j = 0;
1395 if ((ps = paging_segments[j]) &&
1396 (ps->ps_bs->bs_priority == i)) {
1397 /*
1398 * Force the ps cluster size to be
1399 * >= that of the vstruct.
1400 */
1401 PS_LOCK(ps);
1402 if (ps->ps_going_away) {
1403 /* this segment is being turned off */
1404 } else if ((ps->ps_clcount) &&
1405 (ps->ps_clshift >= shift)) {
1406 ipc_port_t trigger = IP_NULL;
1407
1408 ps->ps_clcount--;
1409 dp_pages_free -= 1 << ps->ps_clshift;
1410 if(min_pages_trigger_port &&
1411 (dp_pages_free <
1412 minimum_pages_remaining)) {
1413 trigger = min_pages_trigger_port;
1414 min_pages_trigger_port = NULL;
1415 }
1416 PS_UNLOCK(ps);
1417 /*
1418 * found one, quit looking.
1419 */
1420 ps_select_array[i] = j;
1421 PSL_UNLOCK();
1422
1423 if (trigger != IP_NULL) {
1424 default_pager_space_alert(
1425 trigger,
1426 HI_WAT_ALERT);
1427 ipc_port_release_send(trigger);
1428 }
1429 *psindex = j;
1430 return ps;
1431 }
1432 PS_UNLOCK(ps);
1433 }
1434 if (j == start_index) {
1435 /*
1436 * none at this priority -- mark it full
1437 */
1438 ps_select_array[i] = BS_FULLPRI;
1439 break;
1440 }
1441 j++;
1442 }
1443 }
1444 PSL_UNLOCK();
1445 return PAGING_SEGMENT_NULL;
1446 }
1447
1448 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1449
1450 vm_offset_t
1451 ps_allocate_cluster(
1452 vstruct_t vs,
1453 int *psindex,
1454 paging_segment_t use_ps)
1455 {
1456 unsigned int byte_num;
1457 int bit_num = 0;
1458 paging_segment_t ps;
1459 vm_offset_t cluster;
1460 ipc_port_t trigger = IP_NULL;
1461
1462 /*
1463 * Find best paging segment.
1464 * ps_select_segment will decrement cluster count on ps.
1465 * Must pass cluster shift to find the most appropriate segment.
1466 */
1467 /* NOTE: The addition of paging segment delete capability threatened
1468 * to seriously complicate the treatment of paging segments in this
1469 * module and the ones that call it (notably ps_clmap), because of the
1470 * difficulty in assuring that the paging segment would continue to
1471 * exist between being unlocked and locked. This was
1472 * avoided because all calls to this module originate either from
1473 * dp_memory_object calls, which rely on the vs lock, or from
1474 * the transfer function which is part of the segment delete path.
1475 * The transfer function which is part of paging segment delete is
1476 * protected from multiple callers by the backing store lock.
1477 * The paging segment delete function treats mappings to a paging
1478 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1479 * while data is transferred to the remaining segments. This is in
1480 * line with the view that incomplete or in-transition mappings between
1481 * data, a vstruct, and backing store are protected by the vs lock.
1482 * This and the ordering of the paging segment "going_away" bit setting
1483 * protects us.
1484 */
1485 if (use_ps != PAGING_SEGMENT_NULL) {
1486 ps = use_ps;
1487 PSL_LOCK();
1488 PS_LOCK(ps);
1489
1490 ASSERT(ps->ps_clcount != 0);
1491
1492 ps->ps_clcount--;
1493 dp_pages_free -= 1 << ps->ps_clshift;
1494 if(min_pages_trigger_port &&
1495 (dp_pages_free < minimum_pages_remaining)) {
1496 trigger = min_pages_trigger_port;
1497 min_pages_trigger_port = NULL;
1498 }
1499 PSL_UNLOCK();
1500 PS_UNLOCK(ps);
1501 if (trigger != IP_NULL) {
1502 default_pager_space_alert(trigger, HI_WAT_ALERT);
1503 ipc_port_release_send(trigger);
1504 }
1505
1506 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1507 PAGING_SEGMENT_NULL) {
1508 static uint32_t lastnotify = 0;
1509 uint32_t now, nanoseconds_dummy;
1510
1511 /*
1512 * Emit a notification of the low-paging resource condition
1513 * but don't issue it more than once every five seconds. This
1514 * prevents us from overflowing logs with thousands of
1515 * repetitions of the message.
1516 */
1517 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1518 if (now > lastnotify + 5) {
1519 dprintf(("no space in available paging segments\n"));
1520 lastnotify = now;
1521 }
1522
1523 /* the count may have drifted; reset it to zero */
1524 PSL_LOCK();
1525 dp_pages_free = 0;
1526 if(min_pages_trigger_port) {
1527 trigger = min_pages_trigger_port;
1528 min_pages_trigger_port = NULL;
1529 bs_low = TRUE;
1530 }
1531 PSL_UNLOCK();
1532 if (trigger != IP_NULL) {
1533 default_pager_space_alert(trigger, HI_WAT_ALERT);
1534 ipc_port_release_send(trigger);
1535 }
1536 return (vm_offset_t) -1;
1537 }
1538
1539 /*
1540 * Look for an available cluster. At the end of the loop,
1541 * byte_num is the byte offset and bit_num is the bit offset of the
1542 * first zero bit in the paging segment bitmap.
1543 */
1544 PS_LOCK(ps);
1545 byte_num = ps->ps_hint;
1546 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1547 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1548 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1549 if (isclr((ps->ps_bmap + byte_num), bit_num))
1550 break;
1551 }
1552 ASSERT(bit_num != NBBY);
1553 break;
1554 }
1555 }
1556 ps->ps_hint = byte_num;
1557 cluster = (byte_num*NBBY) + bit_num;
1558
1559 /* Space was reserved, so this must be true */
1560 ASSERT(cluster < ps->ps_ncls);
1561
1562 setbit(ps->ps_bmap, cluster);
1563 PS_UNLOCK(ps);
1564
1565 return cluster;
1566 }
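/*
 * Worked example for the bitmap scan above: NBBY is 8, so if the first
 * byte that is not BYTEMASK is byte 5 and its first clear bit is bit 3,
 * the chosen cluster is 5 * 8 + 3 == 43; setbit() then marks that
 * cluster allocated.
 */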
1567
1568 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1569
1570 void
1571 ps_deallocate_cluster(
1572 paging_segment_t ps,
1573 vm_offset_t cluster)
1574 {
1575
1576 if (cluster >= (vm_offset_t) ps->ps_ncls)
1577 panic("ps_deallocate_cluster: Invalid cluster number");
1578
1579 /*
1580 * Lock the paging segment, clear the cluster's bit in the bitmap and
1581 * increment the number of free clusters.
1582 */
1583 PSL_LOCK();
1584 PS_LOCK(ps);
1585 clrbit(ps->ps_bmap, cluster);
1586 ++ps->ps_clcount;
1587 dp_pages_free += 1 << ps->ps_clshift;
1588 PSL_UNLOCK();
1589
1590 /*
1591 * Move the hint down to the freed cluster if it is
1592 * less than the current hint.
1593 */
1594 if ((cluster/NBBY) < ps->ps_hint) {
1595 ps->ps_hint = (cluster/NBBY);
1596 }
1597
1598 PS_UNLOCK(ps);
1599
1600 /*
1601 * If we're freeing space on a full priority, reset the array.
1602 */
1603 PSL_LOCK();
1604 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1605 ps_select_array[ps->ps_bs->bs_priority] = 0;
1606 PSL_UNLOCK();
1607
1608 return;
1609 }
1610
1611 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1612
1613 void
1614 ps_dealloc_vsmap(
1615 struct vs_map *vsmap,
1616 vm_size_t size)
1617 {
1618 unsigned int i;
1619 for (i = 0; i < size; i++)
1620 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1621 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1622 VSM_CLOFF(vsmap[i]));
1623 }
1624
1625 void
1626 ps_vstruct_dealloc(
1627 vstruct_t vs)
1628 {
1629 unsigned int i;
1630 // spl_t s;
1631
1632 VS_MAP_LOCK(vs);
1633
1634 /*
1635 * If this is an indirect structure, then we walk through the valid
1636 * (non-zero) indirect pointers and deallocate the clusters
1637 * associated with each used map entry (via ps_dealloc_vsmap).
1638 * When all of the clusters in an indirect block have been
1639 * freed, we deallocate the block. When all of the indirect
1640 * blocks have been deallocated we deallocate the memory
1641 * holding the indirect pointers.
1642 */
1643 if (vs->vs_indirect) {
1644 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1645 if (vs->vs_imap[i] != NULL) {
1646 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1647 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1648 }
1649 }
1650 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1651 } else {
1652 /*
1653 * Direct map. Free used clusters, then memory.
1654 */
1655 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1656 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1657 }
1658 VS_MAP_UNLOCK(vs);
1659
1660 bs_commit(- vs->vs_size);
1661
1662 zfree(vstruct_zone, vs);
1663 }
1664
1665 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1666
1667 int ps_map_extend(
1668 vstruct_t vs,
1669 unsigned int new_size)
1670 {
1671 struct vs_map **new_imap;
1672 struct vs_map *new_dmap = NULL;
1673 int newdsize;
1674 int i;
1675 void *old_map = NULL;
1676 int old_map_size = 0;
1677
1678 if (vs->vs_size >= new_size) {
1679 /*
1680 * Someone has already done the work.
1681 */
1682 return 0;
1683 }
1684
1685 /*
1686 * If the new size extends into the indirect range, then we have one
1687 * of two cases: we are going from indirect to indirect, or we are
1688 * going from direct to indirect. If we are going from indirect to
1689 * indirect, then it is possible that the new size will fit in the old
1690 * indirect map. If this is the case, then just reset the size of the
1691 * vstruct map and we are done. If the new size will not
1692 * fit into the old indirect map, then we have to allocate a new
1693 * indirect map and copy the old map pointers into this new map.
1694 *
1695 * If we are going from direct to indirect, then we have to allocate a
1696 * new indirect map and copy the old direct pages into the first
1697 * indirect page of the new map.
1698 * NOTE: allocating memory here is dangerous, as we're in the
1699 * pageout path.
1700 */
1701 if (INDIRECT_CLMAP(new_size)) {
1702 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1703
1704 /*
1705 * Get a new indirect map and zero it.
1706 */
1707 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1708 if (vs->vs_indirect &&
1709 (new_map_size == old_map_size)) {
1710 bs_commit(new_size - vs->vs_size);
1711 vs->vs_size = new_size;
1712 return 0;
1713 }
1714
1715 new_imap = (struct vs_map **)kalloc(new_map_size);
1716 if (new_imap == NULL) {
1717 return -1;
1718 }
1719 memset(new_imap, 0, new_map_size);
1720
1721 if (vs->vs_indirect) {
1722 /* Copy old entries into new map */
1723 memcpy(new_imap, vs->vs_imap, old_map_size);
1724 /* Arrange to free the old map */
1725 old_map = (void *) vs->vs_imap;
1726 newdsize = 0;
1727 } else { /* Old map was a direct map */
1728 /* Allocate an indirect page */
1729 if ((new_imap[0] = (struct vs_map *)
1730 kalloc(CLMAP_THRESHOLD)) == NULL) {
1731 kfree(new_imap, new_map_size);
1732 return -1;
1733 }
1734 new_dmap = new_imap[0];
1735 newdsize = CLMAP_ENTRIES;
1736 }
1737 } else {
1738 new_imap = NULL;
1739 newdsize = new_size;
1740 /*
1741 * If the new map is a direct map, then the old map must
1742 * also have been a direct map. All we have to do is
1743 * to allocate a new direct map, copy the old entries
1744 * into it and free the old map.
1745 */
1746 if ((new_dmap = (struct vs_map *)
1747 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1748 return -1;
1749 }
1750 }
1751 if (newdsize) {
1752
1753 /* Free the old map */
1754 old_map = (void *) vs->vs_dmap;
1755 old_map_size = CLMAP_SIZE(vs->vs_size);
1756
1757 /* Copy info from the old map into the new map */
1758 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1759
1760 /* Initialize the rest of the new map */
1761 for (i = vs->vs_size; i < newdsize; i++)
1762 VSM_CLR(new_dmap[i]);
1763 }
1764 if (new_imap) {
1765 vs->vs_imap = new_imap;
1766 vs->vs_indirect = TRUE;
1767 } else
1768 vs->vs_dmap = new_dmap;
1769 bs_commit(new_size - vs->vs_size);
1770 vs->vs_size = new_size;
1771 if (old_map)
1772 kfree(old_map, old_map_size);
1773 return 0;
1774 }
1775
1776 vm_offset_t
1777 ps_clmap(
1778 vstruct_t vs,
1779 vm_offset_t offset,
1780 struct clmap *clmap,
1781 int flag,
1782 vm_size_t size,
1783 int error)
1784 {
1785 vm_offset_t cluster; /* The cluster of offset. */
1786 vm_offset_t newcl; /* The new cluster allocated. */
1787 vm_offset_t newoff;
1788 unsigned int i;
1789 struct vs_map *vsmap;
1790
1791 VS_MAP_LOCK(vs);
1792
1793 ASSERT(vs->vs_dmap);
1794 cluster = atop_32(offset) >> vs->vs_clshift;
1795
1796 /*
1797 * Initialize cluster error value
1798 */
1799 clmap->cl_error = 0;
1800
1801 /*
1802 * If the object has grown, extend the page map.
1803 */
1804 if (cluster >= vs->vs_size) {
1805 if (flag == CL_FIND) {
1806 /* Do not allocate if just doing a lookup */
1807 VS_MAP_UNLOCK(vs);
1808 return (vm_offset_t) -1;
1809 }
1810 if (ps_map_extend(vs, cluster + 1)) {
1811 VS_MAP_UNLOCK(vs);
1812 return (vm_offset_t) -1;
1813 }
1814 }
1815
1816 /*
1817 * Look for the desired cluster. If the map is indirect, then we
1818 * have a two level lookup. First find the indirect block, then
1819 * find the actual cluster. If the indirect block has not yet
1820 * been allocated, then do so. If the cluster has not yet been
1821 * allocated, then do so.
1822 *
1823 * If any of the allocations fail, then return an error.
1824 * Don't allocate if just doing a lookup.
1825 */
1826 if (vs->vs_indirect) {
1827 long ind_block = cluster/CLMAP_ENTRIES;
1828
1829 /* Is the indirect block allocated? */
1830 vsmap = vs->vs_imap[ind_block];
1831 if (vsmap == NULL) {
1832 if (flag == CL_FIND) {
1833 VS_MAP_UNLOCK(vs);
1834 return (vm_offset_t) -1;
1835 }
1836
1837 /* Allocate the indirect block */
1838 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1839 if (vsmap == NULL) {
1840 VS_MAP_UNLOCK(vs);
1841 return (vm_offset_t) -1;
1842 }
1843 /* Initialize the cluster offsets */
1844 for (i = 0; i < CLMAP_ENTRIES; i++)
1845 VSM_CLR(vsmap[i]);
1846 vs->vs_imap[ind_block] = vsmap;
1847 }
1848 } else
1849 vsmap = vs->vs_dmap;
1850
1851 ASSERT(vsmap);
1852 vsmap += cluster%CLMAP_ENTRIES;
1853
1854 /*
1855 * At this point, vsmap points to the struct vs_map desired.
1856 *
1857 * Look in the map for the cluster, if there was an error on a
1858 * previous write, flag it and return. If it is not yet
1859 * allocated, then allocate it, if we're writing; if we're
1860 * doing a lookup and the cluster's not allocated, return error.
1861 */
1862 if (VSM_ISERR(*vsmap)) {
1863 clmap->cl_error = VSM_GETERR(*vsmap);
1864 VS_MAP_UNLOCK(vs);
1865 return (vm_offset_t) -1;
1866 } else if (VSM_ISCLR(*vsmap)) {
1867 int psindex;
1868
1869 if (flag == CL_FIND) {
1870 /*
1871 * If there's an error and the entry is clear, then
1872 * we've run out of swap space. Record the error
1873 * here and return.
1874 */
1875 if (error) {
1876 VSM_SETERR(*vsmap, error);
1877 }
1878 VS_MAP_UNLOCK(vs);
1879 return (vm_offset_t) -1;
1880 } else {
1881 /*
1882 * Attempt to allocate a cluster from the paging segment
1883 */
1884 newcl = ps_allocate_cluster(vs, &psindex,
1885 PAGING_SEGMENT_NULL);
1886 if (newcl == (vm_offset_t) -1) {
1887 VS_MAP_UNLOCK(vs);
1888 return (vm_offset_t) -1;
1889 }
1890 VSM_CLR(*vsmap);
1891 VSM_SETCLOFF(*vsmap, newcl);
1892 VSM_SETPS(*vsmap, psindex);
1893 }
1894 } else
1895 newcl = VSM_CLOFF(*vsmap);
1896
1897 /*
1898 * Fill in pertinent fields of the clmap
1899 */
1900 clmap->cl_ps = VSM_PS(*vsmap);
1901 clmap->cl_numpages = VSCLSIZE(vs);
1902 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1903
1904 /*
1905 * Byte offset in paging segment is byte offset to cluster plus
1906 * byte offset within cluster. It looks ugly, but should be
1907 * relatively quick.
1908 */
1909 ASSERT(trunc_page(offset) == offset);
1910 newcl = ptoa_32(newcl) << vs->vs_clshift;
1911 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
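/*
 * Worked example for the two lines above (assuming 4 KB pages and a
 * cluster shift of 2, i.e. 16 KB clusters): if the map entry places the
 * data in paging-segment cluster 1, newcl == ptoa_32(1) << 2 == 0x4000
 * is that cluster's byte offset in the segment, and for offset 0x6000
 * the in-cluster byte offset is newoff == 0x6000 & 0x3fff == 0x2000.
 */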
1912 if (flag == CL_ALLOC) {
1913 /*
1914 * set bits in the allocation bitmap according to which
1915 * pages were requested. size is in bytes.
1916 */
1917 i = atop_32(newoff);
1918 while ((size > 0) && (i < VSCLSIZE(vs))) {
1919 VSM_SETALLOC(*vsmap, i);
1920 i++;
1921 size -= vm_page_size;
1922 }
1923 }
1924 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1925 if (newoff) {
1926 /*
1927 * Offset is not cluster aligned, so number of pages
1928 * and bitmaps must be adjusted
1929 */
1930 clmap->cl_numpages -= atop_32(newoff);
1931 CLMAP_SHIFT(clmap, vs);
1932 CLMAP_SHIFTALLOC(clmap, vs);
1933 }
1934
1935 /*
1936 *
1937 * The setting of valid bits and handling of write errors
1938 * must be done here, while we hold the lock on the map.
1939 * It logically should be done in ps_vs_write_complete().
1940 * The size and error information has been passed from
1941 * ps_vs_write_complete(). If the size parameter is non-zero,
1942 * then there is work to be done. If error is also non-zero,
1943 * then the error number is recorded in the cluster and the
1944 * entire cluster is in error.
1945 */
1946 if (size && flag == CL_FIND) {
1947 vm_offset_t off = (vm_offset_t) 0;
1948
1949 if (!error) {
1950 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1951 i++) {
1952 VSM_SETPG(*vsmap, i);
1953 size -= vm_page_size;
1954 }
1955 ASSERT(i <= VSCLSIZE(vs));
1956 } else {
1957 BS_STAT(clmap->cl_ps->ps_bs,
1958 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1959 atop_32(size));
1960 off = VSM_CLOFF(*vsmap);
1961 VSM_SETERR(*vsmap, error);
1962 }
1963 /*
1964 * Deallocate cluster if error, and no valid pages
1965 * already present.
1966 */
1967 if (off != (vm_offset_t) 0)
1968 ps_deallocate_cluster(clmap->cl_ps, off);
1969 VS_MAP_UNLOCK(vs);
1970 return (vm_offset_t) 0;
1971 } else
1972 VS_MAP_UNLOCK(vs);
1973
1974 DP_DEBUG(DEBUG_VS_INTERNAL,
1975 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1976 newcl+newoff, (int) vs, (int) vsmap, flag));
1977 DP_DEBUG(DEBUG_VS_INTERNAL,
1978 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1979 (int) clmap->cl_ps, clmap->cl_numpages,
1980 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1981
1982 return (newcl + newoff);
1983 }
1984
1985 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1986
1987 void
1988 ps_clunmap(
1989 vstruct_t vs,
1990 vm_offset_t offset,
1991 vm_size_t length)
1992 {
1993 vm_offset_t cluster; /* The cluster number of offset */
1994 struct vs_map *vsmap;
1995
1996 VS_MAP_LOCK(vs);
1997
1998 /*
1999 * Loop through all clusters in this range, freeing paging segment
2000 * clusters and map entries as encountered.
2001 */
2002 while (length > 0) {
2003 vm_offset_t newoff;
2004 unsigned int i;
2005
2006 cluster = atop_32(offset) >> vs->vs_clshift;
2007 if (vs->vs_indirect) /* indirect map */
2008 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2009 else
2010 vsmap = vs->vs_dmap;
2011 if (vsmap == NULL) {
2012 VS_MAP_UNLOCK(vs);
2013 return;
2014 }
2015 vsmap += cluster%CLMAP_ENTRIES;
2016 if (VSM_ISCLR(*vsmap)) {
2017 length -= vm_page_size;
2018 offset += vm_page_size;
2019 continue;
2020 }
2021 /*
2022 * We've got a valid mapping. Clear it and deallocate
2023 * paging segment cluster pages.
2024 * Optimize for entire cluster clearing.
2025 */
2026 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2027 /*
2028 * Not cluster aligned.
2029 */
2030 ASSERT(trunc_page(newoff) == newoff);
2031 i = atop_32(newoff);
2032 } else
2033 i = 0;
2034 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2035 VSM_CLRPG(*vsmap, i);
2036 VSM_CLRALLOC(*vsmap, i);
2037 length -= vm_page_size;
2038 offset += vm_page_size;
2039 i++;
2040 }
2041
2042 /*
2043 * If map entry is empty, clear and deallocate cluster.
2044 */
2045 if (!VSM_ALLOC(*vsmap)) {
2046 ps_deallocate_cluster(VSM_PS(*vsmap),
2047 VSM_CLOFF(*vsmap));
2048 VSM_CLR(*vsmap);
2049 }
2050 }
2051
2052 VS_MAP_UNLOCK(vs);
2053 }
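/*
 * A minimal standalone sketch of the "clear per-page bits, then free the
 * cluster once its allocation bitmap goes empty" pattern used above.  The
 * 8-bit map and page counts are assumptions for illustration; the real
 * vs_map state is manipulated through the VSM_CLRPG/VSM_CLRALLOC/VSM_ALLOC
 * macros, whose layout is not shown here.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint8_t alloc_map = 0xFF;		/* all 8 pages allocated     */

	/* unmap pages 2..7: the cluster still holds pages 0 and 1 */
	for (unsigned i = 2; i < 8; i++)
		alloc_map &= (uint8_t)~(1u << i);
	assert(alloc_map == 0x03);

	/* unmap the rest: a now-empty cluster would be deallocated */
	for (unsigned i = 0; i < 2; i++)
		alloc_map &= (uint8_t)~(1u << i);
	if (alloc_map == 0)
		printf("empty cluster: would call ps_deallocate_cluster()\n");
	return 0;
}
#endif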
2054
2055 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2056
2057 void
2058 ps_vs_write_complete(
2059 vstruct_t vs,
2060 vm_offset_t offset,
2061 vm_size_t size,
2062 int error)
2063 {
2064 struct clmap clmap;
2065
2066 /*
2067 * Get the struct vsmap for this cluster.
2068 * Use CL_FIND, even though the cluster was just written, because the
2069 * cluster MUST be present, unless there was an error
2070 * in the original ps_clmap (e.g. no space), in which
2071 * case, nothing happens.
2072 *
2073 * Must pass enough information to ps_clmap to allow it
2074 * to set the vs_map structure bitmap under lock.
2075 */
2076 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2077 }
2078
2079 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2080
2081 void
2082 vs_cl_write_complete(
2083 vstruct_t vs,
2084 __unused paging_segment_t ps,
2085 vm_offset_t offset,
2086 __unused vm_offset_t addr,
2087 vm_size_t size,
2088 boolean_t async,
2089 int error)
2090 {
2091 // kern_return_t kr;
2092
2093 if (error) {
2094 /*
2095 * For internal objects, the error is recorded on a
2096 * per-cluster basis by ps_clmap() which is called
2097 * by ps_vs_write_complete() below.
2098 */
2099 dprintf(("write failed error = 0x%x\n", error));
2100 /* add upl_abort code here */
2101 } else
2102 GSTAT(global_stats.gs_pages_out += atop_32(size));
2103 /*
2104 * Notify the vstruct mapping code, so it can do its accounting.
2105 */
2106 ps_vs_write_complete(vs, offset, size, error);
2107
2108 if (async) {
2109 VS_LOCK(vs);
2110 ASSERT(vs->vs_async_pending > 0);
2111 vs->vs_async_pending -= size;
2112 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2113 vs->vs_waiting_async = FALSE;
2114 VS_UNLOCK(vs);
2115 /* mutex_unlock(&vs->vs_waiting_async); */
2116 thread_wakeup(&vs->vs_async_pending);
2117 } else {
2118 VS_UNLOCK(vs);
2119 }
2120 }
2121 }
2122
2123 #ifdef DEVICE_PAGING
2124 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2125
2126 kern_return_t
2127 device_write_reply(
2128 MACH_PORT_FACE reply_port,
2129 kern_return_t device_code,
2130 io_buf_len_t bytes_written)
2131 {
2132 struct vs_async *vsa;
2133
2134 vsa = (struct vs_async *)
2135 ((struct vstruct_alias *)(reply_port->alias))->vs;
2136
2137 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2138 device_code = KERN_FAILURE;
2139 }
2140
2141 vsa->vsa_error = device_code;
2142
2143
2144 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2145 if(vsa->vsa_flags & VSA_TRANSFER) {
2146 /* revisit when async disk segments redone */
2147 if(vsa->vsa_error) {
2148 /* need to consider error condition. re-write data or */
2149 /* throw it away here. */
2150 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2151 }
2152 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2153 vsa->vsa_size, vsa->vsa_error);
2154 } else {
2155 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2156 vsa->vsa_addr, vsa->vsa_size, TRUE,
2157 vsa->vsa_error);
2158 }
2159 VS_FREE_ASYNC(vsa);
2160
2161 return KERN_SUCCESS;
2162 }
2163
2164 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2165 kern_return_t
2166 device_write_reply_inband(
2167 MACH_PORT_FACE reply_port,
2168 kern_return_t return_code,
2169 io_buf_len_t bytes_written)
2170 {
2171 panic("device_write_reply_inband: illegal");
2172 return KERN_SUCCESS;
2173 }
2174
2175 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2176 kern_return_t
2177 device_read_reply(
2178 MACH_PORT_FACE reply_port,
2179 kern_return_t return_code,
2180 io_buf_ptr_t data,
2181 mach_msg_type_number_t dataCnt)
2182 {
2183 struct vs_async *vsa;
2184 vsa = (struct vs_async *)
2185 ((struct vstruct_alias *)(reply_port->alias))->vs;
2186 vsa->vsa_addr = (vm_offset_t)data;
2187 vsa->vsa_size = (vm_size_t)dataCnt;
2188 vsa->vsa_error = return_code;
2189 thread_wakeup(&vsa->vsa_lock);
2190 return KERN_SUCCESS;
2191 }
2192
2193 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2194 kern_return_t
2195 device_read_reply_inband(
2196 MACH_PORT_FACE reply_port,
2197 kern_return_t return_code,
2198 io_buf_ptr_inband_t data,
2199 mach_msg_type_number_t dataCnt)
2200 {
2201 panic("device_read_reply_inband: illegal");
2202 return KERN_SUCCESS;
2203 }
2204
2205 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2206 kern_return_t
2207 device_read_reply_overwrite(
2208 MACH_PORT_FACE reply_port,
2209 kern_return_t return_code,
2210 io_buf_len_t bytes_read)
2211 {
2212 panic("device_read_reply_overwrite: illegal\n");
2213 return KERN_SUCCESS;
2214 }
2215
2216 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2217 kern_return_t
2218 device_open_reply(
2219 MACH_PORT_FACE reply_port,
2220 kern_return_t return_code,
2221 MACH_PORT_FACE device_port)
2222 {
2223 panic("device_open_reply: illegal\n");
2224 return KERN_SUCCESS;
2225 }
2226
2227 kern_return_t
2228 ps_read_device(
2229 paging_segment_t ps,
2230 vm_offset_t offset,
2231 vm_offset_t *bufferp,
2232 unsigned int size,
2233 unsigned int *residualp,
2234 int flags)
2235 {
2236 kern_return_t kr;
2237 recnum_t dev_offset;
2238 unsigned int bytes_wanted;
2239 unsigned int bytes_read;
2240 unsigned int total_read;
2241 vm_offset_t dev_buffer;
2242 vm_offset_t buf_ptr;
2243 unsigned int records_read;
2244 struct vs_async *vsa;
2245 mutex_t vs_waiting_read_reply;
2246
2247 device_t device;
2248 vm_map_copy_t device_data = NULL;
2249 default_pager_thread_t *dpt = NULL;
2250
2251 device = dev_port_lookup(ps->ps_device);
2252 clustered_reads[atop_32(size)]++;
2253
2254 dev_offset = (ps->ps_offset +
2255 (offset >> (vm_page_shift - ps->ps_record_shift)));
2256 bytes_wanted = size;
2257 total_read = 0;
2258 *bufferp = (vm_offset_t)NULL;
2259
2260 do {
2261 vsa = VS_ALLOC_ASYNC();
2262 if (vsa) {
2263 vsa->vsa_vs = NULL;
2264 vsa->vsa_addr = 0;
2265 vsa->vsa_offset = 0;
2266 vsa->vsa_size = 0;
2267 vsa->vsa_ps = NULL;
2268 }
2269 mutex_init(&vsa->vsa_lock, 0);
2270 ip_lock(vsa->reply_port);
2271 vsa->reply_port->ip_sorights++;
2272 ip_reference(vsa->reply_port);
2273 ip_unlock(vsa->reply_port);
2274 kr = ds_device_read_common(device,
2275 vsa->reply_port,
2276 (mach_msg_type_name_t)
2277 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2278 (dev_mode_t) 0,
2279 dev_offset,
2280 bytes_wanted,
2281 (IO_READ | IO_CALL),
2282 (io_buf_ptr_t *) &dev_buffer,
2283 (mach_msg_type_number_t *) &bytes_read);
2284 if(kr == MIG_NO_REPLY) {
2285 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2286 thread_block(THREAD_CONTINUE_NULL);
2287
2288 dev_buffer = vsa->vsa_addr;
2289 bytes_read = (unsigned int)vsa->vsa_size;
2290 kr = vsa->vsa_error;
2291 }
2292 VS_FREE_ASYNC(vsa);
2293 if (kr != KERN_SUCCESS || bytes_read == 0) {
2294 break;
2295 }
2296 total_read += bytes_read;
2297
2298 /*
2299 * If we got the entire range, use the returned dev_buffer.
2300 */
2301 if (bytes_read == size) {
2302 *bufferp = (vm_offset_t)dev_buffer;
2303 break;
2304 }
2305
2306 #if 1
2307 dprintf(("read only %d bytes out of %d\n",
2308 bytes_read, bytes_wanted));
2309 #endif
2310 if(dpt == NULL) {
2311 dpt = get_read_buffer();
2312 buf_ptr = dpt->dpt_buffer;
2313 *bufferp = (vm_offset_t)buf_ptr;
2314 }
2315 /*
2316 * Otherwise, copy the data into the provided buffer (*bufferp)
2317 * and append the rest of the range as it comes in.
2318 */
2319 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2320 buf_ptr += bytes_read;
2321 bytes_wanted -= bytes_read;
2322 records_read = (bytes_read >>
2323 (vm_page_shift - ps->ps_record_shift));
2324 dev_offset += records_read;
2325 DP_DEBUG(DEBUG_VS_INTERNAL,
2326 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2327 dev_buffer, bytes_read));
2328 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2329 != KERN_SUCCESS)
2330 Panic("dealloc buf");
2331 } while (bytes_wanted);
2332
2333 *residualp = size - total_read;
2334 if((dev_buffer != *bufferp) && (total_read != 0)) {
2335 vm_offset_t temp_buffer;
2336 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2337 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2338 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2339 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2340 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2341 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2342 (vm_map_copy_t *)&device_data, FALSE))
2343 panic("ps_read_device: cannot copyin locally provided buffer\n");
2344 }
2345 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2346 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2347 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2348 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2349 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2350 (vm_map_copy_t *)&device_data, FALSE))
2351 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2352 }
2353 else {
2354 device_data = NULL;
2355 }
2356 *bufferp = (vm_offset_t)device_data;
2357
2358 if(dpt != NULL) {
2359 /* Free the receive buffer */
2360 dpt->checked_out = 0;
2361 thread_wakeup(&dpt_array);
2362 }
2363 return KERN_SUCCESS;
2364 }
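/*
 * A minimal standalone sketch of the partial-read reassembly loop above:
 * each chunk returned by the device is appended to the caller's buffer
 * and the running totals are advanced until the whole range has arrived.
 * The fake 3-bytes-per-call device and the buffer sizes are assumptions
 * for illustration only.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for a device that returns at most 3 bytes per request. */
static size_t
fake_device_read(const char *src, size_t want, char *dst)
{
	size_t got = want < 3 ? want : 3;
	memcpy(dst, src, got);
	return got;
}

int
main(void)
{
	const char source[] = "0123456789";
	char   buffer[16];
	size_t bytes_wanted = 10, total_read = 0;

	/* same shape as the do/while above: append each partial chunk */
	while (bytes_wanted > 0) {
		size_t got = fake_device_read(source + total_read,
		    bytes_wanted, buffer + total_read);
		if (got == 0)
			break;
		total_read   += got;
		bytes_wanted -= got;
	}
	buffer[total_read] = '\0';
	printf("reassembled \"%s\" (%zu bytes)\n", buffer, total_read);
	assert(total_read == 10);
	return 0;
}
#endif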
2365
2366 kern_return_t
2367 ps_write_device(
2368 paging_segment_t ps,
2369 vm_offset_t offset,
2370 vm_offset_t addr,
2371 unsigned int size,
2372 struct vs_async *vsa)
2373 {
2374 recnum_t dev_offset;
2375 io_buf_len_t bytes_to_write, bytes_written;
2376 recnum_t records_written;
2377 kern_return_t kr;
2378 MACH_PORT_FACE reply_port;
2379
2380
2381
2382 clustered_writes[atop_32(size)]++;
2383
2384 dev_offset = (ps->ps_offset +
2385 (offset >> (vm_page_shift - ps->ps_record_shift)));
2386 bytes_to_write = size;
2387
2388 if (vsa) {
2389 /*
2390 * Asynchronous write.
2391 */
2392 reply_port = vsa->reply_port;
2393 ip_lock(reply_port);
2394 reply_port->ip_sorights++;
2395 ip_reference(reply_port);
2396 ip_unlock(reply_port);
2397 {
2398 device_t device;
2399 device = dev_port_lookup(ps->ps_device);
2400
2401 vsa->vsa_addr = addr;
2402 kr=ds_device_write_common(device,
2403 reply_port,
2404 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2405 (dev_mode_t) 0,
2406 dev_offset,
2407 (io_buf_ptr_t) addr,
2408 size,
2409 (IO_WRITE | IO_CALL),
2410 &bytes_written);
2411 }
2412 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2413 if (verbose)
2414 dprintf(("%s0x%x, addr=0x%x,"
2415 "size=0x%x,offset=0x%x\n",
2416 "device_write_request returned ",
2417 kr, addr, size, offset));
2418 BS_STAT(ps->ps_bs,
2419 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2420 /* do the completion notification to free resources */
2421 device_write_reply(reply_port, kr, 0);
2422 return PAGER_ERROR;
2423 }
2424 } else do {
2425 /*
2426 * Synchronous write.
2427 */
2428 {
2429 device_t device;
2430 device = dev_port_lookup(ps->ps_device);
2431 kr=ds_device_write_common(device,
2432 IP_NULL, 0,
2433 (dev_mode_t) 0,
2434 dev_offset,
2435 (io_buf_ptr_t) addr,
2436 size,
2437 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2438 &bytes_written);
2439 }
2440 if (kr != KERN_SUCCESS) {
2441 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2442 "device_write returned ",
2443 kr, addr, size, offset));
2444 BS_STAT(ps->ps_bs,
2445 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2446 return PAGER_ERROR;
2447 }
2448 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2449 Panic("fragmented write");
2450 records_written = (bytes_written >>
2451 (vm_page_shift - ps->ps_record_shift));
2452 dev_offset += records_written;
2453 #if 1
2454 if (bytes_written != bytes_to_write) {
2455 dprintf(("wrote only %d bytes out of %d\n",
2456 bytes_written, bytes_to_write));
2457 }
2458 #endif
2459 bytes_to_write -= bytes_written;
2460 addr += bytes_written;
2461 } while (bytes_to_write > 0);
2462
2463 return PAGER_SUCCESS;
2464 }
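/*
 * A minimal standalone sketch of the bytes-to-device-records conversion
 * used above, assuming a 4 KB page and 512-byte device records so that
 * ps_record_shift == log2(4096 / 512) == 3.  The concrete sizes are
 * assumptions for illustration; the pager derives them per segment.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>

int
main(void)
{
	const unsigned vm_page_shift   = 12;	/* 4 KB page (assumed)     */
	const unsigned record_size     = 512;	/* device block (assumed)  */
	const unsigned ps_record_shift = 3;	/* log2(4096 / 512)        */

	unsigned bytes_written   = 8192;
	unsigned records_written = bytes_written >>
	    (vm_page_shift - ps_record_shift);	/* i.e. bytes / 512        */

	printf("%u bytes == %u %u-byte records\n",
	    bytes_written, records_written, record_size);
	assert(records_written == 16);
	return 0;
}
#endif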
2465
2466
2467 #else /* !DEVICE_PAGING */
2468
2469 kern_return_t
2470 ps_read_device(
2471 __unused paging_segment_t ps,
2472 __unused vm_offset_t offset,
2473 __unused vm_offset_t *bufferp,
2474 __unused unsigned int size,
2475 __unused unsigned int *residualp,
2476 __unused int flags)
2477 {
2478 panic("ps_read_device not supported");
2479 return KERN_FAILURE;
2480 }
2481
2482 kern_return_t
2483 ps_write_device(
2484 __unused paging_segment_t ps,
2485 __unused vm_offset_t offset,
2486 __unused vm_offset_t addr,
2487 __unused unsigned int size,
2488 __unused struct vs_async *vsa)
2489 {
2490 panic("ps_write_device not supported");
2491 return KERN_FAILURE;
2492 }
2493
2494 #endif /* DEVICE_PAGING */
2495 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2496
2497 void
2498 pvs_object_data_provided(
2499 __unused vstruct_t vs,
2500 __unused upl_t upl,
2501 __unused upl_offset_t offset,
2502 upl_size_t size)
2503 {
2504
2505 DP_DEBUG(DEBUG_VS_INTERNAL,
2506 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2507 upl, offset, size));
2508
2509 ASSERT(size > 0);
2510 GSTAT(global_stats.gs_pages_in += atop_32(size));
2511
2512
2513 #if USE_PRECIOUS
2514 ps_clunmap(vs, offset, size);
2515 #endif /* USE_PRECIOUS */
2516
2517 }
2518
2519 static memory_object_offset_t last_start;
2520 static vm_size_t last_length;
2521
2522 kern_return_t
2523 pvs_cluster_read(
2524 vstruct_t vs,
2525 vm_offset_t vs_offset,
2526 vm_size_t cnt,
2527 void *fault_info)
2528 {
2529 kern_return_t error = KERN_SUCCESS;
2530 unsigned int size;
2531 unsigned int residual;
2532 unsigned int request_flags;
2533 int seg_index;
2534 int pages_in_cl;
2535 int cl_size;
2536 int cl_mask;
2537 int cl_index;
2538 unsigned int xfer_size;
2539 vm_offset_t orig_vs_offset;
2540 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2541 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2542 struct clmap clmap;
2543 upl_t upl;
2544 unsigned int page_list_count;
2545 memory_object_offset_t start;
2546
2547 pages_in_cl = 1 << vs->vs_clshift;
2548 cl_size = pages_in_cl * vm_page_size;
2549 cl_mask = cl_size - 1;
2550
2551 #if USE_PRECIOUS
2552 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2553 #else
2554 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2555 #endif
2556 cl_index = (vs_offset & cl_mask) / vm_page_size;
2557
2558 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (vm_offset_t)-1) ||
2559 !CLMAP_ISSET(clmap, cl_index)) {
2560 /*
2561 * the needed page doesn't exist in the backing store...
2562 * we don't want to try to do any I/O, just abort the
2563 * page and let the fault handler provide a zero-fill
2564 */
2565 if (cnt == 0) {
2566 /*
2567 * The caller was just poking at us to see if
2568 * the page has been paged out. No need to
2569 * mess with the page at all.
2570 * Just let the caller know we don't have that page.
2571 */
2572 return KERN_FAILURE;
2573 }
2574
2575 page_list_count = 0;
2576
2577 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2578 PAGE_SIZE, PAGE_SIZE,
2579 &upl, NULL, &page_list_count,
2580 request_flags);
2581
2582 if (clmap.cl_error)
2583 upl_abort(upl, UPL_ABORT_ERROR);
2584 else
2585 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2586 upl_deallocate(upl);
2587
2588 return KERN_SUCCESS;
2589 }
2590
2591 if (cnt == 0) {
2592 /*
2593 * The caller was just poking at us to see if
2594 * the page has been paged out. No need to
2595 * mess with the page at all.
2596 * Just let the caller know we do have that page.
2597 */
2598 return KERN_SUCCESS;
2599 }
2600
2601 assert(dp_encryption_inited);
2602 if (dp_encryption) {
2603 /*
2604 * ENCRYPTED SWAP:
2605 * request that the UPL be prepared for
2606 * decryption.
2607 */
2608 request_flags |= UPL_ENCRYPT;
2609 }
2610 orig_vs_offset = vs_offset;
2611
2612 start = (memory_object_offset_t)vs_offset;
2613 assert(cnt != 0);
2614 cnt = VM_SUPER_CLUSTER;
2615
2616 /*
2617 * determine how big a speculative I/O we should try for...
2618 */
2619 if (memory_object_cluster_size(vs->vs_control, &start, &cnt, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
2620 assert(vs_offset >= (vm_offset_t) start &&
2621 vs_offset < (vm_offset_t) (start + cnt));
2622 vs_offset = (vm_offset_t)start;
2623 } else
2624 cnt = PAGE_SIZE;
2625
2626 last_start = start;
2627 last_length = cnt;
2628
2629 /*
2630 * This loop will be executed multiple times until the entire
2631 * range has been looked at or we issue an I/O... if the request spans cluster
2632 * boundaries, the clusters will be checked for logical continuity;
2633 * if contiguous, the I/O request will span multiple clusters...
2634 * at most only 1 I/O will be issued... it will encompass the original offset
2635 */
2636 while (cnt && error == KERN_SUCCESS) {
2637 int ps_info_valid;
2638
2639 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
2640 size = VM_SUPER_CLUSTER;
2641 size -= vs_offset & cl_mask;
2642 } else if (cnt > VM_SUPER_CLUSTER)
2643 size = VM_SUPER_CLUSTER;
2644 else
2645 size = cnt;
2646
2647 cnt -= size;
2648
2649 ps_info_valid = 0;
2650 seg_index = 0;
2651
2652 while (size > 0 && error == KERN_SUCCESS) {
2653 unsigned int abort_size;
2654 int failed_size;
2655 int beg_pseg;
2656 int beg_indx;
2657 vm_offset_t cur_offset;
2658
2659 if ( !ps_info_valid) {
2660 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2661 psp[seg_index] = CLMAP_PS(clmap);
2662 ps_info_valid = 1;
2663 }
2664 /*
2665 * skip over unallocated physical segments
2666 */
2667 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2668 abort_size = cl_size - (vs_offset & cl_mask);
2669 abort_size = MIN(abort_size, size);
2670
2671 size -= abort_size;
2672 vs_offset += abort_size;
2673
2674 seg_index++;
2675 ps_info_valid = 0;
2676
2677 continue;
2678 }
2679 cl_index = (vs_offset & cl_mask) / vm_page_size;
2680
2681 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2682 /*
2683 * skip over unallocated pages
2684 */
2685 if (CLMAP_ISSET(clmap, cl_index))
2686 break;
2687 abort_size += vm_page_size;
2688 }
2689 if (abort_size) {
2690 size -= abort_size;
2691 vs_offset += abort_size;
2692
2693 if (cl_index == pages_in_cl) {
2694 /*
2695 * if we're at the end of this physical cluster
2696 * then bump to the next one and continue looking
2697 */
2698 seg_index++;
2699 ps_info_valid = 0;
2700
2701 continue;
2702 }
2703 if (size == 0)
2704 break;
2705 }
2706 /*
2707 * remember the starting point of the first allocated page
2708 * for the I/O we're about to issue
2709 */
2710 beg_pseg = seg_index;
2711 beg_indx = cl_index;
2712 cur_offset = vs_offset;
2713
2714 /*
2715 * calculate the size of the I/O that we can do...
2716 * this may span multiple physical segments if
2717 * they are contiguous
2718 */
2719 for (xfer_size = 0; xfer_size < size; ) {
2720
2721 while (cl_index < pages_in_cl && xfer_size < size) {
2722 /*
2723 * accumulate allocated pages within
2724 * a physical segment
2725 */
2726 if (CLMAP_ISSET(clmap, cl_index)) {
2727 xfer_size += vm_page_size;
2728 cur_offset += vm_page_size;
2729 cl_index++;
2730
2731 BS_STAT(psp[seg_index]->ps_bs,
2732 psp[seg_index]->ps_bs->bs_pages_in++);
2733 } else
2734 break;
2735 }
2736 if (cl_index < pages_in_cl || xfer_size >= size) {
2737 /*
2738 * we've hit an unallocated page or
2739 * the end of this request... see if
2740 * it's time to fire the I/O
2741 */
2742 break;
2743 }
2744 /*
2745 * we've hit the end of the current physical
2746 * segment and there's more to do, so try
2747 * moving to the next one
2748 */
2749 seg_index++;
2750
2751 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2752 psp[seg_index] = CLMAP_PS(clmap);
2753 ps_info_valid = 1;
2754
2755 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2756 /*
2757 * if the physical segment we're about
2758 * to step into is not contiguous to
2759 * the one we're currently in, or it's
2760 * in a different paging file, or
2761 * it hasn't been allocated....
2762 * we stop this run and go check
2763 * to see if it's time to fire the I/O
2764 */
2765 break;
2766 }
2767 /*
2768 * start with first page of the next physical
2769 * segment
2770 */
2771 cl_index = 0;
2772 }
2773 if (xfer_size == 0) {
2774 /*
2775 * no I/O to generate for this segment
2776 */
2777 continue;
2778 }
2779 if (cur_offset <= orig_vs_offset) {
2780 /*
2781 * we've hit a hole in our speculative cluster
2782 * before the offset that we're really after...
2783 * don't issue the I/O since it doesn't encompass
2784 * the original offset and we're looking to only
2785 * pull in the speculative pages if they can be
2786 * made part of a single I/O
2787 */
2788 size -= xfer_size;
2789 vs_offset += xfer_size;
2790
2791 continue;
2792 }
2793 /*
2794 * we have a contiguous range of allocated pages
2795 * to read from that encompasses the original offset
2796 */
2797 page_list_count = 0;
2798 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2799 xfer_size, xfer_size,
2800 &upl, NULL, &page_list_count,
2801 request_flags | UPL_SET_INTERNAL | UPL_NOBLOCK);
2802
2803 error = ps_read_file(psp[beg_pseg],
2804 upl, (upl_offset_t) 0,
2805 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
2806 xfer_size, &residual, 0);
2807
2808 failed_size = 0;
2809
2810 /*
2811 * Adjust counts and send response to VM. Optimize
2812 * for the common case, i.e. no error and/or partial
2813 * data. If there was an error, then we need to error
2814 * the entire range, even if some data was successfully
2815 * read. If there was a partial read we may supply some
2816 * data and may error some as well. In all cases the
2817 * VM must receive some notification for every page
2818 * in the range.
2819 */
2820 if ((error == KERN_SUCCESS) && (residual == 0)) {
2821 /*
2822 * Got everything we asked for, supply the data
2823 * to the VM. Note that as a side effect of
2824 * supplying the data, the buffer holding the
2825 * supplied data is deallocated from the pager's
2826 * address space.
2827 */
2828 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
2829 } else {
2830 failed_size = xfer_size;
2831
2832 if (error == KERN_SUCCESS) {
2833 if (residual == xfer_size) {
2834 /*
2835 * If a read operation returns no error
2836 * and no data moved, we turn it into
2837 * an error, assuming we're reading at
2838 * or beyond EOF.
2839 * Fall through and error the entire range.
2840 */
2841 error = KERN_FAILURE;
2842 } else {
2843 /*
2844 * Otherwise, we have partial read. If
2845 * the part read is an integral number
2846 * of pages, supply it. Otherwise round
2847 * it up to a page boundary, zero fill
2848 * the unread part, and supply it.
2849 * Fall through and error the remainder
2850 * of the range, if any.
2851 */
2852 int fill;
2853 unsigned int lsize;
2854
2855 fill = residual & ~vm_page_size;
2856 lsize = (xfer_size - residual) + fill;
2857
2858 pvs_object_data_provided(vs, upl, vs_offset, lsize);
2859
2860 if (lsize < xfer_size) {
2861 failed_size = xfer_size - lsize;
2862 error = KERN_FAILURE;
2863 }
2864 }
2865 }
2866 }
2867 if (error != KERN_SUCCESS) {
2868 /*
2869 * There was an error in some part of the range, tell
2870 * the VM. Note that error is explicitly checked again
2871 * since it can be modified above.
2872 */
2873 BS_STAT(psp[beg_pseg]->ps_bs,
2874 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
2875 }
2876 /*
2877 * we've issued a single I/O that encompassed the original offset
2878 * at this point we either met our speculative request length or
2879 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
2880 * not present or not physically contiguous to the previous one), so
2881 * we're done issuing I/O at this point
2882 */
2883 return (error);
2884 }
2885 }
2886 return error;
2887 }
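/*
 * A minimal standalone sketch of the per-iteration size clamping done at
 * the top of the while loop in pvs_cluster_read(): an unaligned start is
 * clamped at the super-cluster boundary, otherwise one full super-cluster
 * (or the remaining count, if smaller) is taken.  The 64 KB super-cluster
 * and 16 KB cluster sizes are assumptions; the real values come from
 * VM_SUPER_CLUSTER and vs_clshift.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>

static unsigned
clamp_size(unsigned vs_offset, unsigned cnt)
{
	const unsigned super   = 64 * 1024;
	const unsigned cl_mask = 16 * 1024 - 1;
	unsigned size;

	if ((vs_offset & cl_mask) && (cnt > (super - (vs_offset & cl_mask))))
		size = super - (vs_offset & cl_mask);
	else if (cnt > super)
		size = super;
	else
		size = cnt;
	return size;
}

int
main(void)
{
	/* unaligned start, large request: stop at the super-cluster edge */
	assert(clamp_size(0x1000, 256 * 1024) == 64 * 1024 - 0x1000);
	/* aligned start, large request: take one full super-cluster      */
	assert(clamp_size(0, 256 * 1024) == 64 * 1024);
	/* small request: take it whole                                    */
	assert(clamp_size(0, 4096) == 4096);
	printf("clamping behaves as in pvs_cluster_read()\n");
	return 0;
}
#endif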
2888
2889 int vs_do_async_write = 1;
2890
2891 kern_return_t
2892 vs_cluster_write(
2893 vstruct_t vs,
2894 upl_t internal_upl,
2895 upl_offset_t offset,
2896 upl_size_t cnt,
2897 boolean_t dp_internal,
2898 int flags)
2899 {
2900 upl_size_t transfer_size;
2901 int error = 0;
2902 struct clmap clmap;
2903
2904 vm_offset_t actual_offset; /* Offset within paging segment */
2905 paging_segment_t ps;
2906 vm_offset_t mobj_base_addr;
2907 vm_offset_t mobj_target_addr;
2908
2909 upl_t upl;
2910 upl_page_info_t *pl;
2911 int page_index;
2912 int list_size;
2913 int pages_in_cl;
2914 unsigned int cl_size;
2915 int base_index;
2916 unsigned int seg_size;
2917
2918 pages_in_cl = 1 << vs->vs_clshift;
2919 cl_size = pages_in_cl * vm_page_size;
2920
2921 if (!dp_internal) {
2922 unsigned int page_list_count;
2923 int request_flags;
2924 unsigned int super_size;
2925 int first_dirty;
2926 int num_dirty;
2927 int num_of_pages;
2928 int seg_index;
2929 upl_offset_t upl_offset;
2930 vm_offset_t seg_offset;
2931 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2932 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2933
2934
2935 if (bs_low) {
2936 super_size = cl_size;
2937
2938 request_flags = UPL_NOBLOCK |
2939 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2940 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
2941 } else {
2942 super_size = VM_SUPER_CLUSTER;
2943
2944 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2945 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2946 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
2947 }
2948
2949 if (!dp_encryption_inited) {
2950 /*
2951 * ENCRYPTED SWAP:
2952 * Once we've started using swap, we
2953 * can't change our mind on whether
2954 * it needs to be encrypted or
2955 * not.
2956 */
2957 dp_encryption_inited = TRUE;
2958 }
2959 if (dp_encryption) {
2960 /*
2961 * ENCRYPTED SWAP:
2962 * request that the UPL be prepared for
2963 * encryption.
2964 */
2965 request_flags |= UPL_ENCRYPT;
2966 flags |= UPL_PAGING_ENCRYPTED;
2967 }
2968
2969 page_list_count = 0;
2970 memory_object_super_upl_request(vs->vs_control,
2971 (memory_object_offset_t)offset,
2972 cnt, super_size,
2973 &upl, NULL, &page_list_count,
2974 request_flags | UPL_FOR_PAGEOUT);
2975
2976 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2977
2978 seg_size = cl_size - (upl->offset % cl_size);
2979 upl_offset = upl->offset & ~(cl_size - 1);
2980
2981 for (seg_index = 0, transfer_size = upl->size;
2982 transfer_size > 0; ) {
2983 ps_offset[seg_index] =
2984 ps_clmap(vs,
2985 upl_offset,
2986 &clmap, CL_ALLOC,
2987 cl_size, 0);
2988
2989 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2990 upl_abort(upl, 0);
2991 upl_deallocate(upl);
2992
2993 return KERN_FAILURE;
2994
2995 }
2996 psp[seg_index] = CLMAP_PS(clmap);
2997
2998 if (transfer_size > seg_size) {
2999 transfer_size -= seg_size;
3000 upl_offset += cl_size;
3001 seg_size = cl_size;
3002 seg_index++;
3003 } else
3004 transfer_size = 0;
3005 }
3006 /*
3007 * Ignore any non-present pages at the end of the
3008 * UPL.
3009 */
3010 for (page_index = upl->size / vm_page_size; page_index > 0;)
3011 if (UPL_PAGE_PRESENT(pl, --page_index))
3012 break;
3013 num_of_pages = page_index + 1;
3014
3015 base_index = (upl->offset % cl_size) / PAGE_SIZE;
3016
3017 for (page_index = 0; page_index < num_of_pages; ) {
3018 /*
3019 * skip over non-dirty pages
3020 */
3021 for ( ; page_index < num_of_pages; page_index++) {
3022 if (UPL_DIRTY_PAGE(pl, page_index)
3023 || UPL_PRECIOUS_PAGE(pl, page_index))
3024 /*
3025 * this is a page we need to write
3026 * go see if we can buddy it up with
3027 * others that are contiguous to it
3028 */
3029 break;
3030 /*
3031 * if the page is not dirty, but present, we
3032 * need to commit it... This is an unusual
3033 * case since we only asked for dirty pages
3034 */
3035 if (UPL_PAGE_PRESENT(pl, page_index)) {
3036 boolean_t empty = FALSE;
3037 upl_commit_range(upl,
3038 page_index * vm_page_size,
3039 vm_page_size,
3040 UPL_COMMIT_NOTIFY_EMPTY,
3041 pl,
3042 page_list_count,
3043 &empty);
3044 if (empty) {
3045 assert(page_index ==
3046 num_of_pages - 1);
3047 upl_deallocate(upl);
3048 }
3049 }
3050 }
3051 if (page_index == num_of_pages)
3052 /*
3053 * no more pages to look at, we're out of here
3054 */
3055 break;
3056
3057 /*
3058 * gather up contiguous dirty pages... we have at
3059 * least 1, otherwise we would have bailed above
3060 * make sure that each physical segment that we step
3061 * into is contiguous to the one we're currently in
3062 * if it's not, we have to stop and write what we have
3063 */
3064 for (first_dirty = page_index;
3065 page_index < num_of_pages; ) {
3066 if ( !UPL_DIRTY_PAGE(pl, page_index)
3067 && !UPL_PRECIOUS_PAGE(pl, page_index))
3068 break;
3069 page_index++;
3070 /*
3071 * if we just looked at the last page in the UPL
3072 * we don't need to check for physical segment
3073 * continuity
3074 */
3075 if (page_index < num_of_pages) {
3076 int cur_seg;
3077 int nxt_seg;
3078
3079 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3080 nxt_seg = (base_index + page_index)/pages_in_cl;
3081
3082 if (cur_seg != nxt_seg) {
3083 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3084 /*
3085 * if the segment we're about
3086 * to step into is not
3087 * contiguous to the one we're
3088 * currently in, or it's in a
3089 * different paging file....
3090 * we stop here and generate
3091 * the I/O
3092 */
3093 break;
3094 }
3095 }
3096 }
3097 num_dirty = page_index - first_dirty;
3098
3099 if (num_dirty) {
3100 upl_offset = first_dirty * vm_page_size;
3101 transfer_size = num_dirty * vm_page_size;
3102
3103 while (transfer_size) {
3104
3105 if ((seg_size = cl_size -
3106 ((upl->offset + upl_offset) % cl_size))
3107 > transfer_size)
3108 seg_size = transfer_size;
3109
3110 ps_vs_write_complete(vs,
3111 upl->offset + upl_offset,
3112 seg_size, error);
3113
3114 transfer_size -= seg_size;
3115 upl_offset += seg_size;
3116 }
3117 upl_offset = first_dirty * vm_page_size;
3118 transfer_size = num_dirty * vm_page_size;
3119
3120 seg_index = (base_index + first_dirty) / pages_in_cl;
3121 seg_offset = (upl->offset + upl_offset) % cl_size;
3122
3123 error = ps_write_file(psp[seg_index],
3124 upl, upl_offset,
3125 ps_offset[seg_index]
3126 + seg_offset,
3127 transfer_size, flags);
3128 } else {
3129 boolean_t empty = FALSE;
3130 upl_abort_range(upl,
3131 first_dirty * vm_page_size,
3132 num_dirty * vm_page_size,
3133 UPL_ABORT_NOTIFY_EMPTY,
3134 &empty);
3135 if (empty) {
3136 assert(page_index == num_of_pages);
3137 upl_deallocate(upl);
3138 }
3139 }
3140 }
3141
3142 } else {
3143 assert(cnt <= (vm_page_size << vs->vs_clshift));
3144 list_size = cnt;
3145
3146 page_index = 0;
3147 /* The caller provides a mapped_data which is derived */
3148 /* from a temporary object. The targeted pages are */
3149 /* guaranteed to be set at offset 0 in the mapped_data */
3150 /* The actual offset however must still be derived */
3151 /* from the offset in the vs in question */
3152 mobj_base_addr = offset;
3153 mobj_target_addr = mobj_base_addr;
3154
3155 for (transfer_size = list_size; transfer_size != 0;) {
3156 actual_offset = ps_clmap(vs, mobj_target_addr,
3157 &clmap, CL_ALLOC,
3158 transfer_size < cl_size ?
3159 transfer_size : cl_size, 0);
3160 if(actual_offset == (vm_offset_t) -1) {
3161 error = 1;
3162 break;
3163 }
3164 cnt = MIN(transfer_size,
3165 CLMAP_NPGS(clmap) * vm_page_size);
3166 ps = CLMAP_PS(clmap);
3167 /* Assume that the caller has given us contiguous */
3168 /* pages */
3169 if(cnt) {
3170 ps_vs_write_complete(vs, mobj_target_addr,
3171 cnt, error);
3172 error = ps_write_file(ps, internal_upl,
3173 0, actual_offset,
3174 cnt, flags);
3175 if (error)
3176 break;
3177 }
3178 if (error)
3179 break;
3180 actual_offset += cnt;
3181 mobj_target_addr += cnt;
3182 transfer_size -= cnt;
3183 cnt = 0;
3184
3185 if (error)
3186 break;
3187 }
3188 }
3189 if(error)
3190 return KERN_FAILURE;
3191 else
3192 return KERN_SUCCESS;
3193 }
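/*
 * A minimal standalone sketch of the dirty-page gathering done in
 * vs_cluster_write(): skip clean pages, then collect the contiguous run
 * of dirty pages that follows and issue one write per run.  The dirty[]
 * array stands in for the UPL_DIRTY_PAGE()/UPL_PRECIOUS_PAGE() checks
 * (an assumption), and the physical-segment continuity test that can
 * also end a run is omitted here.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	bool dirty[10] = { false, false, true, true, true, false,
			   true,  false, false, true };
	int  num_of_pages = 10, page_index = 0;

	while (page_index < num_of_pages) {
		/* skip over non-dirty pages */
		while (page_index < num_of_pages && !dirty[page_index])
			page_index++;
		if (page_index == num_of_pages)
			break;

		/* gather up contiguous dirty pages */
		int first_dirty = page_index;
		while (page_index < num_of_pages && dirty[page_index])
			page_index++;

		printf("would write pages [%d, %d)\n",
		    first_dirty, page_index);
	}
	return 0;
}
#endif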
3194
3195 vm_size_t
3196 ps_vstruct_allocated_size(
3197 vstruct_t vs)
3198 {
3199 int num_pages;
3200 struct vs_map *vsmap;
3201 unsigned int i, j, k;
3202
3203 num_pages = 0;
3204 if (vs->vs_indirect) {
3205 /* loop on indirect maps */
3206 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3207 vsmap = vs->vs_imap[i];
3208 if (vsmap == NULL)
3209 continue;
3210 /* loop on clusters in this indirect map */
3211 for (j = 0; j < CLMAP_ENTRIES; j++) {
3212 if (VSM_ISCLR(vsmap[j]) ||
3213 VSM_ISERR(vsmap[j]))
3214 continue;
3215 /* loop on pages in this cluster */
3216 for (k = 0; k < VSCLSIZE(vs); k++) {
3217 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3218 num_pages++;
3219 }
3220 }
3221 }
3222 } else {
3223 vsmap = vs->vs_dmap;
3224 if (vsmap == NULL)
3225 return 0;
3226 /* loop on clusters in the direct map */
3227 for (j = 0; j < CLMAP_ENTRIES; j++) {
3228 if (VSM_ISCLR(vsmap[j]) ||
3229 VSM_ISERR(vsmap[j]))
3230 continue;
3231 /* loop on pages in this cluster */
3232 for (k = 0; k < VSCLSIZE(vs); k++) {
3233 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3234 num_pages++;
3235 }
3236 }
3237 }
3238
3239 return ptoa_32(num_pages);
3240 }
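/*
 * A minimal standalone sketch of the page counting above: walk each
 * cluster's backing bitmap and count the set bits.  The three 8-page
 * clusters and their bit patterns are assumptions for illustration; the
 * real code reads the bitmaps through VSM_BMAP() and VSCLSIZE().
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* three hypothetical 8-page clusters; bit k set => page k backed */
	const uint8_t  bmap[3]  = { 0xFF, 0x00, 0x0F };
	const unsigned vsclsize = 8;
	unsigned num_pages = 0;

	for (unsigned j = 0; j < 3; j++)
		for (unsigned k = 0; k < vsclsize; k++)
			if (bmap[j] & (1u << k))
				num_pages++;

	printf("%u pages allocated\n", num_pages);	/* 8 + 0 + 4 = 12 */
	assert(num_pages == 12);
	return 0;
}
#endif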
3241
3242 size_t
3243 ps_vstruct_allocated_pages(
3244 vstruct_t vs,
3245 default_pager_page_t *pages,
3246 size_t pages_size)
3247 {
3248 unsigned int num_pages;
3249 struct vs_map *vsmap;
3250 vm_offset_t offset;
3251 unsigned int i, j, k;
3252
3253 num_pages = 0;
3254 offset = 0;
3255 if (vs->vs_indirect) {
3256 /* loop on indirect maps */
3257 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3258 vsmap = vs->vs_imap[i];
3259 if (vsmap == NULL) {
3260 offset += (vm_page_size * CLMAP_ENTRIES *
3261 VSCLSIZE(vs));
3262 continue;
3263 }
3264 /* loop on clusters in this indirect map */
3265 for (j = 0; j < CLMAP_ENTRIES; j++) {
3266 if (VSM_ISCLR(vsmap[j]) ||
3267 VSM_ISERR(vsmap[j])) {
3268 offset += vm_page_size * VSCLSIZE(vs);
3269 continue;
3270 }
3271 /* loop on pages in this cluster */
3272 for (k = 0; k < VSCLSIZE(vs); k++) {
3273 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3274 num_pages++;
3275 if (num_pages < pages_size)
3276 pages++->dpp_offset =
3277 offset;
3278 }
3279 offset += vm_page_size;
3280 }
3281 }
3282 }
3283 } else {
3284 vsmap = vs->vs_dmap;
3285 if (vsmap == NULL)
3286 return 0;
3287 /* loop on clusters in the direct map */
3288 for (j = 0; j < CLMAP_ENTRIES; j++) {
3289 if (VSM_ISCLR(vsmap[j]) ||
3290 VSM_ISERR(vsmap[j])) {
3291 offset += vm_page_size * VSCLSIZE(vs);
3292 continue;
3293 }
3294 /* loop on pages in this cluster */
3295 for (k = 0; k < VSCLSIZE(vs); k++) {
3296 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3297 num_pages++;
3298 if (num_pages < pages_size)
3299 pages++->dpp_offset = offset;
3300 }
3301 offset += vm_page_size;
3302 }
3303 }
3304 }
3305
3306 return num_pages;
3307 }
3308
3309
3310 kern_return_t
3311 ps_vstruct_transfer_from_segment(
3312 vstruct_t vs,
3313 paging_segment_t segment,
3314 upl_t upl)
3315 {
3316 struct vs_map *vsmap;
3317 // struct vs_map old_vsmap;
3318 // struct vs_map new_vsmap;
3319 unsigned int i, j;
3320
3321 VS_LOCK(vs); /* block all work on this vstruct */
3322 /* can't allow the normal multiple write */
3323 /* semantic because writes may conflict */
3324 vs->vs_xfer_pending = TRUE;
3325 vs_wait_for_sync_writers(vs);
3326 vs_start_write(vs);
3327 vs_wait_for_readers(vs);
3328 /* we will unlock the vs to allow other writes while transferring */
3329 /* and will be guaranteed of the persistence of the vs struct */
3330 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3331 /* vs_async_pending */
3332 /* OK we now have guaranteed no other parties are accessing this */
3333 /* vs. Now that we are also supporting simple lock versions of */
3334 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3335 /* our purpose in holding it before was the multiple write case */
3336 /* we now use the boolean xfer_pending to do that. We can use */
3337 /* a boolean instead of a count because we have guaranteed single */
3338 /* file access to this code in its caller */
3339 VS_UNLOCK(vs);
3340 vs_changed:
3341 if (vs->vs_indirect) {
3342 unsigned int vsmap_size;
3343 int clmap_off;
3344 /* loop on indirect maps */
3345 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3346 vsmap = vs->vs_imap[i];
3347 if (vsmap == NULL)
3348 continue;
3349 /* loop on clusters in this indirect map */
3350 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3351 VSCLSIZE(vs) * i);
3352 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3353 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3354 else
3355 vsmap_size = CLMAP_ENTRIES;
3356 for (j = 0; j < vsmap_size; j++) {
3357 if (VSM_ISCLR(vsmap[j]) ||
3358 VSM_ISERR(vsmap[j]) ||
3359 (VSM_PS(vsmap[j]) != segment))
3360 continue;
3361 if(vs_cluster_transfer(vs,
3362 (vm_page_size * (j << vs->vs_clshift))
3363 + clmap_off,
3364 vm_page_size << vs->vs_clshift,
3365 upl)
3366 != KERN_SUCCESS) {
3367 VS_LOCK(vs);
3368 vs->vs_xfer_pending = FALSE;
3369 VS_UNLOCK(vs);
3370 vs_finish_write(vs);
3371 return KERN_FAILURE;
3372 }
3373 /* allow other readers/writers during transfer*/
3374 VS_LOCK(vs);
3375 vs->vs_xfer_pending = FALSE;
3376 VS_UNLOCK(vs);
3377 vs_finish_write(vs);
3378 VS_LOCK(vs);
3379 vs->vs_xfer_pending = TRUE;
3380 vs_wait_for_sync_writers(vs);
3381 vs_start_write(vs);
3382 vs_wait_for_readers(vs);
3383 VS_UNLOCK(vs);
3384 if (!(vs->vs_indirect)) {
3385 goto vs_changed;
3386 }
3387 }
3388 }
3389 } else {
3390 vsmap = vs->vs_dmap;
3391 if (vsmap == NULL) {
3392 VS_LOCK(vs);
3393 vs->vs_xfer_pending = FALSE;
3394 VS_UNLOCK(vs);
3395 vs_finish_write(vs);
3396 return KERN_SUCCESS;
3397 }
3398 /* loop on clusters in the direct map */
3399 for (j = 0; j < vs->vs_size; j++) {
3400 if (VSM_ISCLR(vsmap[j]) ||
3401 VSM_ISERR(vsmap[j]) ||
3402 (VSM_PS(vsmap[j]) != segment))
3403 continue;
3404 if(vs_cluster_transfer(vs,
3405 vm_page_size * (j << vs->vs_clshift),
3406 vm_page_size << vs->vs_clshift,
3407 upl) != KERN_SUCCESS) {
3408 VS_LOCK(vs);
3409 vs->vs_xfer_pending = FALSE;
3410 VS_UNLOCK(vs);
3411 vs_finish_write(vs);
3412 return KERN_FAILURE;
3413 }
3414 /* allow other readers/writers during transfer*/
3415 VS_LOCK(vs);
3416 vs->vs_xfer_pending = FALSE;
3417 VS_UNLOCK(vs);
3418 vs_finish_write(vs);
3419 VS_LOCK(vs);
3420 vs->vs_xfer_pending = TRUE;
3421 VS_UNLOCK(vs);
3422 vs_wait_for_sync_writers(vs);
3423 vs_start_write(vs);
3424 vs_wait_for_readers(vs);
3425 if (vs->vs_indirect) {
3426 goto vs_changed;
3427 }
3428 }
3429 }
3430
3431 VS_LOCK(vs);
3432 vs->vs_xfer_pending = FALSE;
3433 VS_UNLOCK(vs);
3434 vs_finish_write(vs);
3435 return KERN_SUCCESS;
3436 }
3437
3438
3439
3440 vs_map_t
3441 vs_get_map_entry(
3442 vstruct_t vs,
3443 vm_offset_t offset)
3444 {
3445 struct vs_map *vsmap;
3446 vm_offset_t cluster;
3447
3448 cluster = atop_32(offset) >> vs->vs_clshift;
3449 if (vs->vs_indirect) {
3450 long ind_block = cluster/CLMAP_ENTRIES;
3451
3452 /* Is the indirect block allocated? */
3453 vsmap = vs->vs_imap[ind_block];
3454 if(vsmap == (vs_map_t) NULL)
3455 return vsmap;
3456 } else
3457 vsmap = vs->vs_dmap;
3458 vsmap += cluster%CLMAP_ENTRIES;
3459 return vsmap;
3460 }
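/*
 * A minimal standalone sketch of the two-level map lookup above: the byte
 * offset is reduced to a cluster number, which then selects an indirect
 * block (cluster / CLMAP_ENTRIES) and a slot within it (cluster %
 * CLMAP_ENTRIES).  The page size, cluster shift, and the value 128 for
 * CLMAP_ENTRIES are assumptions for illustration only.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>

int
main(void)
{
	const unsigned vm_page_shift = 12;	/* 4 KB page (assumed)       */
	const unsigned vs_clshift    = 2;	/* 4 pages per cluster       */
	const unsigned clmap_entries = 128;	/* entries per map (assumed) */

	unsigned offset  = 0x2345000;		/* byte offset into object   */
	unsigned cluster = (offset >> vm_page_shift) >> vs_clshift;
	unsigned block   = cluster / clmap_entries;	/* indirect index    */
	unsigned entry   = cluster % clmap_entries;	/* slot within block */

	printf("offset 0x%x -> cluster %u (block %u, entry %u)\n",
	    offset, cluster, block, entry);
	assert(cluster == 0x2345000 / (4096 * 4));
	return 0;
}
#endif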
3461
3462 kern_return_t
3463 vs_cluster_transfer(
3464 vstruct_t vs,
3465 vm_offset_t offset,
3466 vm_size_t cnt,
3467 upl_t upl)
3468 {
3469 vm_offset_t actual_offset;
3470 paging_segment_t ps;
3471 struct clmap clmap;
3472 kern_return_t error = KERN_SUCCESS;
3473 unsigned int size, size_wanted;
3474 int i;
3475 unsigned int residual = 0;
3476 unsigned int unavail_size;
3477 // default_pager_thread_t *dpt;
3478 // boolean_t dealloc;
3479 struct vs_map *vsmap_ptr = NULL;
3480 struct vs_map read_vsmap;
3481 struct vs_map original_read_vsmap;
3482 struct vs_map write_vsmap;
3483 // upl_t sync_upl;
3484 // vm_offset_t ioaddr;
3485
3486 /* vs_cluster_transfer reads in the pages of a cluster and
3487 * then writes these pages back to new backing store. The
3488 * segment the pages are being read from is assumed to have
3489 * been taken off-line and is no longer considered for new
3490 * space requests.
3491 */
3492
3493 /*
3494 * This loop will be executed once per cluster referenced.
3495 * Typically this means once, since it's unlikely that the
3496 * VM system will ask for anything spanning cluster boundaries.
3497 *
3498 * If there are holes in a cluster (in a paging segment), we stop
3499 * reading at the hole, then loop again, hoping to
3500 * find valid pages later in the cluster. This continues until
3501 * the entire range has been examined, and read, if present. The
3502 * pages are written as they are read. If a failure occurs after
3503 * some pages are written the unmap call at the bottom of the loop
3504 * recovers the backing store and the old backing store remains
3505 * in effect.
3506 */
3507
3508 VSM_CLR(write_vsmap);
3509 VSM_CLR(original_read_vsmap);
3510 /* grab the actual object's pages to sync with I/O */
3511 while (cnt && (error == KERN_SUCCESS)) {
3512 vsmap_ptr = vs_get_map_entry(vs, offset);
3513 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3514
3515 if (actual_offset == (vm_offset_t) -1) {
3516
3517 /*
3518 * Nothing left to write in this cluster; at least
3519 * set write cluster information for any previous
3520 * write, and clear it for the next cluster, if there is one
3521 */
3522 unsigned int local_size, clmask, clsize;
3523
3524 clsize = vm_page_size << vs->vs_clshift;
3525 clmask = clsize - 1;
3526 local_size = clsize - (offset & clmask);
3527 ASSERT(local_size);
3528 local_size = MIN(local_size, cnt);
3529
3530 /* This cluster has no data in it beyond what may */
3531 /* have been found on a previous iteration through */
3532 /* the loop "write_vsmap" */
3533 *vsmap_ptr = write_vsmap;
3534 VSM_CLR(write_vsmap);
3535 VSM_CLR(original_read_vsmap);
3536
3537 cnt -= local_size;
3538 offset += local_size;
3539 continue;
3540 }
3541
3542 /*
3543 * Count up contiguous available or unavailable
3544 * pages.
3545 */
3546 ps = CLMAP_PS(clmap);
3547 ASSERT(ps);
3548 size = 0;
3549 unavail_size = 0;
3550 for (i = 0;
3551 (size < cnt) && (unavail_size < cnt) &&
3552 (i < CLMAP_NPGS(clmap)); i++) {
3553 if (CLMAP_ISSET(clmap, i)) {
3554 if (unavail_size != 0)
3555 break;
3556 size += vm_page_size;
3557 BS_STAT(ps->ps_bs,
3558 ps->ps_bs->bs_pages_in++);
3559 } else {
3560 if (size != 0)
3561 break;
3562 unavail_size += vm_page_size;
3563 }
3564 }
3565
3566 if (size == 0) {
3567 ASSERT(unavail_size);
3568 cnt -= unavail_size;
3569 offset += unavail_size;
3570 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3571 == 0) {
3572 /* There is no more to transfer in this
3573 cluster
3574 */
3575 *vsmap_ptr = write_vsmap;
3576 VSM_CLR(write_vsmap);
3577 VSM_CLR(original_read_vsmap);
3578 }
3579 continue;
3580 }
3581
3582 if(VSM_ISCLR(original_read_vsmap))
3583 original_read_vsmap = *vsmap_ptr;
3584
3585 if(ps->ps_segtype == PS_PARTITION) {
3586 panic("swap partition not supported\n");
3587 /*NOTREACHED*/
3588 error = KERN_FAILURE;
3589 residual = size;
3590 /*
3591 NEED TO ISSUE WITH SYNC & NO COMMIT
3592 error = ps_read_device(ps, actual_offset, &buffer,
3593 size, &residual, flags);
3594 */
3595 } else {
3596 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3597 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3598 size, &residual,
3599 (UPL_IOSYNC | UPL_NOCOMMIT));
3600 }
3601
3602 read_vsmap = *vsmap_ptr;
3603
3604
3605 /*
3606 * Adjust counts and put data in new BS. Optimize for the
3607 * common case, i.e. no error and/or partial data.
3608 * If there was an error, then we need to error the entire
3609 * range, even if some data was successfully read.
3610 *
3611 */
3612 if ((error == KERN_SUCCESS) && (residual == 0)) {
3613
3614 /*
3615 * Got everything we asked for, supply the data to
3616 * the new BS. Note that as a side effect of supplying
3617 * the data, the buffer holding the supplied data is
3618 * deallocated from the pager's address space unless
3619 * the write is unsuccessful.
3620 */
3621
3622 /* note buffer will be cleaned up in all cases by */
3623 /* internal_cluster_write or if an error on write */
3624 /* the vm_map_copy_page_discard call */
3625 *vsmap_ptr = write_vsmap;
3626
3627 if(vs_cluster_write(vs, upl, offset,
3628 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3629 error = KERN_FAILURE;
3630 if(!(VSM_ISCLR(*vsmap_ptr))) {
3631 /* unmap the new backing store object */
3632 ps_clunmap(vs, offset, size);
3633 }
3634 /* original vsmap */
3635 *vsmap_ptr = original_read_vsmap;
3636 VSM_CLR(write_vsmap);
3637 } else {
3638 if((offset + size) &
3639 ((vm_page_size << vs->vs_clshift)
3640 - 1)) {
3641 /* There is more to transfer in this
3642 cluster
3643 */
3644 write_vsmap = *vsmap_ptr;
3645 *vsmap_ptr = read_vsmap;
3646 } else {
3647 /* discard the old backing object */
3648 write_vsmap = *vsmap_ptr;
3649 *vsmap_ptr = read_vsmap;
3650 ps_clunmap(vs, offset, size);
3651 *vsmap_ptr = write_vsmap;
3652 VSM_CLR(write_vsmap);
3653 VSM_CLR(original_read_vsmap);
3654 }
3655 }
3656 } else {
3657 size_wanted = size;
3658 if (error == KERN_SUCCESS) {
3659 if (residual == size) {
3660 /*
3661 * If a read operation returns no error
3662 * and no data moved, we turn it into
3663 * an error, assuming we're reading at
3664 * or beyond EOF.
3665 * Fall through and error the entire
3666 * range.
3667 */
3668 error = KERN_FAILURE;
3669 *vsmap_ptr = write_vsmap;
3670 if(!(VSM_ISCLR(*vsmap_ptr))) {
3671 /* unmap the new backing store object */
3672 ps_clunmap(vs, offset, size);
3673 }
3674 *vsmap_ptr = original_read_vsmap;
3675 VSM_CLR(write_vsmap);
3676 continue;
3677 } else {
3678 /*
3679 * Otherwise, we have partial read.
3680 * This is also considered an error
3681 * for the purposes of cluster transfer
3682 */
3683 error = KERN_FAILURE;
3684 *vsmap_ptr = write_vsmap;
3685 if(!(VSM_ISCLR(*vsmap_ptr))) {
3686 /* unmap the new backing store object */
3687 ps_clunmap(vs, offset, size);
3688 }
3689 *vsmap_ptr = original_read_vsmap;
3690 VSM_CLR(write_vsmap);
3691 continue;
3692 }
3693 }
3694
3695 }
3696 cnt -= size;
3697 offset += size;
3698
3699 } /* END while (cnt && (error == 0)) */
3700 if(!VSM_ISCLR(write_vsmap))
3701 *vsmap_ptr = write_vsmap;
3702
3703 return error;
3704 }
3705
3706 kern_return_t
3707 default_pager_add_file(
3708 MACH_PORT_FACE backing_store,
3709 vnode_ptr_t vp,
3710 int record_size,
3711 vm_size_t size)
3712 {
3713 backing_store_t bs;
3714 paging_segment_t ps;
3715 int i;
3716 unsigned int j;
3717 int error;
3718
3719 if ((bs = backing_store_lookup(backing_store))
3720 == BACKING_STORE_NULL)
3721 return KERN_INVALID_ARGUMENT;
3722
3723 PSL_LOCK();
3724 for (i = 0; i <= paging_segment_max; i++) {
3725 ps = paging_segments[i];
3726 if (ps == PAGING_SEGMENT_NULL)
3727 continue;
3728 if (ps->ps_segtype != PS_FILE)
3729 continue;
3730
3731 /*
3732 * Check for overlap on same device.
3733 */
3734 if (ps->ps_vnode == (struct vnode *)vp) {
3735 PSL_UNLOCK();
3736 BS_UNLOCK(bs);
3737 return KERN_INVALID_ARGUMENT;
3738 }
3739 }
3740 PSL_UNLOCK();
3741
3742 /*
3743 * Set up the paging segment
3744 */
3745 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3746 if (ps == PAGING_SEGMENT_NULL) {
3747 BS_UNLOCK(bs);
3748 return KERN_RESOURCE_SHORTAGE;
3749 }
3750
3751 ps->ps_segtype = PS_FILE;
3752 ps->ps_vnode = (struct vnode *)vp;
3753 ps->ps_offset = 0;
3754 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3755 ps->ps_recnum = size;
3756 ps->ps_pgnum = size >> ps->ps_record_shift;
3757
3758 ps->ps_pgcount = ps->ps_pgnum;
3759 ps->ps_clshift = local_log2(bs->bs_clsize);
3760 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3761 ps->ps_hint = 0;
3762
3763 PS_LOCK_INIT(ps);
3764 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3765 if (!ps->ps_bmap) {
3766 kfree(ps, sizeof *ps);
3767 BS_UNLOCK(bs);
3768 return KERN_RESOURCE_SHORTAGE;
3769 }
3770 for (j = 0; j < ps->ps_ncls; j++) {
3771 clrbit(ps->ps_bmap, j);
3772 }
3773
3774 ps->ps_going_away = FALSE;
3775 ps->ps_bs = bs;
3776
3777 if ((error = ps_enter(ps)) != 0) {
3778 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3779 kfree(ps, sizeof *ps);
3780 BS_UNLOCK(bs);
3781 return KERN_RESOURCE_SHORTAGE;
3782 }
3783
3784 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3785 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3786 PSL_LOCK();
3787 dp_pages_free += ps->ps_pgcount;
3788 PSL_UNLOCK();
3789
3790 BS_UNLOCK(bs);
3791
3792 bs_more_space(ps->ps_clcount);
3793
3794 DP_DEBUG(DEBUG_BS_INTERNAL,
3795 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3796 device, offset, size, record_size,
3797 ps->ps_record_shift, ps->ps_pgnum));
3798
3799 return KERN_SUCCESS;
3800 }
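/*
 * A minimal standalone sketch of the segment sizing done above: the record
 * shift comes from log2(vm_page_size / record_size), the file's record
 * count becomes a page count, and the page count becomes a whole-cluster
 * count.  local_log2_sketch() stands in for local_log2(), and the 4 KB
 * page, 512-byte record, and 4-page cluster sizes are assumptions.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>

static unsigned
local_log2_sketch(unsigned n)		/* n assumed to be a power of two */
{
	unsigned s = 0;
	while (n > 1) { n >>= 1; s++; }
	return s;
}

int
main(void)
{
	const unsigned vm_page_size = 4096;	/* assumed                  */
	const unsigned record_size  = 512;	/* device block (assumed)   */
	const unsigned bs_clsize    = 4;	/* pages per cluster        */
	unsigned size = 262144;			/* file length, in records  */

	unsigned record_shift = local_log2_sketch(vm_page_size / record_size);
	unsigned pgnum        = size >> record_shift;	/* pages in file    */
	unsigned clshift      = local_log2_sketch(bs_clsize);
	unsigned clcount      = pgnum >> clshift;	/* whole clusters   */

	printf("%u records -> %u pages -> %u clusters -> %u usable pages\n",
	    size, pgnum, clcount, clcount << clshift);
	assert(pgnum == 32768 && clcount == 8192);
	return 0;
}
#endif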
3801
3802
3803
3804 kern_return_t
3805 ps_read_file(
3806 paging_segment_t ps,
3807 upl_t upl,
3808 upl_offset_t upl_offset,
3809 vm_offset_t offset,
3810 upl_size_t size,
3811 unsigned int *residualp,
3812 int flags)
3813 {
3814 vm_object_offset_t f_offset;
3815 int error = 0;
3816 int result;
3817
3818 assert(dp_encryption_inited);
3819
3820 clustered_reads[atop_32(size)]++;
3821
3822 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3823
3824 /*
3825 * for transfer case we need to pass uploffset and flags
3826 */
3827 error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL);
3828
3829 /* The vnode_pagein semantic is somewhat at odds with the existing */
3830 /* device_read semantic. Partial reads are not experienced at this */
3831 /* level. It is up to the bit map code and cluster read code to */
3832 /* check that requested data locations are actually backed, and the */
3833 /* pagein code to either read all of the requested data or return an */
3834 /* error. */
3835
3836 if (error)
3837 result = KERN_FAILURE;
3838 else {
3839 *residualp = 0;
3840 result = KERN_SUCCESS;
3841 }
3842 return result;
3843 }
3844
3845 kern_return_t
3846 ps_write_file(
3847 paging_segment_t ps,
3848 upl_t upl,
3849 upl_offset_t upl_offset,
3850 vm_offset_t offset,
3851 unsigned int size,
3852 int flags)
3853 {
3854 vm_object_offset_t f_offset;
3855 kern_return_t result;
3856
3857 assert(dp_encryption_inited);
3858
3859 clustered_writes[atop_32(size)]++;
3860 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3861
3862 if (flags & UPL_PAGING_ENCRYPTED) {
3863 /*
3864 * ENCRYPTED SWAP:
3865 * encrypt all the pages that we're going
3866 * to pageout.
3867 */
3868 upl_encrypt(upl, upl_offset, size);
3869 }
3870 if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3871 result = KERN_FAILURE;
3872 else
3873 result = KERN_SUCCESS;
3874
3875 return result;
3876 }
3877
3878 kern_return_t
3879 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
3880 int hi_wat,
3881 int lo_wat,
3882 int flags,
3883 MACH_PORT_FACE trigger_port)
3884 {
3885 MACH_PORT_FACE release;
3886 kern_return_t kr;
3887
3888 PSL_LOCK();
3889 if (flags == SWAP_ENCRYPT_ON) {
3890 /* ENCRYPTED SWAP: turn encryption on */
3891 release = trigger_port;
3892 if (!dp_encryption_inited) {
3893 dp_encryption_inited = TRUE;
3894 dp_encryption = TRUE;
3895 kr = KERN_SUCCESS;
3896 } else {
3897 kr = KERN_FAILURE;
3898 }
3899 } else if (flags == SWAP_ENCRYPT_OFF) {
3900 /* ENCRYPTED SWAP: turn encryption off */
3901 release = trigger_port;
3902 if (!dp_encryption_inited) {
3903 dp_encryption_inited = TRUE;
3904 dp_encryption = FALSE;
3905 kr = KERN_SUCCESS;
3906 } else {
3907 kr = KERN_FAILURE;
3908 }
3909 } else if (flags == HI_WAT_ALERT) {
3910 release = min_pages_trigger_port;
3911 min_pages_trigger_port = trigger_port;
3912 minimum_pages_remaining = hi_wat/vm_page_size;
3913 bs_low = FALSE;
3914 kr = KERN_SUCCESS;
3915 } else if (flags == LO_WAT_ALERT) {
3916 release = max_pages_trigger_port;
3917 max_pages_trigger_port = trigger_port;
3918 maximum_pages_free = lo_wat/vm_page_size;
3919 kr = KERN_SUCCESS;
3920 } else {
3921 release = trigger_port;
3922 kr = KERN_INVALID_ARGUMENT;
3923 }
3924 PSL_UNLOCK();
3925
3926 if (IP_VALID(release))
3927 ipc_port_release_send(release);
3928
3929 return kr;
3930 }
3931
3932 /*
3933 * Monitor the amount of available backing store vs. the amount of
3934 * required backing store, notify a listener (if present) when
3935 * backing store may safely be removed.
3936 *
3937 * We attempt to avoid the situation where backing store is
3938 * discarded en masse, as this can lead to thrashing as the
3939 * backing store is compacted.
3940 */
3941
3942 #define PF_INTERVAL 3 /* time between free level checks */
3943 #define PF_LATENCY 10 /* number of intervals before release */
3944
3945 static int dp_pages_free_low_count = 0;
3946 thread_call_t default_pager_backing_store_monitor_callout;
3947
3948 void
3949 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
3950 __unused thread_call_param_t p2)
3951 {
3952 // unsigned long long average;
3953 ipc_port_t trigger;
3954 uint64_t deadline;
3955
3956 /*
3957 * We determine whether it will be safe to release some
3958 * backing store by watching the free page level. If
3959 * it remains above the maximum_pages_free threshold for
3960 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3961 * then we deem it safe.
3962 *
3963 * Note that this establishes a maximum rate at which backing
3964 * store will be released, as each notification (currently)
3965 * only results in a single backing store object being
3966 * released.
3967 */
3968 if (dp_pages_free > maximum_pages_free) {
3969 dp_pages_free_low_count++;
3970 } else {
3971 dp_pages_free_low_count = 0;
3972 }
3973
3974 /* decide whether to send notification */
3975 trigger = IP_NULL;
3976 if (max_pages_trigger_port &&
3977 (backing_store_release_trigger_disable == 0) &&
3978 (dp_pages_free_low_count > PF_LATENCY)) {
3979 trigger = max_pages_trigger_port;
3980 max_pages_trigger_port = NULL;
3981 }
3982
3983 /* send notification */
3984 if (trigger != IP_NULL) {
3985 VSL_LOCK();
3986 if(backing_store_release_trigger_disable != 0) {
3987 assert_wait((event_t)
3988 &backing_store_release_trigger_disable,
3989 THREAD_UNINT);
3990 VSL_UNLOCK();
3991 thread_block(THREAD_CONTINUE_NULL);
3992 } else {
3993 VSL_UNLOCK();
3994 }
3995 default_pager_space_alert(trigger, LO_WAT_ALERT);
3996 ipc_port_release_send(trigger);
3997 dp_pages_free_low_count = 0;
3998 }
3999
4000 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4001 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4002 }
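/*
 * A minimal standalone sketch of the hysteresis used by the monitor above:
 * the free level must stay on the "plenty free" side of the threshold for
 * more than PF_LATENCY consecutive checks before a notification would be
 * sent, and any dip resets the count.  The sample values and threshold
 * are made-up numbers for illustration.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define PF_LATENCY_SKETCH	10	/* intervals before release, as above */

int
main(void)
{
	unsigned maximum_pages_free = 1000, low_count = 0;
	unsigned samples[15] = { 1500, 1500, 900, 1500, 1500, 1500, 1500,
				 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500 };
	bool notified = false;

	for (unsigned i = 0; i < 15; i++) {
		if (samples[i] > maximum_pages_free)
			low_count++;
		else
			low_count = 0;		/* any dip resets the count */

		if (low_count > PF_LATENCY_SKETCH && !notified) {
			printf("check %u: would send LO_WAT_ALERT\n", i);
			notified = true;
		}
	}
	assert(notified);
	return 0;
}
#endif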