osfmk/default_pager/dp_backing_store.c (apple/xnu, tag xnu-3248.60.10)
1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 /*
58 * Default Pager.
59 * Paging File Management.
60 */
61
62 #include <mach/host_priv.h>
63 #include <mach/memory_object_control.h>
64 #include <mach/memory_object_server.h>
65 #include <mach/upl.h>
66 #include <default_pager/default_pager_internal.h>
67 #include <default_pager/default_pager_alerts.h>
68 #include <default_pager/default_pager_object_server.h>
69
70 #include <ipc/ipc_types.h>
71 #include <ipc/ipc_port.h>
72 #include <ipc/ipc_space.h>
73
74 #include <kern/kern_types.h>
75 #include <kern/host.h>
76 #include <kern/queue.h>
77 #include <kern/counters.h>
78 #include <kern/sched_prim.h>
79
80 #include <vm/vm_kern.h>
81 #include <vm/vm_pageout.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_protos.h>
85
86
87 /* todo - need large internal object support */
88
89 /*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, then this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
99
100 #define ALLOC_STRIDE (1024 * 1024 * 1024)
101 int physical_transfer_cluster_count = 0;
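/*
 * Illustrative sketch only (not compiled): how the stride translates into
 * clusters.  ps_select_segment() below moves on to the next paging segment
 * once roughly ALLOC_STRIDE bytes worth of clusters have been handed out
 * from the current one.  The helper name is invented; the comment's numbers
 * assume 4K pages (vm_page_shift == 12) and the default cluster shift of 2.
 */
#if 0
static unsigned int
clusters_per_stride(paging_segment_t ps)
{
	/* 1GB >> (2 + 12) == 65536 sixteen-KB clusters per stride */
	return ALLOC_STRIDE >> (ps->ps_clshift + vm_page_shift);
}
#endif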
102
103 #define VM_SUPER_CLUSTER 0x40000
104 #define VM_SUPER_PAGES (VM_SUPER_CLUSTER / PAGE_MIN_SIZE)
105
106 /*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
110 #define VSTRUCT_MIN_CLSHIFT 0
111
112 #define VSTRUCT_DEF_CLSHIFT 2
113 int default_pager_clsize = 0;
114
115 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
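/*
 * Worked example of the shift (sketch only, not compiled): with the default
 * VSTRUCT_DEF_CLSHIFT of 2 and 4K pages, a cluster is (1 << 2) == 4 pages,
 * i.e. 16KB of backing store.  The macro names below are illustrative only.
 */
#if 0
#define PAGES_PER_CLUSTER(clshift)	(1 << (clshift))
#define BYTES_PER_CLUSTER(clshift)	(PAGES_PER_CLUSTER(clshift) * vm_page_size)
#endif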
116
117 /* statistics */
118 unsigned int clustered_writes[VM_SUPER_PAGES+1];
119 unsigned int clustered_reads[VM_SUPER_PAGES+1];
120
121 /*
122 * Globals used for asynchronous paging operations:
123 * vs_async_list: head of list of to-be-completed I/O ops
124 * async_num_queued: number of pages completed, but not yet
125 * processed by async thread.
126 * async_requests_out: number of pages of requests not completed.
127 */
128
129 #if 0
130 struct vs_async *vs_async_list;
131 int async_num_queued;
132 int async_requests_out;
133 #endif
134
135
136 #define VS_ASYNC_REUSE 1
137 struct vs_async *vs_async_free_list;
138
139 lck_mtx_t default_pager_async_lock; /* Protects globals above */
140
141
142 int vs_alloc_async_failed = 0; /* statistics */
143 int vs_alloc_async_count = 0; /* statistics */
144 struct vs_async *vs_alloc_async(void); /* forward */
145 void vs_free_async(struct vs_async *vsa); /* forward */
146
147
148 #define VS_ALLOC_ASYNC() vs_alloc_async()
149 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
150
151 #define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock)
152 #define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock)
153 #define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
154 #define VS_ASYNC_LOCK_DESTROY() lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp)
155 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
156 /*
157 * Paging Space Hysteresis triggers and the target notification port
158 *
159 */
160 unsigned int dp_pages_free_drift_count = 0;
161 unsigned int dp_pages_free_drifted_max = 0;
162 unsigned int minimum_pages_remaining = 0;
163 unsigned int maximum_pages_free = 0;
164 ipc_port_t min_pages_trigger_port = NULL;
165 ipc_port_t max_pages_trigger_port = NULL;
166
167 #if CONFIG_FREEZE
168 boolean_t use_emergency_swap_file_first = TRUE;
169 #else
170 boolean_t use_emergency_swap_file_first = FALSE;
171 #endif
172 boolean_t bs_low = FALSE;
173 int backing_store_release_trigger_disable = 0;
174 boolean_t backing_store_stop_compaction = FALSE;
175 boolean_t backing_store_abort_compaction = FALSE;
176
177 /* Have we decided if swap needs to be encrypted yet ? */
178 boolean_t dp_encryption_inited = FALSE;
179 /* Should we encrypt swap ? */
180 boolean_t dp_encryption = FALSE;
181
182 boolean_t dp_isssd = FALSE;
183
184 /*
185 * Object sizes are rounded up to the next power of 2,
186 * unless they are bigger than a given maximum size.
187 */
188 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
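/*
 * A minimal sketch (not compiled) of the rounding policy stated above:
 * sizes are doubled up to the next power of 2 unless they already exceed
 * max_doubled_size, in which case they are left as-is.  The rounding itself
 * is done by the callers that consult this global; the helper name here is
 * illustrative only.
 */
#if 0
static vm_size_t
round_object_size(vm_size_t size)
{
	vm_size_t rounded = PAGE_SIZE;

	if (size > max_doubled_size)
		return size;
	while (rounded < size)
		rounded <<= 1;
	return rounded;
}
#endif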
189
190 /*
191 * List of all backing store and segments.
192 */
193 MACH_PORT_FACE emergency_segment_backing_store;
194 struct backing_store_list_head backing_store_list;
195 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
196 lck_mtx_t paging_segments_lock;
197 int paging_segment_max = 0;
198 int paging_segment_count = 0;
199 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
200
201
202 /*
203 * Total pages free in system
204 * This differs from clusters committed/available, which is a measure of the
205 * over-commitment of paging segments to backing store; that idea is
206 * likely to be deprecated.
207 */
208 unsigned int dp_pages_free = 0;
209 unsigned int dp_pages_reserve = 0;
210 unsigned int cluster_transfer_minimum = 100;
211
212 /*
213 * Trim state
214 */
215 struct ps_vnode_trim_data {
216 struct vnode *vp;
217 dp_offset_t offset;
218 dp_size_t length;
219 };
220
221 /* forward declarations */
222 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */
223 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */
224 default_pager_thread_t *get_read_buffer( void );
225 kern_return_t ps_vstruct_transfer_from_segment(
226 vstruct_t vs,
227 paging_segment_t segment,
228 upl_t upl);
229 kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
230 kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
231 kern_return_t vs_cluster_transfer(
232 vstruct_t vs,
233 dp_offset_t offset,
234 dp_size_t cnt,
235 upl_t upl);
236 vs_map_t vs_get_map_entry(
237 vstruct_t vs,
238 dp_offset_t offset);
239
240 kern_return_t
241 default_pager_backing_store_delete_internal( MACH_PORT_FACE );
242
243 static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
244 static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
245 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);
246
247 default_pager_thread_t *
248 get_read_buffer( void )
249 {
250 int i;
251
252 DPT_LOCK(dpt_lock);
253 while(TRUE) {
254 for (i=0; i<default_pager_internal_count; i++) {
255 if(dpt_array[i]->checked_out == FALSE) {
256 dpt_array[i]->checked_out = TRUE;
257 DPT_UNLOCK(dpt_lock);
258 return dpt_array[i];
259 }
260 }
261 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
262 }
263 }
264
265 void
266 bs_initialize(void)
267 {
268 int i;
269
270 /*
271 * List of all backing store.
272 */
273 BSL_LOCK_INIT();
274 queue_init(&backing_store_list.bsl_queue);
275 PSL_LOCK_INIT();
276
277 VS_ASYNC_LOCK_INIT();
278 #if VS_ASYNC_REUSE
279 vs_async_free_list = NULL;
280 #endif /* VS_ASYNC_REUSE */
281
282 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
283 clustered_writes[i] = 0;
284 clustered_reads[i] = 0;
285 }
286
287 }
288
289 /*
290 * When things do not quite work out...
291 */
292 void bs_no_paging_space(boolean_t); /* forward */
293
294 void
295 bs_no_paging_space(
296 boolean_t out_of_memory)
297 {
298
299 if (out_of_memory)
300 dprintf(("*** OUT OF MEMORY ***\n"));
301 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
302 }
303
304 void bs_more_space(int); /* forward */
305 void bs_commit(int); /* forward */
306
307 boolean_t user_warned = FALSE;
308 unsigned int clusters_committed = 0;
309 unsigned int clusters_available = 0;
310 unsigned int clusters_committed_peak = 0;
311
312 void
313 bs_more_space(
314 int nclusters)
315 {
316 BSL_LOCK();
317 /*
318 * Account for new paging space.
319 */
320 clusters_available += nclusters;
321
322 if (clusters_available >= clusters_committed) {
323 if (verbose && user_warned) {
324 printf("%s%s - %d excess clusters now.\n",
325 my_name,
326 "paging space is OK now",
327 clusters_available - clusters_committed);
328 user_warned = FALSE;
329 clusters_committed_peak = 0;
330 }
331 } else {
332 if (verbose && user_warned) {
333 printf("%s%s - still short of %d clusters.\n",
334 my_name,
335 "WARNING: paging space over-committed",
336 clusters_committed - clusters_available);
337 clusters_committed_peak -= nclusters;
338 }
339 }
340 BSL_UNLOCK();
341
342 return;
343 }
344
345 void
346 bs_commit(
347 int nclusters)
348 {
349 BSL_LOCK();
350 clusters_committed += nclusters;
351 if (clusters_committed > clusters_available) {
352 if (verbose && !user_warned) {
353 user_warned = TRUE;
354 printf("%s%s - short of %d clusters.\n",
355 my_name,
356 "WARNING: paging space over-committed",
357 clusters_committed - clusters_available);
358 }
359 if (clusters_committed > clusters_committed_peak) {
360 clusters_committed_peak = clusters_committed;
361 }
362 } else {
363 if (verbose && user_warned) {
364 printf("%s%s - was short of up to %d clusters.\n",
365 my_name,
366 "paging space is OK now",
367 clusters_committed_peak - clusters_available);
368 user_warned = FALSE;
369 clusters_committed_peak = 0;
370 }
371 }
372 BSL_UNLOCK();
373
374 return;
375 }
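/*
 * Worked example (sketch only, not compiled) of the accounting done by
 * bs_commit() and bs_more_space(), assuming verbose is set.  The function
 * name is illustrative only.
 */
#if 0
static void
bs_accounting_example(void)
{
	/* starting from clusters_available == 100, clusters_committed == 0 */
	bs_commit(120);		/* committed 120 > available 100: warns "short of 20 clusters" */
	bs_more_space(30);	/* available 130 >= committed 120: reports "10 excess clusters now" */
}
#endif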
376
377 int default_pager_info_verbose = 1;
378
379 void
380 bs_global_info(
381 uint64_t *totalp,
382 uint64_t *freep)
383 {
384 uint64_t pages_total, pages_free;
385 paging_segment_t ps;
386 int i;
387
388 PSL_LOCK();
389 pages_total = pages_free = 0;
390 for (i = 0; i <= paging_segment_max; i++) {
391 ps = paging_segments[i];
392 if (ps == PAGING_SEGMENT_NULL)
393 continue;
394
395 /*
396 * no need to lock: by the time this data
397 * gets back to any remote requestor it
398 * will be obsolete anyway
399 */
400 pages_total += ps->ps_pgnum;
401 pages_free += ps->ps_clcount << ps->ps_clshift;
402 DP_DEBUG(DEBUG_BS_INTERNAL,
403 ("segment #%d: %d total, %d free\n",
404 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
405 }
406 *totalp = pages_total;
407 *freep = pages_free;
408 if (verbose && user_warned && default_pager_info_verbose) {
409 if (clusters_available < clusters_committed) {
410 printf("%s %d clusters committed, %d available.\n",
411 my_name,
412 clusters_committed,
413 clusters_available);
414 }
415 }
416 PSL_UNLOCK();
417 }
418
419 backing_store_t backing_store_alloc(void); /* forward */
420
421 backing_store_t
422 backing_store_alloc(void)
423 {
424 backing_store_t bs;
425
426 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
427 if (bs == BACKING_STORE_NULL)
428 panic("backing_store_alloc: no memory");
429
430 BS_LOCK_INIT(bs);
431 bs->bs_port = MACH_PORT_NULL;
432 bs->bs_priority = 0;
433 bs->bs_clsize = 0;
434 bs->bs_pages_total = 0;
435 bs->bs_pages_in = 0;
436 bs->bs_pages_in_fail = 0;
437 bs->bs_pages_out = 0;
438 bs->bs_pages_out_fail = 0;
439
440 return bs;
441 }
442
443 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
444
445 /* Even in both the component space and external versions of this pager, */
446 /* backing_store_lookup will be called from tasks in the application space */
447 backing_store_t
448 backing_store_lookup(
449 MACH_PORT_FACE port)
450 {
451 backing_store_t bs;
452
453 /*
454 The port is currently backed with a vs structure in the alias field.
455 We could create an ISBS alias and a port_is_bs call, but frankly
456 I see no reason for the test; the bs->bs_port == port check below
457 will work properly on junk entries.
458
459 if ((port == MACH_PORT_NULL) || port_is_vs(port))
460 */
461 if (port == MACH_PORT_NULL)
462 return BACKING_STORE_NULL;
463
464 BSL_LOCK();
465 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
466 bs_links) {
467 BS_LOCK(bs);
468 if (bs->bs_port == port) {
469 BSL_UNLOCK();
470 /* Success, return it locked. */
471 return bs;
472 }
473 BS_UNLOCK(bs);
474 }
475 BSL_UNLOCK();
476 return BACKING_STORE_NULL;
477 }
478
479 void backing_store_add(backing_store_t); /* forward */
480
481 void
482 backing_store_add(
483 __unused backing_store_t bs)
484 {
485 // MACH_PORT_FACE port = bs->bs_port;
486 // MACH_PORT_FACE pset = default_pager_default_set;
487 kern_return_t kr = KERN_SUCCESS;
488
489 if (kr != KERN_SUCCESS)
490 panic("backing_store_add: add to set");
491
492 }
493
494 /*
495 * Set up the default cluster shift, but only if not already
496 * set and argument is within range.
497 */
498 boolean_t
499 bs_set_default_clsize(unsigned int npages)
500 {
501 switch(npages){
502 case 1:
503 case 2:
504 case 4:
505 case 8:
506 if (default_pager_clsize == 0) /* if not yet set */
507 vstruct_def_clshift = local_log2(npages);
508 return(TRUE);
509 }
510 return(FALSE);
511 }
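/*
 * Usage sketch (not compiled): the accepted cluster sizes and the shift
 * they produce, assuming the global cluster size has not been fixed yet.
 * The function name is illustrative only.
 */
#if 0
static void
bs_set_default_clsize_example(void)
{
	(void) bs_set_default_clsize(4);	/* TRUE: vstruct_def_clshift = local_log2(4) = 2 */
	(void) bs_set_default_clsize(3);	/* FALSE: only 1, 2, 4 or 8 pages are accepted */
}
#endif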
512
513 int bs_get_global_clsize(int clsize); /* forward */
514
515 int
516 bs_get_global_clsize(
517 int clsize)
518 {
519 int i;
520 memory_object_default_t dmm;
521 kern_return_t kr;
522
523 /*
524 * Only allow setting of cluster size once. If called
525 * with no cluster size (default), we use the compiled-in default
526 * for the duration. The same cluster size is used for all
527 * paging segments.
528 */
529 if (default_pager_clsize == 0) {
530 /*
531 * Keep the cluster size as a bit shift: the arithmetic is quicker
532 * and it is easier to keep at a power of 2.
533 */
534 if (clsize != NO_CLSIZE) {
535 for (i = 0; (1 << i) < clsize; i++);
536 if (i > MAX_CLUSTER_SHIFT)
537 i = MAX_CLUSTER_SHIFT;
538 vstruct_def_clshift = i;
539 }
540 default_pager_clsize = (1 << vstruct_def_clshift);
541
542 /*
543 * Let the user know the new (and definitive) cluster size.
544 */
545 if (verbose)
546 printf("%scluster size = %d page%s\n",
547 my_name, default_pager_clsize,
548 (default_pager_clsize == 1) ? "" : "s");
549
550 /*
551 * Let the kernel know too, in case it hasn't used the
552 * default value provided in main() yet.
553 */
554 dmm = default_pager_object;
555 clsize = default_pager_clsize * vm_page_size; /* in bytes */
556 kr = host_default_memory_manager(host_priv_self(),
557 &dmm,
558 clsize);
559 memory_object_default_deallocate(dmm);
560
561 if (kr != KERN_SUCCESS) {
562 panic("bs_get_global_cl_size:host_default_memory_manager");
563 }
564 if (dmm != default_pager_object) {
565 panic("bs_get_global_cl_size:there is another default pager");
566 }
567 }
568 ASSERT(default_pager_clsize > 0 &&
569 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
570
571 return default_pager_clsize;
572 }
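/*
 * Worked example (sketch only, not compiled): a first call with clsize == 7
 * pages walks the loop above to i == 3, the smallest shift with
 * (1 << i) >= 7, so the definitive cluster size becomes 8 pages.  Any later
 * call, whatever its argument, returns that already-fixed value.  The
 * function name is illustrative only.
 */
#if 0
static void
bs_get_global_clsize_example(void)
{
	int pages;

	pages = bs_get_global_clsize(7);	/* fixes the size: returns 8 */
	pages = bs_get_global_clsize(2);	/* already set: still returns 8 */
}
#endif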
573
574 kern_return_t
575 default_pager_backing_store_create(
576 memory_object_default_t pager,
577 int priority,
578 int clsize, /* in bytes */
579 MACH_PORT_FACE *backing_store)
580 {
581 backing_store_t bs;
582 MACH_PORT_FACE port;
583 // kern_return_t kr;
584 struct vstruct_alias *alias_struct;
585
586 if (pager != default_pager_object)
587 return KERN_INVALID_ARGUMENT;
588
589 bs = backing_store_alloc();
590 port = ipc_port_alloc_kernel();
591 ipc_port_make_send(port);
592 assert (port != IP_NULL);
593
594 DP_DEBUG(DEBUG_BS_EXTERNAL,
595 ("priority=%d clsize=%d bs_port=0x%x\n",
596 priority, clsize, (int) backing_store));
597
598 alias_struct = (struct vstruct_alias *)
599 kalloc(sizeof (struct vstruct_alias));
600 if(alias_struct != NULL) {
601 alias_struct->vs = (struct vstruct *)bs;
602 alias_struct->name = &default_pager_ops;
603 port->ip_alias = (uintptr_t) alias_struct;
604 }
605 else {
606 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
607
608 BS_LOCK_DESTROY(bs);
609 kfree(bs, sizeof (struct backing_store));
610
611 return KERN_RESOURCE_SHORTAGE;
612 }
613
614 bs->bs_port = port;
615 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
616 priority = BS_MAXPRI;
617 else if (priority == BS_NOPRI)
618 priority = BS_MAXPRI;
619 else
620 priority = BS_MINPRI;
621 bs->bs_priority = priority;
622
623 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
624
625 BSL_LOCK();
626 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
627 bs_links);
628 BSL_UNLOCK();
629
630 backing_store_add(bs);
631
632 *backing_store = port;
633 return KERN_SUCCESS;
634 }
635
636 kern_return_t
637 default_pager_backing_store_info(
638 MACH_PORT_FACE backing_store,
639 backing_store_flavor_t flavour,
640 backing_store_info_t info,
641 mach_msg_type_number_t *size)
642 {
643 backing_store_t bs;
644 backing_store_basic_info_t basic;
645 int i;
646 paging_segment_t ps;
647
648 if (flavour != BACKING_STORE_BASIC_INFO ||
649 *size < BACKING_STORE_BASIC_INFO_COUNT)
650 return KERN_INVALID_ARGUMENT;
651
652 basic = (backing_store_basic_info_t)info;
653 *size = BACKING_STORE_BASIC_INFO_COUNT;
654
655 VSTATS_LOCK(&global_stats.gs_lock);
656 basic->pageout_calls = global_stats.gs_pageout_calls;
657 basic->pagein_calls = global_stats.gs_pagein_calls;
658 basic->pages_in = global_stats.gs_pages_in;
659 basic->pages_out = global_stats.gs_pages_out;
660 basic->pages_unavail = global_stats.gs_pages_unavail;
661 basic->pages_init = global_stats.gs_pages_init;
662 basic->pages_init_writes= global_stats.gs_pages_init_writes;
663 VSTATS_UNLOCK(&global_stats.gs_lock);
664
665 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
666 return KERN_INVALID_ARGUMENT;
667
668 basic->bs_pages_total = bs->bs_pages_total;
669 PSL_LOCK();
670 bs->bs_pages_free = 0;
671 for (i = 0; i <= paging_segment_max; i++) {
672 ps = paging_segments[i];
673 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
674 PS_LOCK(ps);
675 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
676 PS_UNLOCK(ps);
677 }
678 }
679 PSL_UNLOCK();
680 basic->bs_pages_free = bs->bs_pages_free;
681 basic->bs_pages_in = bs->bs_pages_in;
682 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
683 basic->bs_pages_out = bs->bs_pages_out;
684 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
685
686 basic->bs_priority = bs->bs_priority;
687 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
688
689 BS_UNLOCK(bs);
690
691 return KERN_SUCCESS;
692 }
693
694 int ps_delete(paging_segment_t); /* forward */
695 boolean_t current_thread_aborted(void);
696
697 int
698 ps_delete(
699 paging_segment_t ps)
700 {
701 vstruct_t vs;
702 kern_return_t error = KERN_SUCCESS;
703 int vs_count;
704
705 VSL_LOCK(); /* get the lock on the list of vs's */
706
707 /* The lock relationship and sequence is fairly complicated */
708 /* this code looks at a live list, locking and unlocking the list */
709 /* as it traverses it. It depends on the locking behavior of */
710 /* default_pager_no_senders. no_senders always locks the vstruct */
711 /* targeted for removal before locking the vstruct list. However */
712 /* it will remove that member of the list without locking its */
713 /* neighbors. We can be sure when we hold a lock on a vstruct */
714 /* it cannot be removed from the list but we must hold the list */
715 /* lock to be sure that its pointers to its neighbors are valid. */
716 /* Also, we can hold off destruction of a vstruct when the list */
717 /* lock and the vs locks are not being held by bumping the */
718 /* vs_async_pending count. */
719
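	/*
	 * The pattern described above, distilled into a sketch (not compiled):
	 * bump vs_async_pending while the locks are held so the vstruct cannot
	 * be torn down, drop the locks to do the transfer, then retake the vs
	 * lock, drop the reference and wake any waiter.  This mirrors the loop
	 * below.
	 */
#if 0
	VS_LOCK(vs);
	vs_async_wait(vs);		/* drain pending async writes */
	vs->vs_async_pending += 1;	/* pin the vstruct */
	VS_UNLOCK(vs);
	VSL_UNLOCK();

	/* ... transfer clusters out of the segment being deleted ... */

	VS_LOCK(vs);
	vs->vs_async_pending -= 1;
	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
		vs->vs_waiting_async = FALSE;
		VS_UNLOCK(vs);
		thread_wakeup(&vs->vs_async_pending);
	} else {
		VS_UNLOCK(vs);
	}
#endif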
720
721 while(backing_store_release_trigger_disable != 0) {
722 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
723 }
724
725 /* we will choose instead to hold a send right */
726 vs_count = vstruct_list.vsl_count;
727 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
728 if(vs == (vstruct_t)&vstruct_list) {
729 VSL_UNLOCK();
730 return KERN_SUCCESS;
731 }
732 VS_LOCK(vs);
733 vs_async_wait(vs); /* wait for any pending async writes */
734 if ((vs_count != 0) && (vs != NULL))
735 vs->vs_async_pending += 1; /* hold parties calling */
736 /* vs_async_wait */
737
738 if (bs_low == FALSE)
739 backing_store_abort_compaction = FALSE;
740
741 VS_UNLOCK(vs);
742 VSL_UNLOCK();
743 while((vs_count != 0) && (vs != NULL)) {
744 /* We take the count of AMO's before beginning the */
745 /* transfer of the target segment. */
746 /* We are guaranteed that the target segment cannot get */
747 /* more users. We also know that queue entries are */
748 /* made at the back of the list. If some of the entries */
749 /* we would check disappear while we are traversing the */
750 /* list then we will either check new entries which */
751 /* do not have any backing store in the target segment */
752 /* or re-check old entries. This might not be optimal */
753 /* but it will always be correct. The alternative is to */
754 /* take a snapshot of the list. */
755 vstruct_t next_vs;
756
757 if(dp_pages_free < cluster_transfer_minimum)
758 error = KERN_FAILURE;
759 else {
760 vm_object_t transfer_object;
761 unsigned int count;
762 upl_t upl;
763 upl_control_flags_t upl_flags;
764
765 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
766 count = 0;
767 upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
768 UPL_SET_LITE | UPL_SET_INTERNAL);
769 if (dp_encryption) {
770 /* mark the pages as "encrypted" when they come in */
771 upl_flags |= UPL_ENCRYPT;
772 }
773 error = vm_object_upl_request(transfer_object,
774 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
775 &upl, NULL, &count, upl_flags);
776
777 if(error == KERN_SUCCESS) {
778 error = ps_vstruct_transfer_from_segment(
779 vs, ps, upl);
780 upl_commit(upl, NULL, 0);
781 upl_deallocate(upl);
782 } else {
783 error = KERN_FAILURE;
784 }
785 vm_object_deallocate(transfer_object);
786 }
787 if(error || current_thread_aborted()) {
788 VS_LOCK(vs);
789 vs->vs_async_pending -= 1; /* release vs_async_wait */
790 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
791 vs->vs_waiting_async = FALSE;
792 VS_UNLOCK(vs);
793 thread_wakeup(&vs->vs_async_pending);
794 } else {
795 VS_UNLOCK(vs);
796 }
797 return KERN_FAILURE;
798 }
799
800 VSL_LOCK();
801
802 while(backing_store_release_trigger_disable != 0) {
803 VSL_SLEEP(&backing_store_release_trigger_disable,
804 THREAD_UNINT);
805 }
806
807 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
808 if((next_vs != (vstruct_t)&vstruct_list) &&
809 (vs != next_vs) && (vs_count != 1)) {
810 VS_LOCK(next_vs);
811 vs_async_wait(next_vs); /* wait for any */
812 /* pending async writes */
813 next_vs->vs_async_pending += 1; /* hold parties */
814 /* calling vs_async_wait */
815 VS_UNLOCK(next_vs);
816 }
817 VSL_UNLOCK();
818 VS_LOCK(vs);
819 vs->vs_async_pending -= 1;
820 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
821 vs->vs_waiting_async = FALSE;
822 VS_UNLOCK(vs);
823 thread_wakeup(&vs->vs_async_pending);
824 } else {
825 VS_UNLOCK(vs);
826 }
827 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
828 vs = NULL;
829 else
830 vs = next_vs;
831 vs_count--;
832 }
833 return KERN_SUCCESS;
834 }
835
836
837 kern_return_t
838 default_pager_backing_store_delete_internal(
839 MACH_PORT_FACE backing_store)
840 {
841 backing_store_t bs;
842 int i;
843 paging_segment_t ps;
844 int error;
845 int interim_pages_removed = 0;
846 boolean_t dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
847
848 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
849 return KERN_INVALID_ARGUMENT;
850
851 restart:
852 PSL_LOCK();
853 error = KERN_SUCCESS;
854 for (i = 0; i <= paging_segment_max; i++) {
855 ps = paging_segments[i];
856 if (ps != PAGING_SEGMENT_NULL &&
857 ps->ps_bs == bs &&
858 ! IS_PS_GOING_AWAY(ps)) {
859 PS_LOCK(ps);
860
861 if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
862 /*
863 * Someone is already busy reclaiming this paging segment.
864 * If it's the emergency segment we are looking at then check
865 * that someone has not already recovered it and set the right
866 * state i.e. online but not activated.
867 */
868 PS_UNLOCK(ps);
869 continue;
870 }
871
872 /* disable access to this segment */
873 ps->ps_state &= ~PS_CAN_USE;
874 ps->ps_state |= PS_GOING_AWAY;
875 PS_UNLOCK(ps);
876 /*
877 * The "ps" segment is "off-line" now,
878 * we can try and delete it...
879 */
880 if(dp_pages_free < (cluster_transfer_minimum
881 + ps->ps_pgcount)) {
882 error = KERN_FAILURE;
883 PSL_UNLOCK();
884 }
885 else {
886 /* remove all pages associated with the */
887 /* segment from the list of free pages */
888 /* when transfer is through, all target */
889 /* segment pages will appear to be free */
890
891 dp_pages_free -= ps->ps_pgcount;
892 interim_pages_removed += ps->ps_pgcount;
893 PSL_UNLOCK();
894 error = ps_delete(ps);
895 }
896 if (error != KERN_SUCCESS) {
897 /*
898 * We couldn't delete the segment,
899 * probably because there's not enough
900 * virtual memory left.
901 * Re-enable all the segments.
902 */
903 PSL_LOCK();
904 break;
905 }
906 goto restart;
907 }
908 }
909
910 if (error != KERN_SUCCESS) {
911 for (i = 0; i <= paging_segment_max; i++) {
912 ps = paging_segments[i];
913 if (ps != PAGING_SEGMENT_NULL &&
914 ps->ps_bs == bs &&
915 IS_PS_GOING_AWAY(ps)) {
916 PS_LOCK(ps);
917
918 if( !IS_PS_GOING_AWAY(ps)) {
919 PS_UNLOCK(ps);
920 continue;
921 }
922 /* Handle the special clusters that came in while we let go of the lock */
923 if( ps->ps_special_clusters) {
924 dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
925 ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
926 ps->ps_clcount += ps->ps_special_clusters;
927 if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
928 ps_select_array[ps->ps_bs->bs_priority] = 0;
929 }
930 ps->ps_special_clusters = 0;
931 }
932 /* re-enable access to this segment */
933 ps->ps_state &= ~PS_GOING_AWAY;
934 ps->ps_state |= PS_CAN_USE;
935 PS_UNLOCK(ps);
936 }
937 }
938 dp_pages_free += interim_pages_removed;
939 PSL_UNLOCK();
940 BS_UNLOCK(bs);
941 return error;
942 }
943
944 for (i = 0; i <= paging_segment_max; i++) {
945 ps = paging_segments[i];
946 if (ps != PAGING_SEGMENT_NULL &&
947 ps->ps_bs == bs) {
948 if(IS_PS_GOING_AWAY(ps)) {
949 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
950 PS_LOCK(ps);
951 ps->ps_state &= ~PS_GOING_AWAY;
952 ps->ps_special_clusters = 0;
953 ps->ps_pgcount = ps->ps_pgnum;
954 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
955 dp_pages_reserve += ps->ps_pgcount;
956 PS_UNLOCK(ps);
957 } else {
958 paging_segments[i] = PAGING_SEGMENT_NULL;
959 paging_segment_count--;
960 PS_LOCK(ps);
961 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
962 kfree(ps, sizeof *ps);
963 }
964 }
965 }
966 }
967
968 /* Scan the entire ps array separately to make certain we find the */
969 /* proper paging_segment_max */
970 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
971 if(paging_segments[i] != PAGING_SEGMENT_NULL)
972 paging_segment_max = i;
973 }
974
975 PSL_UNLOCK();
976
977 if( dealing_with_emergency_segment ) {
978 BS_UNLOCK(bs);
979 return KERN_SUCCESS;
980 }
981
982 /*
983 * All the segments have been deleted.
984 * We can remove the backing store.
985 */
986
987 /*
988 * Disable lookups of this backing store.
989 */
990 if((void *)bs->bs_port->ip_alias != NULL)
991 kfree((void *) bs->bs_port->ip_alias,
992 sizeof (struct vstruct_alias));
993 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
994 bs->bs_port = MACH_PORT_NULL;
995 BS_UNLOCK(bs);
996
997 /*
998 * Remove backing store from backing_store list.
999 */
1000 BSL_LOCK();
1001 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
1002 bs_links);
1003 BSL_UNLOCK();
1004
1005 /*
1006 * Free the backing store structure.
1007 */
1008 BS_LOCK_DESTROY(bs);
1009 kfree(bs, sizeof *bs);
1010
1011 return KERN_SUCCESS;
1012 }
1013
1014 kern_return_t
1015 default_pager_backing_store_delete(
1016 MACH_PORT_FACE backing_store)
1017 {
1018 if( backing_store != emergency_segment_backing_store ) {
1019 default_pager_backing_store_delete_internal(emergency_segment_backing_store);
1020 }
1021 return(default_pager_backing_store_delete_internal(backing_store));
1022 }
1023
1024 int ps_enter(paging_segment_t); /* forward */
1025
1026 int
1027 ps_enter(
1028 paging_segment_t ps)
1029 {
1030 int i;
1031
1032 PSL_LOCK();
1033
1034 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
1035 if (paging_segments[i] == PAGING_SEGMENT_NULL)
1036 break;
1037 }
1038
1039 if (i < MAX_NUM_PAGING_SEGMENTS) {
1040 paging_segments[i] = ps;
1041 if (i > paging_segment_max)
1042 paging_segment_max = i;
1043 paging_segment_count++;
1044 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1045 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1046 ps_select_array[ps->ps_bs->bs_priority] = 0;
1047 i = 0;
1048 } else {
1049 PSL_UNLOCK();
1050 return KERN_RESOURCE_SHORTAGE;
1051 }
1052
1053 PSL_UNLOCK();
1054 return i;
1055 }
1056
1057 #ifdef DEVICE_PAGING
1058 kern_return_t
1059 default_pager_add_segment(
1060 MACH_PORT_FACE backing_store,
1061 MACH_PORT_FACE device,
1062 recnum_t offset,
1063 recnum_t count,
1064 int record_size)
1065 {
1066 backing_store_t bs;
1067 paging_segment_t ps;
1068 int i;
1069 int error;
1070
1071 if ((bs = backing_store_lookup(backing_store))
1072 == BACKING_STORE_NULL)
1073 return KERN_INVALID_ARGUMENT;
1074
1075 PSL_LOCK();
1076 for (i = 0; i <= paging_segment_max; i++) {
1077 ps = paging_segments[i];
1078 if (ps == PAGING_SEGMENT_NULL)
1079 continue;
1080
1081 /*
1082 * Check for overlap on the same device: reject if the [offset, offset + count) range intersects this segment's records.
1083 */
1084 if (!(ps->ps_device != device
1085 || offset >= ps->ps_offset + ps->ps_recnum
1086 || offset + count <= ps->ps_offset)) {
1087 PSL_UNLOCK();
1088 BS_UNLOCK(bs);
1089 return KERN_INVALID_ARGUMENT;
1090 }
1091 }
1092 PSL_UNLOCK();
1093
1094 /*
1095 * Set up the paging segment
1096 */
1097 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1098 if (ps == PAGING_SEGMENT_NULL) {
1099 BS_UNLOCK(bs);
1100 return KERN_RESOURCE_SHORTAGE;
1101 }
1102
1103 ps->ps_segtype = PS_PARTITION;
1104 ps->ps_device = device;
1105 ps->ps_offset = offset;
1106 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1107 ps->ps_recnum = count;
1108 ps->ps_pgnum = count >> ps->ps_record_shift;
1109
1110 ps->ps_pgcount = ps->ps_pgnum;
1111 ps->ps_clshift = local_log2(bs->bs_clsize);
1112 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1113 ps->ps_hint = 0;
1114
1115 PS_LOCK_INIT(ps);
1116 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1117 if (!ps->ps_bmap) {
1118 PS_LOCK_DESTROY(ps);
1119 kfree(ps, sizeof *ps);
1120 BS_UNLOCK(bs);
1121 return KERN_RESOURCE_SHORTAGE;
1122 }
1123 for (i = 0; i < ps->ps_ncls; i++) {
1124 clrbit(ps->ps_bmap, i);
1125 }
1126
1127 if(paging_segment_count == 0) {
1128 ps->ps_state = PS_EMERGENCY_SEGMENT;
1129 if(use_emergency_swap_file_first) {
1130 ps->ps_state |= PS_CAN_USE;
1131 }
1132 } else {
1133 ps->ps_state = PS_CAN_USE;
1134 }
1135
1136 ps->ps_bs = bs;
1137
1138 if ((error = ps_enter(ps)) != 0) {
1139 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1140
1141 PS_LOCK_DESTROY(ps);
1142 kfree(ps, sizeof *ps);
1143 BS_UNLOCK(bs);
1144 return KERN_RESOURCE_SHORTAGE;
1145 }
1146
1147 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1148 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1149 BS_UNLOCK(bs);
1150
1151 PSL_LOCK();
1152 if(IS_PS_OK_TO_USE(ps)) {
1153 dp_pages_free += ps->ps_pgcount;
1154 } else {
1155 dp_pages_reserve += ps->ps_pgcount;
1156 }
1157 PSL_UNLOCK();
1158
1159 bs_more_space(ps->ps_clcount);
1160
1161 DP_DEBUG(DEBUG_BS_INTERNAL,
1162 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1163 device, offset, count, record_size,
1164 ps->ps_record_shift, ps->ps_pgnum));
1165
1166 return KERN_SUCCESS;
1167 }
1168
1169 boolean_t
1170 bs_add_device(
1171 char *dev_name,
1172 MACH_PORT_FACE master)
1173 {
1174 security_token_t null_security_token = {
1175 { 0, 0 }
1176 };
1177 MACH_PORT_FACE device;
1178 int info[DEV_GET_SIZE_COUNT];
1179 mach_msg_type_number_t info_count;
1180 MACH_PORT_FACE bs = MACH_PORT_NULL;
1181 unsigned int rec_size;
1182 recnum_t count;
1183 int clsize;
1184 MACH_PORT_FACE reply_port;
1185
1186 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1187 null_security_token, dev_name, &device))
1188 return FALSE;
1189
1190 info_count = DEV_GET_SIZE_COUNT;
1191 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1192 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1193 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1194 clsize = bs_get_global_clsize(0);
1195 if (!default_pager_backing_store_create(
1196 default_pager_object,
1197 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1198 (clsize * vm_page_size),
1199 &bs)) {
1200 if (!default_pager_add_segment(bs, device,
1201 0, count, rec_size)) {
1202 return TRUE;
1203 }
1204 ipc_port_release_receive(bs);
1205 }
1206 }
1207
1208 ipc_port_release_send(device);
1209 return FALSE;
1210 }
1211 #endif /* DEVICE_PAGING */
1212
1213 #if VS_ASYNC_REUSE
1214
1215 struct vs_async *
1216 vs_alloc_async(void)
1217 {
1218 struct vs_async *vsa;
1219 MACH_PORT_FACE reply_port;
1220 // kern_return_t kr;
1221
1222 VS_ASYNC_LOCK();
1223 if (vs_async_free_list == NULL) {
1224 VS_ASYNC_UNLOCK();
1225 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1226 if (vsa != NULL) {
1227 /*
1228 * Try allocating a reply port named after the
1229 * address of the vs_async structure.
1230 */
1231 struct vstruct_alias *alias_struct;
1232
1233 reply_port = ipc_port_alloc_kernel();
1234 alias_struct = (struct vstruct_alias *)
1235 kalloc(sizeof (struct vstruct_alias));
1236 if(alias_struct != NULL) {
1237 __IGNORE_WCASTALIGN(alias_struct->vs = (struct vstruct *)vsa);
1238 alias_struct->name = &default_pager_ops;
1239 reply_port->ip_alias = (uintptr_t) alias_struct;
1240 vsa->reply_port = reply_port;
1241 vs_alloc_async_count++;
1242 }
1243 else {
1244 vs_alloc_async_failed++;
1245 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1246 (reply_port));
1247 kfree(vsa, sizeof (struct vs_async));
1248 vsa = NULL;
1249 }
1250 }
1251 } else {
1252 vsa = vs_async_free_list;
1253 vs_async_free_list = vs_async_free_list->vsa_next;
1254 VS_ASYNC_UNLOCK();
1255 }
1256
1257 return vsa;
1258 }
1259
1260 void
1261 vs_free_async(
1262 struct vs_async *vsa)
1263 {
1264 VS_ASYNC_LOCK();
1265 vsa->vsa_next = vs_async_free_list;
1266 vs_async_free_list = vsa;
1267 VS_ASYNC_UNLOCK();
1268 }
1269
1270 #else /* VS_ASYNC_REUSE */
1271
1272 struct vs_async *
1273 vs_alloc_async(void)
1274 {
1275 struct vs_async *vsa;
1276 MACH_PORT_FACE reply_port;
1277 kern_return_t kr; struct vstruct_alias *alias_struct;
1278
1279 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1280 if (vsa != NULL) {
1281 /*
1282 * Try allocating a reply port named after the
1283 * address of the vs_async structure.
1284 */
1285 reply_port = ipc_port_alloc_kernel();
1286 alias_struct = (struct vstruct_alias *)
1287 kalloc(sizeof (struct vstruct_alias));
1288 if(alias_struct != NULL) {
1289 alias_struct->vs = (struct vstruct *)vsa;
1290 alias_struct->name = &default_pager_ops;
1291 reply_port->ip_alias = (uintptr_t) alias_struct;
1292 vsa->reply_port = reply_port;
1293 vs_alloc_async_count++;
1294 }
1295 else {
1296 vs_alloc_async_failed++;
1297 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1298 (reply_port));
1299 kfree(vsa, sizeof (struct vs_async));
1300 vsa = NULL;
1301 }
1302 }
1303
1304 return vsa;
1305 }
1306
1307 void
1308 vs_free_async(
1309 struct vs_async *vsa)
1310 {
1311 MACH_PORT_FACE reply_port;
1312 kern_return_t kr;
1313
1314 reply_port = vsa->reply_port;
1315 kfree((void *) reply_port->ip_alias, sizeof (struct vstruct_alias));
1316 kfree(vsa, sizeof (struct vs_async));
1317 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1318 #if 0
1319 VS_ASYNC_LOCK();
1320 vs_alloc_async_count--;
1321 VS_ASYNC_UNLOCK();
1322 #endif
1323 }
1324
1325 #endif /* VS_ASYNC_REUSE */
1326
1327 zone_t vstruct_zone;
1328
1329 vstruct_t
1330 ps_vstruct_create(
1331 dp_size_t size)
1332 {
1333 vstruct_t vs;
1334 unsigned int i;
1335
1336 vs = (vstruct_t) zalloc(vstruct_zone);
1337 if (vs == VSTRUCT_NULL) {
1338 return VSTRUCT_NULL;
1339 }
1340
1341 VS_LOCK_INIT(vs);
1342
1343 /*
1344 * The following fields will be provided later.
1345 */
1346 vs->vs_pager_ops = NULL;
1347 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1348 vs->vs_references = 1;
1349 vs->vs_seqno = 0;
1350
1351 vs->vs_waiting_seqno = FALSE;
1352 vs->vs_waiting_read = FALSE;
1353 vs->vs_waiting_write = FALSE;
1354 vs->vs_waiting_async = FALSE;
1355
1356 vs->vs_readers = 0;
1357 vs->vs_writers = 0;
1358
1359 vs->vs_errors = 0;
1360
1361 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1362 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1363 vs->vs_async_pending = 0;
1364
1365 /*
1366 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1367 * depending on the size of the memory object.
1368 */
1369 if (INDIRECT_CLMAP(vs->vs_size)) {
1370 vs->vs_imap = (struct vs_map **)
1371 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1372 vs->vs_indirect = TRUE;
1373 } else {
1374 vs->vs_dmap = (struct vs_map *)
1375 kalloc(CLMAP_SIZE(vs->vs_size));
1376 vs->vs_indirect = FALSE;
1377 }
1378 vs->vs_xfer_pending = FALSE;
1379 DP_DEBUG(DEBUG_VS_INTERNAL,
1380 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1381
1382 /*
1383 * Check to see that we got the space.
1384 */
1385 if (!vs->vs_dmap) {
1386 kfree(vs, sizeof *vs);
1387 return VSTRUCT_NULL;
1388 }
1389
1390 /*
1391 * Zero the indirect pointers, or clear the direct pointers.
1392 */
1393 if (vs->vs_indirect)
1394 memset(vs->vs_imap, 0,
1395 INDIRECT_CLMAP_SIZE(vs->vs_size));
1396 else
1397 for (i = 0; i < vs->vs_size; i++)
1398 VSM_CLR(vs->vs_dmap[i]);
1399
1400 VS_MAP_LOCK_INIT(vs);
1401
1402 bs_commit(vs->vs_size);
1403
1404 return vs;
1405 }
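/*
 * Worked example (sketch only, not compiled) of the vs_size computation
 * above, assuming 4K pages and the default cluster shift of 2: a 1MB object
 * is 256 pages, so vs_size = ((256 - 1) >> 2) + 1 = 64 clusters of 16KB
 * each.  The function name is illustrative only.
 */
#if 0
static void
vstruct_size_example(void)
{
	vstruct_t vs;

	vs = ps_vstruct_create((dp_size_t)(1024 * 1024));
	/* with the assumptions above: vs->vs_clshift == 2, vs->vs_size == 64 */
}
#endif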
1406
1407 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1408
1409 paging_segment_t
1410 ps_select_segment(
1411 unsigned int shift,
1412 int *psindex)
1413 {
1414 paging_segment_t ps;
1415 int i;
1416 int j;
1417
1418 /*
1419 * Optimize case where there's only one segment.
1420 * paging_segment_max will index the one and only segment.
1421 */
1422
1423 PSL_LOCK();
1424 if (paging_segment_count == 1) {
1425 paging_segment_t lps = PAGING_SEGMENT_NULL; /* used to avoid extra PS_UNLOCK */
1426 ipc_port_t trigger = IP_NULL;
1427
1428 ps = paging_segments[paging_segment_max];
1429 *psindex = paging_segment_max;
1430 PS_LOCK(ps);
1431 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1432 panic("Emergency paging segment missing\n");
1433 }
1434 ASSERT(ps->ps_clshift >= shift);
1435 if(IS_PS_OK_TO_USE(ps)) {
1436 if (ps->ps_clcount) {
1437 ps->ps_clcount--;
1438 dp_pages_free -= 1 << ps->ps_clshift;
1439 ps->ps_pgcount -= 1 << ps->ps_clshift;
1440 if(min_pages_trigger_port &&
1441 (dp_pages_free < minimum_pages_remaining)) {
1442 trigger = min_pages_trigger_port;
1443 min_pages_trigger_port = NULL;
1444 bs_low = TRUE;
1445 backing_store_abort_compaction = TRUE;
1446 }
1447 lps = ps;
1448 }
1449 }
1450 PS_UNLOCK(ps);
1451
1452 if( lps == PAGING_SEGMENT_NULL ) {
1453 if(dp_pages_free) {
1454 dp_pages_free_drift_count++;
1455 if(dp_pages_free > dp_pages_free_drifted_max) {
1456 dp_pages_free_drifted_max = dp_pages_free;
1457 }
1458 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1459 }
1460 dp_pages_free = 0;
1461 }
1462
1463 PSL_UNLOCK();
1464
1465 if (trigger != IP_NULL) {
1466 dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1467
1468 default_pager_space_alert(trigger, HI_WAT_ALERT);
1469 ipc_port_release_send(trigger);
1470 }
1471 return lps;
1472 }
1473
1474 if (paging_segment_count == 0) {
1475 if(dp_pages_free) {
1476 dp_pages_free_drift_count++;
1477 if(dp_pages_free > dp_pages_free_drifted_max) {
1478 dp_pages_free_drifted_max = dp_pages_free;
1479 }
1480 dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1481 }
1482 dp_pages_free = 0;
1483 PSL_UNLOCK();
1484 return PAGING_SEGMENT_NULL;
1485 }
1486
1487 for (i = BS_MAXPRI;
1488 i >= BS_MINPRI; i--) {
1489 int start_index;
1490
1491 if ((ps_select_array[i] == BS_NOPRI) ||
1492 (ps_select_array[i] == BS_FULLPRI))
1493 continue;
1494 start_index = ps_select_array[i];
1495
1496 if(!(paging_segments[start_index])) {
1497 j = start_index+1;
1498 physical_transfer_cluster_count = 0;
1499 }
1500 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1501 (((paging_segments[start_index])->ps_clshift)
1502 + vm_page_shift))) {
1503 physical_transfer_cluster_count = 0;
1504 j = start_index + 1;
1505 } else {
1506 physical_transfer_cluster_count+=1;
1507 j = start_index;
1508 if(start_index == 0)
1509 start_index = paging_segment_max;
1510 else
1511 start_index = start_index - 1;
1512 }
1513
1514 while (1) {
1515 if (j > paging_segment_max)
1516 j = 0;
1517 if ((ps = paging_segments[j]) &&
1518 (ps->ps_bs->bs_priority == i)) {
1519 /*
1520 * Force the ps cluster size to be
1521 * >= that of the vstruct.
1522 */
1523 PS_LOCK(ps);
1524 if (IS_PS_OK_TO_USE(ps)) {
1525 if ((ps->ps_clcount) &&
1526 (ps->ps_clshift >= shift)) {
1527 ipc_port_t trigger = IP_NULL;
1528
1529 ps->ps_clcount--;
1530 dp_pages_free -= 1 << ps->ps_clshift;
1531 ps->ps_pgcount -= 1 << ps->ps_clshift;
1532 if(min_pages_trigger_port &&
1533 (dp_pages_free <
1534 minimum_pages_remaining)) {
1535 trigger = min_pages_trigger_port;
1536 min_pages_trigger_port = NULL;
1537 bs_low = TRUE;
1538 backing_store_abort_compaction = TRUE;
1539 }
1540 PS_UNLOCK(ps);
1541 /*
1542 * found one, quit looking.
1543 */
1544 ps_select_array[i] = j;
1545 PSL_UNLOCK();
1546
1547 if (trigger != IP_NULL) {
1548 dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1549
1550 default_pager_space_alert(
1551 trigger,
1552 HI_WAT_ALERT);
1553 ipc_port_release_send(trigger);
1554 }
1555 *psindex = j;
1556 return ps;
1557 }
1558 }
1559 PS_UNLOCK(ps);
1560 }
1561 if (j == start_index) {
1562 /*
1563 * none at this priority -- mark it full
1564 */
1565 ps_select_array[i] = BS_FULLPRI;
1566 break;
1567 }
1568 j++;
1569 }
1570 }
1571
1572 if(dp_pages_free) {
1573 dp_pages_free_drift_count++;
1574 if(dp_pages_free > dp_pages_free_drifted_max) {
1575 dp_pages_free_drifted_max = dp_pages_free;
1576 }
1577 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1578 }
1579 dp_pages_free = 0;
1580 PSL_UNLOCK();
1581 return PAGING_SEGMENT_NULL;
1582 }
1583
1584 dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1585
1586 dp_offset_t
1587 ps_allocate_cluster(
1588 vstruct_t vs,
1589 int *psindex,
1590 paging_segment_t use_ps)
1591 {
1592 unsigned int byte_num;
1593 int bit_num = 0;
1594 paging_segment_t ps;
1595 dp_offset_t cluster;
1596 ipc_port_t trigger = IP_NULL;
1597
1598 /*
1599 * Find best paging segment.
1600 * ps_select_segment will decrement cluster count on ps.
1601 * Must pass cluster shift to find the most appropriate segment.
1602 */
1603 /* NOTE: The addition of paging segment delete capability threatened
1604 * to seriously complicate the treatment of paging segments in this
1605 * module and the ones that call it (notably ps_clmap), because of the
1606 * difficulty in assuring that the paging segment would continue to
1607 * exist between being unlocked and locked. This was
1608 * avoided because all calls to this module are based in either
1609 * dp_memory_object calls which rely on the vs lock, or by
1610 * the transfer function which is part of the segment delete path.
1611 * The transfer function which is part of paging segment delete is
1612 * protected from multiple callers by the backing store lock.
1613 * The paging segment delete function treats mappings to a paging
1614 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1615 * while data is transferred to the remaining segments. This is in
1616 * line with the view that incomplete or in-transition mappings between
1617 * data, a vstruct, and backing store are protected by the vs lock.
1618 * This and the ordering of the paging segment "going_away" bit setting
1619 * protects us.
1620 */
1621 retry:
1622 if (use_ps != PAGING_SEGMENT_NULL) {
1623 ps = use_ps;
1624 PSL_LOCK();
1625 PS_LOCK(ps);
1626
1627 ASSERT(ps->ps_clcount != 0);
1628
1629 ps->ps_clcount--;
1630 dp_pages_free -= 1 << ps->ps_clshift;
1631 ps->ps_pgcount -= 1 << ps->ps_clshift;
1632 if(min_pages_trigger_port &&
1633 (dp_pages_free < minimum_pages_remaining)) {
1634 trigger = min_pages_trigger_port;
1635 min_pages_trigger_port = NULL;
1636 bs_low = TRUE;
1637 backing_store_abort_compaction = TRUE;
1638 }
1639 PSL_UNLOCK();
1640 PS_UNLOCK(ps);
1641 if (trigger != IP_NULL) {
1642 dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1643
1644 default_pager_space_alert(trigger, HI_WAT_ALERT);
1645 ipc_port_release_send(trigger);
1646 }
1647
1648 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1649 PAGING_SEGMENT_NULL) {
1650 static clock_sec_t lastnotify = 0;
1651 clock_sec_t now;
1652 clock_nsec_t nanoseconds_dummy;
1653
1654 /*
1655 * Don't immediately jump to the emergency segment. Give the
1656 * dynamic pager a chance to create its first normal swap file.
1657 * Unless, of course, the very first normal swap file can't be
1658 * created due to some problem we didn't expect, i.e.
1659 * use_emergency_swap_file_first was never set to TRUE initially;
1660 * it then gets set in the swap file creation error handling.
1661 */
1662 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1663
1664 ps = paging_segments[EMERGENCY_PSEG_INDEX];
1665 if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1666 PSL_LOCK();
1667 PS_LOCK(ps);
1668
1669 if(IS_PS_GOING_AWAY(ps)) {
1670 /* Someone de-activated the emergency paging segment*/
1671 PS_UNLOCK(ps);
1672 PSL_UNLOCK();
1673
1674 } else if(dp_pages_free) {
1675 /*
1676 * Someone has already activated the emergency paging segment
1677 * OR
1678 * Between us having received a NULL segment from ps_select_segment
1679 * and reaching here a new normal segment could have been added.
1680 * E.g. we get NULL segment and another thread just added the
1681 * new swap file. Hence check to see if we have more dp_pages_free
1682 * before activating the emergency segment.
1683 */
1684 PS_UNLOCK(ps);
1685 PSL_UNLOCK();
1686 goto retry;
1687
1688 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1689 /*
1690 * PS_CAN_USE is only reset from the emergency segment when it's
1691 * been successfully recovered. So it's legal to have an emergency
1692 * segment that has PS_CAN_USE but no clusters because its recovery
1693 * failed.
1694 */
1695 backing_store_t bs = ps->ps_bs;
1696 ps->ps_state |= PS_CAN_USE;
1697 if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1698 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1699 ps_select_array[bs->bs_priority] = 0;
1700 }
1701 dp_pages_free += ps->ps_pgcount;
1702 dp_pages_reserve -= ps->ps_pgcount;
1703 PS_UNLOCK(ps);
1704 PSL_UNLOCK();
1705 dprintf(("Switching ON Emergency paging segment\n"));
1706 goto retry;
1707 }
1708
1709 PS_UNLOCK(ps);
1710 PSL_UNLOCK();
1711 }
1712 }
1713
1714 /*
1715 * Emit a notification of the low-paging resource condition
1716 * but don't issue it more than once every five seconds. This
1717 * prevents us from overflowing logs with thousands of
1718 * repetitions of the message.
1719 */
1720 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1721 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1722 /* With an activated emergency paging segment we still
1723 * didn't get any clusters. This could mean that the
1724 * emergency paging segment is exhausted.
1725 */
1726 dprintf(("System is out of paging space.\n"));
1727 lastnotify = now;
1728 }
1729
1730 PSL_LOCK();
1731
1732 if(min_pages_trigger_port) {
1733 trigger = min_pages_trigger_port;
1734 min_pages_trigger_port = NULL;
1735 bs_low = TRUE;
1736 backing_store_abort_compaction = TRUE;
1737 }
1738 PSL_UNLOCK();
1739 if (trigger != IP_NULL) {
1740 dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1741
1742 default_pager_space_alert(trigger, HI_WAT_ALERT);
1743 ipc_port_release_send(trigger);
1744 }
1745 return (dp_offset_t) -1;
1746 }
1747
1748 /*
1749 * Look for an available cluster. At the end of the loop,
1750 * byte_num is the byte offset and bit_num is the bit offset of the
1751 * first zero bit in the paging segment bitmap.
1752 */
1753 PS_LOCK(ps);
1754 byte_num = ps->ps_hint;
1755 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1756 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1757 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1758 if (isclr((ps->ps_bmap + byte_num), bit_num))
1759 break;
1760 }
1761 ASSERT(bit_num != NBBY);
1762 break;
1763 }
1764 }
1765 ps->ps_hint = byte_num;
1766 cluster = (byte_num*NBBY) + bit_num;
1767
1768 /* Space was reserved, so this must be true */
1769 ASSERT(cluster < ps->ps_ncls);
1770
1771 setbit(ps->ps_bmap, cluster);
1772 PS_UNLOCK(ps);
1773
1774 return cluster;
1775 }
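/*
 * The bitmap scan above, restated as a standalone sketch (not compiled).
 * It skips whole bytes that are already full (BYTEMASK) and then isolates
 * the first clear bit; the helper name is illustrative only.
 */
#if 0
static dp_offset_t
ps_first_free_cluster(paging_segment_t ps)
{
	unsigned int byte_num;
	int bit_num;

	for (byte_num = ps->ps_hint;
	     byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
		if (*(ps->ps_bmap + byte_num) == BYTEMASK)
			continue;		/* all NBBY clusters in this byte are in use */
		for (bit_num = 0; bit_num < NBBY; bit_num++) {
			if (isclr((ps->ps_bmap + byte_num), bit_num))
				return (byte_num * NBBY) + bit_num;
		}
	}
	return (dp_offset_t) -1;		/* no free cluster found */
}
#endif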
1776
1777 void ps_deallocate_cluster(paging_segment_t, dp_offset_t); /* forward */
1778
1779 void
1780 ps_deallocate_cluster(
1781 paging_segment_t ps,
1782 dp_offset_t cluster)
1783 {
1784
1785 if (cluster >= ps->ps_ncls)
1786 panic("ps_deallocate_cluster: Invalid cluster number");
1787
1788 /*
1789 * Lock the paging segment, clear the cluster's bit in the bitmap and
1790 * increment the number of free clusters.
1791 */
1792 PSL_LOCK();
1793 PS_LOCK(ps);
1794 clrbit(ps->ps_bmap, cluster);
1795 if( IS_PS_OK_TO_USE(ps)) {
1796 ++ps->ps_clcount;
1797 ps->ps_pgcount += 1 << ps->ps_clshift;
1798 dp_pages_free += 1 << ps->ps_clshift;
1799 } else {
1800 ps->ps_special_clusters += 1;
1801 }
1802
1803 /*
1804 * Move the hint down to the freed cluster if it is
1805 * less than the current hint.
1806 */
1807 if ((cluster/NBBY) < ps->ps_hint) {
1808 ps->ps_hint = (cluster/NBBY);
1809 }
1810
1811
1812 /*
1813 * If we're freeing space on a full priority, reset the array.
1814 */
1815 if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1816 ps_select_array[ps->ps_bs->bs_priority] = 0;
1817 PS_UNLOCK(ps);
1818 PSL_UNLOCK();
1819
1820 return;
1821 }
1822
1823 void ps_dealloc_vsmap(struct vs_map *, dp_size_t); /* forward */
1824
1825 void
1826 ps_dealloc_vsmap(
1827 struct vs_map *vsmap,
1828 dp_size_t size)
1829 {
1830 unsigned int i;
1831 struct ps_vnode_trim_data trim_data;
1832
1833 ps_vnode_trim_init(&trim_data);
1834
1835 for (i = 0; i < size; i++) {
1836 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
1837 ps_vnode_trim_more(&trim_data,
1838 &vsmap[i],
1839 VSM_PS(vsmap[i])->ps_clshift,
1840 vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
1841 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1842 VSM_CLOFF(vsmap[i]));
1843 } else {
1844 ps_vnode_trim_now(&trim_data);
1845 }
1846 }
1847 ps_vnode_trim_now(&trim_data);
1848 }
1849
1850 void
1851 ps_vstruct_dealloc(
1852 vstruct_t vs)
1853 {
1854 unsigned int i;
1855 // spl_t s;
1856
1857 VS_MAP_LOCK(vs);
1858
1859 /*
1860 * If this is an indirect structure, then we walk through the valid
1861 * (non-zero) indirect pointers and deallocate the clusters
1862 * associated with each used map entry (via ps_dealloc_vsmap).
1863 * When all of the clusters in an indirect block have been
1864 * freed, we deallocate the block. When all of the indirect
1865 * blocks have been deallocated we deallocate the memory
1866 * holding the indirect pointers.
1867 */
1868 if (vs->vs_indirect) {
1869 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1870 if (vs->vs_imap[i] != NULL) {
1871 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1872 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1873 }
1874 }
1875 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1876 } else {
1877 /*
1878 * Direct map. Free used clusters, then memory.
1879 */
1880 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1881 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1882 }
1883 VS_MAP_UNLOCK(vs);
1884
1885 bs_commit(- vs->vs_size);
1886
1887 VS_MAP_LOCK_DESTROY(vs);
1888
1889 zfree(vstruct_zone, vs);
1890 }
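/*
 * Shape of the two map layouts freed above, as a sketch (not compiled):
 * a direct map is one array of vs_size entries, while an indirect map is
 * an array of pointers, each to a block of CLMAP_ENTRIES entries, so
 * cluster n lives at vs_imap[n / CLMAP_ENTRIES][n % CLMAP_ENTRIES].  The
 * helper name is illustrative only.
 */
#if 0
static struct vs_map *
vs_map_entry_for(vstruct_t vs, unsigned int cluster)
{
	if (vs->vs_indirect)
		return &vs->vs_imap[cluster / CLMAP_ENTRIES][cluster % CLMAP_ENTRIES];
	return &vs->vs_dmap[cluster];
}
#endif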
1891
1892 kern_return_t
1893 ps_vstruct_reclaim(
1894 vstruct_t vs,
1895 boolean_t return_to_vm,
1896 boolean_t reclaim_backing_store)
1897 {
1898 unsigned int i, j;
1899 struct vs_map *vsmap;
1900 boolean_t vsmap_all_clear, vsimap_all_clear;
1901 struct vm_object_fault_info fault_info;
1902 int clmap_off;
1903 unsigned int vsmap_size;
1904 kern_return_t kr = KERN_SUCCESS;
1905
1906 VS_MAP_LOCK(vs);
1907
1908 fault_info.cluster_size = VM_SUPER_CLUSTER;
1909 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
1910 fault_info.user_tag = 0;
1911 fault_info.pmap_options = 0;
1912 fault_info.lo_offset = 0;
1913 fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
1914 fault_info.io_sync = reclaim_backing_store;
1915 fault_info.batch_pmap_op = FALSE;
1916
1917 /*
1918 * If this is an indirect structure, then we walk through the valid
1919 * (non-zero) indirect pointers and deallocate the clusters
1920 * associated with each used map entry (via ps_dealloc_vsmap).
1921 * When all of the clusters in an indirect block have been
1922 * freed, we deallocate the block. When all of the indirect
1923 * blocks have been deallocated we deallocate the memory
1924 * holding the indirect pointers.
1925 */
1926 if (vs->vs_indirect) {
1927 vsimap_all_clear = TRUE;
1928 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1929 vsmap = vs->vs_imap[i];
1930 if (vsmap == NULL)
1931 continue;
1932 /* loop on clusters in this indirect map */
1933 clmap_off = (vm_page_size * CLMAP_ENTRIES *
1934 VSCLSIZE(vs) * i);
1935 if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
1936 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
1937 else
1938 vsmap_size = CLMAP_ENTRIES;
1939 vsmap_all_clear = TRUE;
1940 if (return_to_vm) {
1941 for (j = 0; j < vsmap_size;) {
1942 if (VSM_ISCLR(vsmap[j]) ||
1943 VSM_ISERR(vsmap[j])) {
1944 j++;
1945 clmap_off += vm_page_size * VSCLSIZE(vs);
1946 continue;
1947 }
1948 VS_MAP_UNLOCK(vs);
1949 kr = pvs_cluster_read(
1950 vs,
1951 clmap_off,
1952 (dp_size_t) -1, /* read whole cluster */
1953 &fault_info);
1954
1955 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1956 if (kr != KERN_SUCCESS) {
1957 vsmap_all_clear = FALSE;
1958 vsimap_all_clear = FALSE;
1959
1960 kr = KERN_MEMORY_ERROR;
1961 goto out;
1962 }
1963 }
1964 }
1965 if (vsmap_all_clear) {
1966 ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
1967 kfree(vsmap, CLMAP_THRESHOLD);
1968 vs->vs_imap[i] = NULL;
1969 }
1970 }
1971 if (vsimap_all_clear) {
1972 // kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1973 }
1974 } else {
1975 /*
1976 * Direct map. Free used clusters, then memory.
1977 */
1978 vsmap = vs->vs_dmap;
1979 if (vsmap == NULL) {
1980 goto out;
1981 }
1982 vsmap_all_clear = TRUE;
1983 /* loop on clusters in the direct map */
1984 if (return_to_vm) {
1985 for (j = 0; j < vs->vs_size;) {
1986 if (VSM_ISCLR(vsmap[j]) ||
1987 VSM_ISERR(vsmap[j])) {
1988 j++;
1989 continue;
1990 }
1991 clmap_off = vm_page_size * (j << vs->vs_clshift);
1992 VS_MAP_UNLOCK(vs);
1993 kr = pvs_cluster_read(
1994 vs,
1995 clmap_off,
1996 (dp_size_t) -1, /* read whole cluster */
1997 &fault_info);
1998
1999 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
2000 if (kr != KERN_SUCCESS) {
2001 vsmap_all_clear = FALSE;
2002
2003 kr = KERN_MEMORY_ERROR;
2004 goto out;
2005 } else {
2006 // VSM_CLR(vsmap[j]);
2007 }
2008 }
2009 }
2010 if (vsmap_all_clear) {
2011 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
2012 // kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
2013 }
2014 }
2015 out:
2016 VS_MAP_UNLOCK(vs);
2017
2018 return kr;
2019 }
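
/*
 * Illustrative caller sketch (hypothetical; the real callers sit
 * outside this routine): drain a vstruct's swapped pages back into
 * VM by passing TRUE for return_to_vm and FALSE for
 * reclaim_backing_store.
 *
 *	kern_return_t kr;
 *
 *	kr = ps_vstruct_reclaim(vs, TRUE, FALSE);
 *	if (kr != KERN_SUCCESS)
 *		dprintf(("ps_vstruct_reclaim failed: 0x%x\n", kr));
 */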
2020
2021 int ps_map_extend(vstruct_t, unsigned int); /* forward */
2022
2023 int ps_map_extend(
2024 vstruct_t vs,
2025 unsigned int new_size)
2026 {
2027 struct vs_map **new_imap;
2028 struct vs_map *new_dmap = NULL;
2029 int newdsize;
2030 int i;
2031 void *old_map = NULL;
2032 int old_map_size = 0;
2033
2034 if (vs->vs_size >= new_size) {
2035 /*
2036 * Someone has already done the work.
2037 */
2038 return 0;
2039 }
2040
2041 /*
2042 * If the new size extends into the indirect range, then we have one
2043 * of two cases: we are going from indirect to indirect, or we are
2044 * going from direct to indirect. If we are going from indirect to
2045 * indirect, then it is possible that the new size will fit in the old
2046 * indirect map. If this is the case, then just reset the size of the
2047 * vstruct map and we are done. If the new size will not
2048 * fit into the old indirect map, then we have to allocate a new
2049 * indirect map and copy the old map pointers into this new map.
2050 *
2051 * If we are going from direct to indirect, then we have to allocate a
2052 * new indirect map and copy the old direct pages into the first
2053 * indirect page of the new map.
2054 * NOTE: allocating memory here is dangerous, as we're in the
2055 * pageout path.
2056 */
2057 if (INDIRECT_CLMAP(new_size)) {
2058 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
2059
2060 /*
2061 * Get a new indirect map and zero it.
2062 */
2063 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
2064 if (vs->vs_indirect &&
2065 (new_map_size == old_map_size)) {
2066 bs_commit(new_size - vs->vs_size);
2067 vs->vs_size = new_size;
2068 return 0;
2069 }
2070
2071 new_imap = (struct vs_map **)kalloc(new_map_size);
2072 if (new_imap == NULL) {
2073 return -1;
2074 }
2075 memset(new_imap, 0, new_map_size);
2076
2077 if (vs->vs_indirect) {
2078 /* Copy old entries into new map */
2079 memcpy(new_imap, vs->vs_imap, old_map_size);
2080 /* Arrange to free the old map */
2081 old_map = (void *) vs->vs_imap;
2082 newdsize = 0;
2083 } else { /* Old map was a direct map */
2084 /* Allocate an indirect page */
2085 if ((new_imap[0] = (struct vs_map *)
2086 kalloc(CLMAP_THRESHOLD)) == NULL) {
2087 kfree(new_imap, new_map_size);
2088 return -1;
2089 }
2090 new_dmap = new_imap[0];
2091 newdsize = CLMAP_ENTRIES;
2092 }
2093 } else {
2094 new_imap = NULL;
2095 newdsize = new_size;
2096 /*
2097 * If the new map is a direct map, then the old map must
2098 * also have been a direct map. All we have to do is
2099 * to allocate a new direct map, copy the old entries
2100 * into it and free the old map.
2101 */
2102 if ((new_dmap = (struct vs_map *)
2103 kalloc(CLMAP_SIZE(new_size))) == NULL) {
2104 return -1;
2105 }
2106 }
2107 if (newdsize) {
2108
2109 /* Free the old map */
2110 old_map = (void *) vs->vs_dmap;
2111 old_map_size = CLMAP_SIZE(vs->vs_size);
2112
2113 /* Copy info from the old map into the new map */
2114 memcpy(new_dmap, vs->vs_dmap, old_map_size);
2115
2116 /* Initialize the rest of the new map */
2117 for (i = vs->vs_size; i < newdsize; i++)
2118 VSM_CLR(new_dmap[i]);
2119 }
2120 if (new_imap) {
2121 vs->vs_imap = new_imap;
2122 vs->vs_indirect = TRUE;
2123 } else
2124 vs->vs_dmap = new_dmap;
2125 bs_commit(new_size - vs->vs_size);
2126 vs->vs_size = new_size;
2127 if (old_map)
2128 kfree(old_map, old_map_size);
2129 return 0;
2130 }
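
/*
 * ps_map_extend() is called with the VS_MAP lock held when a cluster
 * index falls beyond the current map; the calling pattern in
 * ps_clmap() below is the model.  Sketch only, where "cluster" is the
 * cluster index derived from the faulting offset:
 *
 *	if (cluster >= vs->vs_size) {
 *		if (ps_map_extend(vs, cluster + 1)) {
 *			VS_MAP_UNLOCK(vs);
 *			return (dp_offset_t) -1;
 *		}
 *	}
 */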
2131
2132 dp_offset_t
2133 ps_clmap(
2134 vstruct_t vs,
2135 dp_offset_t offset,
2136 struct clmap *clmap,
2137 int flag,
2138 dp_size_t size,
2139 int error)
2140 {
2141 dp_offset_t cluster; /* The cluster of offset. */
2142 dp_offset_t newcl; /* The new cluster allocated. */
2143 dp_offset_t newoff;
2144 unsigned int i;
2145 struct vs_map *vsmap;
2146
2147 VS_MAP_LOCK(vs);
2148
2149 ASSERT(vs->vs_dmap);
2150 cluster = atop_32(offset) >> vs->vs_clshift;
2151
2152 /*
2153 * Initialize cluster error value
2154 */
2155 clmap->cl_error = 0;
2156
2157 /*
2158 * If the object has grown, extend the page map.
2159 */
2160 if (cluster >= vs->vs_size) {
2161 if (flag == CL_FIND) {
2162 /* Do not allocate if just doing a lookup */
2163 VS_MAP_UNLOCK(vs);
2164 return (dp_offset_t) -1;
2165 }
2166 if (ps_map_extend(vs, cluster + 1)) {
2167 VS_MAP_UNLOCK(vs);
2168 return (dp_offset_t) -1;
2169 }
2170 }
2171
2172 /*
2173 * Look for the desired cluster. If the map is indirect, then we
2174 * have a two level lookup. First find the indirect block, then
2175 * find the actual cluster. If the indirect block has not yet
2176 * been allocated, then do so. If the cluster has not yet been
2177 * allocated, then do so.
2178 *
2179 * If any of the allocations fail, then return an error.
2180 * Don't allocate if just doing a lookup.
2181 */
2182 if (vs->vs_indirect) {
2183 long ind_block = cluster/CLMAP_ENTRIES;
2184
2185 /* Is the indirect block allocated? */
2186 vsmap = vs->vs_imap[ind_block];
2187 if (vsmap == NULL) {
2188 if (flag == CL_FIND) {
2189 VS_MAP_UNLOCK(vs);
2190 return (dp_offset_t) -1;
2191 }
2192
2193 /* Allocate the indirect block */
2194 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
2195 if (vsmap == NULL) {
2196 VS_MAP_UNLOCK(vs);
2197 return (dp_offset_t) -1;
2198 }
2199 /* Initialize the cluster offsets */
2200 for (i = 0; i < CLMAP_ENTRIES; i++)
2201 VSM_CLR(vsmap[i]);
2202 vs->vs_imap[ind_block] = vsmap;
2203 }
2204 } else
2205 vsmap = vs->vs_dmap;
2206
2207 ASSERT(vsmap);
2208 vsmap += cluster%CLMAP_ENTRIES;
2209
2210 /*
2211 * At this point, vsmap points to the struct vs_map desired.
2212 *
2213 * Look in the map for the cluster, if there was an error on a
2214 * previous write, flag it and return. If it is not yet
2215 * allocated, then allocate it, if we're writing; if we're
2216 * doing a lookup and the cluster's not allocated, return error.
2217 */
2218 if (VSM_ISERR(*vsmap)) {
2219 clmap->cl_error = VSM_GETERR(*vsmap);
2220 VS_MAP_UNLOCK(vs);
2221 return (dp_offset_t) -1;
2222 } else if (VSM_ISCLR(*vsmap)) {
2223 int psindex;
2224
2225 if (flag == CL_FIND) {
2226 /*
2227 * If there's an error and the entry is clear, then
2228 * we've run out of swap space. Record the error
2229 * here and return.
2230 */
2231 if (error) {
2232 VSM_SETERR(*vsmap, error);
2233 }
2234 VS_MAP_UNLOCK(vs);
2235 return (dp_offset_t) -1;
2236 } else {
2237 /*
2238 * Attempt to allocate a cluster from the paging segment
2239 */
2240 newcl = ps_allocate_cluster(vs, &psindex,
2241 PAGING_SEGMENT_NULL);
2242 if (newcl == (dp_offset_t) -1) {
2243 VS_MAP_UNLOCK(vs);
2244 return (dp_offset_t) -1;
2245 }
2246 VSM_CLR(*vsmap);
2247 VSM_SETCLOFF(*vsmap, newcl);
2248 VSM_SETPS(*vsmap, psindex);
2249 }
2250 } else
2251 newcl = VSM_CLOFF(*vsmap);
2252
2253 /*
2254 * Fill in pertinent fields of the clmap
2255 */
2256 clmap->cl_ps = VSM_PS(*vsmap);
2257 clmap->cl_numpages = VSCLSIZE(vs);
2258 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2259
2260 /*
2261 * Byte offset in paging segment is byte offset to cluster plus
2262 * byte offset within cluster. It looks ugly, but should be
2263 * relatively quick.
2264 */
2265 ASSERT(trunc_page(offset) == offset);
2266 newcl = ptoa_32(newcl) << vs->vs_clshift;
2267 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2268 if (flag == CL_ALLOC) {
2269 /*
2270 * set bits in the allocation bitmap according to which
2271 * pages were requested. size is in bytes.
2272 */
2273 i = atop_32(newoff);
2274 while ((size > 0) && (i < VSCLSIZE(vs))) {
2275 VSM_SETALLOC(*vsmap, i);
2276 i++;
2277 size -= vm_page_size;
2278 }
2279 }
2280 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2281 if (newoff) {
2282 /*
2283 * Offset is not cluster aligned, so number of pages
2284 * and bitmaps must be adjusted
2285 */
2286 clmap->cl_numpages -= atop_32(newoff);
2287 CLMAP_SHIFT(clmap, vs);
2288 CLMAP_SHIFTALLOC(clmap, vs);
2289 }
2290
2291 /*
2292 *
2293 * The setting of valid bits and handling of write errors
2294 * must be done here, while we hold the lock on the map.
2295 * It logically should be done in ps_vs_write_complete().
2296 * The size and error information has been passed from
2297 * ps_vs_write_complete(). If the size parameter is non-zero,
2298 * then there is work to be done. If error is also non-zero,
2299 * then the error number is recorded in the cluster and the
2300 * entire cluster is in error.
2301 */
2302 if (size && flag == CL_FIND) {
2303 dp_offset_t off = (dp_offset_t) 0;
2304
2305 if (!error) {
2306 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2307 i++) {
2308 VSM_SETPG(*vsmap, i);
2309 size -= vm_page_size;
2310 }
2311 ASSERT(i <= VSCLSIZE(vs));
2312 } else {
2313 BS_STAT(clmap->cl_ps->ps_bs,
2314 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
2315 atop_32(size));
2316 off = VSM_CLOFF(*vsmap);
2317 VSM_SETERR(*vsmap, error);
2318 }
2319 /*
2320 * Deallocate cluster if error, and no valid pages
2321 * already present.
2322 */
2323 if (off != (dp_offset_t) 0)
2324 ps_deallocate_cluster(clmap->cl_ps, off);
2325 VS_MAP_UNLOCK(vs);
2326 return (dp_offset_t) 0;
2327 } else
2328 VS_MAP_UNLOCK(vs);
2329
2330 DP_DEBUG(DEBUG_VS_INTERNAL,
2331 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2332 newcl+newoff, (int) vs, (int) vsmap, flag));
2333 DP_DEBUG(DEBUG_VS_INTERNAL,
2334 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2335 (int) clmap->cl_ps, clmap->cl_numpages,
2336 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
2337
2338 return (newcl + newoff);
2339 }
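
/*
 * The two ps_clmap() call modes used below, as a sketch.  Here cl_mask
 * is (vm_page_size << vs->vs_clshift) - 1 and cl_index, cl_size and
 * upl_offset_aligned follow the conventions of the callers below.
 *
 *	CL_FIND (lookup only, never allocates backing store):
 *		where = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
 *		if (where == (dp_offset_t) -1 || !CLMAP_ISSET(clmap, cl_index))
 *			... page is not present in the backing store ...
 *
 *	CL_ALLOC (allocate backing store ahead of a cl_size write):
 *		where = ps_clmap(vs, upl_offset_aligned, &clmap, CL_ALLOC, cl_size, 0);
 */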
2340
2341 void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */
2342
2343 void
2344 ps_clunmap(
2345 vstruct_t vs,
2346 dp_offset_t offset,
2347 dp_size_t length)
2348 {
2349 dp_offset_t cluster; /* The cluster number of offset */
2350 struct vs_map *vsmap;
2351 struct ps_vnode_trim_data trim_data;
2352
2353 ps_vnode_trim_init(&trim_data);
2354
2355 VS_MAP_LOCK(vs);
2356
2357 /*
2358 * Loop through all clusters in this range, freeing paging segment
2359 * clusters and map entries as encountered.
2360 */
2361 while (length > 0) {
2362 dp_offset_t newoff;
2363 unsigned int i;
2364
2365 cluster = atop_32(offset) >> vs->vs_clshift;
2366 if (vs->vs_indirect) /* indirect map */
2367 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2368 else
2369 vsmap = vs->vs_dmap;
2370 if (vsmap == NULL) {
2371 ps_vnode_trim_now(&trim_data);
2372 VS_MAP_UNLOCK(vs);
2373 return;
2374 }
2375 vsmap += cluster%CLMAP_ENTRIES;
2376 if (VSM_ISCLR(*vsmap)) {
2377 ps_vnode_trim_now(&trim_data);
2378 length -= vm_page_size;
2379 offset += vm_page_size;
2380 continue;
2381 }
2382 /*
2383 * We've got a valid mapping. Clear it and deallocate
2384 * paging segment cluster pages.
2385 * Optimize for entire cluster clearing.
2386 */
2387 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2388 /*
2389 * Not cluster aligned.
2390 */
2391 ASSERT(trunc_page(newoff) == newoff);
2392 i = atop_32(newoff);
2393 } else
2394 i = 0;
2395 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2396 VSM_CLRPG(*vsmap, i);
2397 VSM_CLRALLOC(*vsmap, i);
2398 length -= vm_page_size;
2399 offset += vm_page_size;
2400 i++;
2401 }
2402
2403 /*
2404 * If map entry is empty, clear and deallocate cluster.
2405 */
2406 if (!VSM_BMAP(*vsmap)) {
2407 ps_vnode_trim_more(&trim_data,
2408 vsmap,
2409 vs->vs_clshift,
2410 VSCLSIZE(vs) * vm_page_size);
2411 ps_deallocate_cluster(VSM_PS(*vsmap),
2412 VSM_CLOFF(*vsmap));
2413 VSM_CLR(*vsmap);
2414 } else {
2415 ps_vnode_trim_now(&trim_data);
2416 }
2417 }
2418 ps_vnode_trim_now(&trim_data);
2419
2420 VS_MAP_UNLOCK(vs);
2421 }
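
/*
 * ps_clunmap() is the complement of ps_clmap(..., CL_ALLOC, ...): it
 * hands page runs back to the paging segment and coalesces vnode trim
 * requests for clusters that become fully clear.  Typical use on
 * pagein completion, as in pvs_object_data_provided() below:
 *
 *	ps_clunmap(vs, offset, size);
 */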
2422
2423 void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
2424
2425 void
2426 ps_vs_write_complete(
2427 vstruct_t vs,
2428 dp_offset_t offset,
2429 dp_size_t size,
2430 int error)
2431 {
2432 struct clmap clmap;
2433
2434 /*
2435 * Get the struct vsmap for this cluster.
2436 * Use READ, even though it was written, because the
2437 * cluster MUST be present, unless there was an error
2438 * in the original ps_clmap (e.g. no space), in which
2439 * case, nothing happens.
2440 *
2441 * Must pass enough information to ps_clmap to allow it
2442 * to set the vs_map structure bitmap under lock.
2443 */
2444 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2445 }
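
/*
 * Usage sketch, matching the pageout path in vs_cluster_write() below,
 * which calls this once per cluster-sized chunk of a dirty run before
 * issuing the actual write:
 *
 *	ps_vs_write_complete(vs,
 *			     (upl_offset_in_object + upl_offset),
 *			     seg_size, error);
 */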
2446
2447 void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */
2448
2449 void
2450 vs_cl_write_complete(
2451 vstruct_t vs,
2452 __unused paging_segment_t ps,
2453 dp_offset_t offset,
2454 __unused vm_offset_t addr,
2455 dp_size_t size,
2456 boolean_t async,
2457 int error)
2458 {
2459 // kern_return_t kr;
2460
2461 if (error) {
2462 /*
2463 * For internal objects, the error is recorded on a
2464 * per-cluster basis by ps_clmap() which is called
2465 * by ps_vs_write_complete() below.
2466 */
2467 dprintf(("write failed error = 0x%x\n", error));
2468 /* add upl_abort code here */
2469 } else
2470 GSTAT(global_stats.gs_pages_out += atop_32(size));
2471 /*
2472 * Notify the vstruct mapping code, so it can do its accounting.
2473 */
2474 ps_vs_write_complete(vs, offset, size, error);
2475
2476 if (async) {
2477 VS_LOCK(vs);
2478 ASSERT(vs->vs_async_pending > 0);
2479 vs->vs_async_pending -= size;
2480 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2481 vs->vs_waiting_async = FALSE;
2482 VS_UNLOCK(vs);
2483 thread_wakeup(&vs->vs_async_pending);
2484 } else {
2485 VS_UNLOCK(vs);
2486 }
2487 }
2488 }
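
/*
 * The async accounting above pairs with a waiter that sleeps on
 * &vs->vs_async_pending.  Sketch of the assumed waiter side (the
 * actual waiter is defined elsewhere and may differ in detail):
 *
 *	VS_LOCK(vs);
 *	while (vs->vs_async_pending > 0) {
 *		vs->vs_waiting_async = TRUE;
 *		assert_wait(&vs->vs_async_pending, THREAD_UNINT);
 *		VS_UNLOCK(vs);
 *		thread_block(THREAD_CONTINUE_NULL);
 *		VS_LOCK(vs);
 *	}
 *	VS_UNLOCK(vs);
 */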
2489
2490 #ifdef DEVICE_PAGING
2491 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2492
2493 kern_return_t
2494 device_write_reply(
2495 MACH_PORT_FACE reply_port,
2496 kern_return_t device_code,
2497 io_buf_len_t bytes_written)
2498 {
2499 struct vs_async *vsa;
2500
2501 vsa = (struct vs_async *)
2502 ((struct vstruct_alias *)(reply_port->ip_alias))->vs;
2503
2504 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2505 device_code = KERN_FAILURE;
2506 }
2507
2508 vsa->vsa_error = device_code;
2509
2510
2511 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2512 if(vsa->vsa_flags & VSA_TRANSFER) {
2513 /* revisit when async disk segments redone */
2514 if(vsa->vsa_error) {
2515 /* need to consider error condition. re-write data or */
2516 /* throw it away here. */
2517 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2518 }
2519 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2520 vsa->vsa_size, vsa->vsa_error);
2521 } else {
2522 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2523 vsa->vsa_addr, vsa->vsa_size, TRUE,
2524 vsa->vsa_error);
2525 }
2526 VS_FREE_ASYNC(vsa);
2527
2528 return KERN_SUCCESS;
2529 }
2530
2531 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2532 kern_return_t
2533 device_write_reply_inband(
2534 MACH_PORT_FACE reply_port,
2535 kern_return_t return_code,
2536 io_buf_len_t bytes_written)
2537 {
2538 panic("device_write_reply_inband: illegal");
2539 return KERN_SUCCESS;
2540 }
2541
2542 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2543 kern_return_t
2544 device_read_reply(
2545 MACH_PORT_FACE reply_port,
2546 kern_return_t return_code,
2547 io_buf_ptr_t data,
2548 mach_msg_type_number_t dataCnt)
2549 {
2550 struct vs_async *vsa;
2551 vsa = (struct vs_async *)
2552 ((struct vstruct_alias *)(reply_port->defpager_importance.alias))->vs;
2553 vsa->vsa_addr = (vm_offset_t)data;
2554 vsa->vsa_size = (vm_size_t)dataCnt;
2555 vsa->vsa_error = return_code;
2556 thread_wakeup(&vsa);
2557 return KERN_SUCCESS;
2558 }
2559
2560 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2561 kern_return_t
2562 device_read_reply_inband(
2563 MACH_PORT_FACE reply_port,
2564 kern_return_t return_code,
2565 io_buf_ptr_inband_t data,
2566 mach_msg_type_number_t dataCnt)
2567 {
2568 panic("device_read_reply_inband: illegal");
2569 return KERN_SUCCESS;
2570 }
2571
2572 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2573 kern_return_t
2574 device_read_reply_overwrite(
2575 MACH_PORT_FACE reply_port,
2576 kern_return_t return_code,
2577 io_buf_len_t bytes_read)
2578 {
2579 panic("device_read_reply_overwrite: illegal\n");
2580 return KERN_SUCCESS;
2581 }
2582
2583 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2584 kern_return_t
2585 device_open_reply(
2586 MACH_PORT_FACE reply_port,
2587 kern_return_t return_code,
2588 MACH_PORT_FACE device_port)
2589 {
2590 panic("device_open_reply: illegal\n");
2591 return KERN_SUCCESS;
2592 }
2593
2594 kern_return_t
2595 ps_read_device(
2596 paging_segment_t ps,
2597 dp_offset_t offset,
2598 vm_offset_t *bufferp,
2599 unsigned int size,
2600 unsigned int *residualp,
2601 int flags)
2602 {
2603 kern_return_t kr;
2604 recnum_t dev_offset;
2605 unsigned int bytes_wanted;
2606 unsigned int bytes_read;
2607 unsigned int total_read;
2608 vm_offset_t dev_buffer;
2609 vm_offset_t buf_ptr;
2610 unsigned int records_read;
2611 struct vs_async *vsa;
2612
2613 device_t device;
2614 vm_map_copy_t device_data = NULL;
2615 default_pager_thread_t *dpt = NULL;
2616
2617 device = dev_port_lookup(ps->ps_device);
2618 clustered_reads[atop_32(size)]++;
2619
2620 dev_offset = (ps->ps_offset +
2621 (offset >> (vm_page_shift - ps->ps_record_shift)));
2622 bytes_wanted = size;
2623 total_read = 0;
2624 *bufferp = (vm_offset_t)NULL;
2625
2626 do {
2627 vsa = VS_ALLOC_ASYNC();
2628 if (vsa) {
2629 vsa->vsa_vs = NULL;
2630 vsa->vsa_addr = 0;
2631 vsa->vsa_offset = 0;
2632 vsa->vsa_size = 0;
2633 vsa->vsa_ps = NULL;
2634 }
2635 ip_lock(vsa->reply_port);
2636 vsa->reply_port->ip_sorights++;
2637 ip_reference(vsa->reply_port);
2638 ip_unlock(vsa->reply_port);
2639 kr = ds_device_read_common(device,
2640 vsa->reply_port,
2641 (mach_msg_type_name_t)
2642 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2643 (dev_mode_t) 0,
2644 dev_offset,
2645 bytes_wanted,
2646 (IO_READ | IO_CALL),
2647 (io_buf_ptr_t *) &dev_buffer,
2648 (mach_msg_type_number_t *) &bytes_read);
2649 if(kr == MIG_NO_REPLY) {
2650 assert_wait(&vsa, THREAD_UNINT);
2651 thread_block(THREAD_CONTINUE_NULL);
2652
2653 dev_buffer = vsa->vsa_addr;
2654 bytes_read = (unsigned int)vsa->vsa_size;
2655 kr = vsa->vsa_error;
2656 }
2657 VS_FREE_ASYNC(vsa);
2658 if (kr != KERN_SUCCESS || bytes_read == 0) {
2659 break;
2660 }
2661 total_read += bytes_read;
2662
2663 /*
2664 * If we got the entire range, use the returned dev_buffer.
2665 */
2666 if (bytes_read == size) {
2667 *bufferp = (vm_offset_t)dev_buffer;
2668 break;
2669 }
2670
2671 #if 1
2672 dprintf(("read only %d bytes out of %d\n",
2673 bytes_read, bytes_wanted));
2674 #endif
2675 if(dpt == NULL) {
2676 dpt = get_read_buffer();
2677 buf_ptr = dpt->dpt_buffer;
2678 *bufferp = (vm_offset_t)buf_ptr;
2679 }
2680 /*
2681 * Otherwise, copy the data into the provided buffer (*bufferp)
2682 * and append the rest of the range as it comes in.
2683 */
2684 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2685 buf_ptr += bytes_read;
2686 bytes_wanted -= bytes_read;
2687 records_read = (bytes_read >>
2688 (vm_page_shift - ps->ps_record_shift));
2689 dev_offset += records_read;
2690 DP_DEBUG(DEBUG_VS_INTERNAL,
2691 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2692 dev_buffer, bytes_read));
2693 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2694 != KERN_SUCCESS)
2695 Panic("dealloc buf");
2696 } while (bytes_wanted);
2697
2698 *residualp = size - total_read;
2699 if((dev_buffer != *bufferp) && (total_read != 0)) {
2700 vm_offset_t temp_buffer;
2701 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK));
2702 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2703 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2704 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2705 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2706 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2707 (vm_map_copy_t *)&device_data, FALSE))
2708 panic("ps_read_device: cannot copyin locally provided buffer\n");
2709 }
2710 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2711 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2712 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2713 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2714 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2715 (vm_map_copy_t *)&device_data, FALSE))
2716 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2717 }
2718 else {
2719 device_data = NULL;
2720 }
2721 *bufferp = (vm_offset_t)device_data;
2722
2723 if(dpt != NULL) {
2724 /* Free the receive buffer */
2725 dpt->checked_out = 0;
2726 thread_wakeup(&dpt_array);
2727 }
2728 return KERN_SUCCESS;
2729 }
2730
2731 kern_return_t
2732 ps_write_device(
2733 paging_segment_t ps,
2734 dp_offset_t offset,
2735 vm_offset_t addr,
2736 unsigned int size,
2737 struct vs_async *vsa)
2738 {
2739 recnum_t dev_offset;
2740 io_buf_len_t bytes_to_write, bytes_written;
2741 recnum_t records_written;
2742 kern_return_t kr;
2743 MACH_PORT_FACE reply_port;
2744
2745
2746
2747 clustered_writes[atop_32(size)]++;
2748
2749 dev_offset = (ps->ps_offset +
2750 (offset >> (vm_page_shift - ps->ps_record_shift)));
2751 bytes_to_write = size;
2752
2753 if (vsa) {
2754 /*
2755 * Asynchronous write.
2756 */
2757 reply_port = vsa->reply_port;
2758 ip_lock(reply_port);
2759 reply_port->ip_sorights++;
2760 ip_reference(reply_port);
2761 ip_unlock(reply_port);
2762 {
2763 device_t device;
2764 device = dev_port_lookup(ps->ps_device);
2765
2766 vsa->vsa_addr = addr;
2767 kr=ds_device_write_common(device,
2768 reply_port,
2769 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2770 (dev_mode_t) 0,
2771 dev_offset,
2772 (io_buf_ptr_t) addr,
2773 size,
2774 (IO_WRITE | IO_CALL),
2775 &bytes_written);
2776 }
2777 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2778 if (verbose)
2779 dprintf(("%s0x%x, addr=0x%x,"
2780 "size=0x%x,offset=0x%x\n",
2781 "device_write_request returned ",
2782 kr, addr, size, offset));
2783 BS_STAT(ps->ps_bs,
2784 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2785 /* do the completion notification to free resources */
2786 device_write_reply(reply_port, kr, 0);
2787 return PAGER_ERROR;
2788 }
2789 } else do {
2790 /*
2791 * Synchronous write.
2792 */
2793 {
2794 device_t device;
2795 device = dev_port_lookup(ps->ps_device);
2796 kr=ds_device_write_common(device,
2797 IP_NULL, 0,
2798 (dev_mode_t) 0,
2799 dev_offset,
2800 (io_buf_ptr_t) addr,
2801 size,
2802 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2803 &bytes_written);
2804 }
2805 if (kr != KERN_SUCCESS) {
2806 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2807 "device_write returned ",
2808 kr, addr, size, offset));
2809 BS_STAT(ps->ps_bs,
2810 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2811 return PAGER_ERROR;
2812 }
2813 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2814 Panic("fragmented write");
2815 records_written = (bytes_written >>
2816 (vm_page_shift - ps->ps_record_shift));
2817 dev_offset += records_written;
2818 #if 1
2819 if (bytes_written != bytes_to_write) {
2820 dprintf(("wrote only %d bytes out of %d\n",
2821 bytes_written, bytes_to_write));
2822 }
2823 #endif
2824 bytes_to_write -= bytes_written;
2825 addr += bytes_written;
2826 } while (bytes_to_write > 0);
2827
2828 return PAGER_SUCCESS;
2829 }
2830
2831
2832 #else /* !DEVICE_PAGING */
2833
2834 kern_return_t
2835 ps_read_device(
2836 __unused paging_segment_t ps,
2837 __unused dp_offset_t offset,
2838 __unused vm_offset_t *bufferp,
2839 __unused unsigned int size,
2840 __unused unsigned int *residualp,
2841 __unused int flags)
2842 {
2843 panic("ps_read_device not supported");
2844 return KERN_FAILURE;
2845 }
2846
2847 kern_return_t
2848 ps_write_device(
2849 __unused paging_segment_t ps,
2850 __unused dp_offset_t offset,
2851 __unused vm_offset_t addr,
2852 __unused unsigned int size,
2853 __unused struct vs_async *vsa)
2854 {
2855 panic("ps_write_device not supported");
2856 return KERN_FAILURE;
2857 }
2858
2859 #endif /* DEVICE_PAGING */
2860 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2861
2862 void
2863 pvs_object_data_provided(
2864 __unused vstruct_t vs,
2865 __unused upl_t upl,
2866 __unused upl_offset_t offset,
2867 upl_size_t size)
2868 {
2869 #if RECLAIM_SWAP
2870 boolean_t empty;
2871 #endif
2872
2873 DP_DEBUG(DEBUG_VS_INTERNAL,
2874 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2875 upl, offset, size));
2876
2877 ASSERT(size > 0);
2878 GSTAT(global_stats.gs_pages_in += atop_32(size));
2879
2880 /* check upl iosync flag instead of using RECLAIM_SWAP */
2881 #if RECLAIM_SWAP
2882 if (size != upl->size) {
2883 if (size) {
2884 ps_clunmap(vs, offset, size);
2885 upl_commit_range(upl, 0, size, 0, NULL, 0, &empty);
2886 }
2887 upl_abort(upl, UPL_ABORT_ERROR);
2888 upl_deallocate(upl);
2889 } else {
2890 ps_clunmap(vs, offset, size);
2891 upl_commit(upl, NULL, 0);
2892 upl_deallocate(upl);
2893 }
2894 #endif /* RECLAIM_SWAP */
2895
2896 }
2897
2898 static memory_object_offset_t last_start;
2899 static vm_size_t last_length;
2900
2901 /*
2902 * A "cnt" of 0 means that the caller just wants to check if the page at
2903 * offset "vs_offset" exists in the backing store. That page hasn't been
2904 * prepared, so no need to release it.
2905 *
2906 * A "cnt" of -1 means that the caller wants to bring back from the backing
2907 * store all existing pages in the cluster containing "vs_offset".
2908 */
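/*
 * For example (illustrative only; the fault_info arguments are
 * placeholders for whatever the caller has on hand):
 *
 *	existence check, no pagein is issued:
 *		kr = pvs_cluster_read(vs, vs_offset, 0, (void *) fault_info);
 *
 *	reclaim, bring back every existing page of the cluster:
 *		kr = pvs_cluster_read(vs, vs_offset, (dp_size_t) -1, (void *) &fault_info);
 */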
2909 kern_return_t
2910 pvs_cluster_read(
2911 vstruct_t vs,
2912 dp_offset_t vs_offset,
2913 dp_size_t cnt,
2914 void *fault_info)
2915 {
2916 kern_return_t error = KERN_SUCCESS;
2917 unsigned int size;
2918 unsigned int residual;
2919 unsigned int request_flags;
2920 int io_flags = 0;
2921 int seg_index;
2922 int pages_in_cl;
2923 int cl_size;
2924 int cl_mask;
2925 int cl_index;
2926 unsigned int xfer_size;
2927 dp_offset_t orig_vs_offset;
2928 dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2929 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2930 struct clmap clmap;
2931 upl_t upl;
2932 unsigned int page_list_count;
2933 memory_object_offset_t cluster_start;
2934 vm_size_t cluster_length;
2935 uint32_t io_streaming;
2936 int i;
2937 boolean_t io_sync = FALSE;
2938 boolean_t reclaim_all = FALSE;
2939
2940 pages_in_cl = 1 << vs->vs_clshift;
2941 cl_size = pages_in_cl * vm_page_size;
2942 cl_mask = cl_size - 1;
2943
2944 request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2945
2946 if (cnt == (dp_size_t) -1)
2947 reclaim_all = TRUE;
2948
2949 if (reclaim_all == TRUE) {
2950 /*
2951 * We've been called from ps_vstruct_reclaim() to move all
2952 * the object's swapped pages back to VM pages.
2953 * This can put memory pressure on the system, so we do want
2954 * to wait for free pages, to avoid getting in the way of the
2955 * vm_pageout_scan() thread.
2956 * Let's not use UPL_NOBLOCK in this case.
2957 */
2958 vs_offset &= ~cl_mask;
2959 i = pages_in_cl;
2960 } else {
2961 i = 1;
2962
2963 /*
2964 * if the I/O cluster size == PAGE_SIZE, we don't want to set
2965 * the UPL_NOBLOCK since we may be trying to recover from a
2966 * previous partial pagein I/O that occurred because we were low
2967 * on memory and bailed early in order to honor the UPL_NOBLOCK...
2968 * since we're only asking for a single page, we can block w/o fear
2969 * of tying up pages while waiting for more to become available
2970 */
2971 if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE)
2972 request_flags |= UPL_NOBLOCK;
2973 }
2974
2975 again:
2976 cl_index = (vs_offset & cl_mask) / vm_page_size;
2977
2978 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2979 !CLMAP_ISSET(clmap, cl_index)) {
2980 /*
2981 * the needed page doesn't exist in the backing store...
2982 * we don't want to try to do any I/O, just abort the
2983 * page and let the fault handler provide a zero-fill
2984 */
2985 if (cnt == 0) {
2986 /*
2987 * The caller was just poking at us to see if
2988 * the page has been paged out. No need to
2989 * mess with the page at all.
2990 * Just let the caller know we don't have that page.
2991 */
2992 return KERN_FAILURE;
2993 }
2994 if (reclaim_all == TRUE) {
2995 i--;
2996 if (i == 0) {
2997 /* no more pages in this cluster */
2998 return KERN_FAILURE;
2999 }
3000 /* try the next page in this cluster */
3001 vs_offset += vm_page_size;
3002 goto again;
3003 }
3004
3005 page_list_count = 0;
3006
3007 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3008 PAGE_SIZE, PAGE_SIZE,
3009 &upl, NULL, &page_list_count,
3010 request_flags | UPL_SET_INTERNAL);
3011 upl_range_needed(upl, 0, 1);
3012
3013 if (clmap.cl_error)
3014 upl_abort(upl, UPL_ABORT_ERROR);
3015 else
3016 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
3017 upl_deallocate(upl);
3018
3019 return KERN_SUCCESS;
3020 }
3021
3022 if (cnt == 0) {
3023 /*
3024 * The caller was just poking at us to see if
3025 * the page has been paged out. No need to
3026 * mess with the page at all.
3027 * Just let the caller know we do have that page.
3028 */
3029 return KERN_SUCCESS;
3030 }
3031
3032 if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) {
3033 io_sync = TRUE;
3034 } else {
3035 #if RECLAIM_SWAP
3036 io_sync = TRUE;
3037 #endif /* RECLAIM_SWAP */
3038 }
3039
3040 if( io_sync == TRUE ) {
3041
3042 io_flags |= UPL_IOSYNC | UPL_NOCOMMIT;
3043 #if USE_PRECIOUS
3044 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
3045 #else /* USE_PRECIOUS */
3046 request_flags |= UPL_REQUEST_SET_DIRTY;
3047 #endif /* USE_PRECIOUS */
3048 }
3049
3050 assert(dp_encryption_inited);
3051 if (dp_encryption) {
3052 /*
3053 * ENCRYPTED SWAP:
3054 * request that the UPL be prepared for
3055 * decryption.
3056 */
3057 request_flags |= UPL_ENCRYPT;
3058 io_flags |= UPL_PAGING_ENCRYPTED;
3059 }
3060 orig_vs_offset = vs_offset;
3061
3062 assert(cnt != 0);
3063 cnt = VM_SUPER_CLUSTER;
3064 cluster_start = (memory_object_offset_t) vs_offset;
3065 cluster_length = (vm_size_t) cnt;
3066 io_streaming = 0;
3067
3068 /*
3069 * determine how big a speculative I/O we should try for...
3070 */
3071 if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
3072 assert(vs_offset >= (dp_offset_t) cluster_start &&
3073 vs_offset < (dp_offset_t) (cluster_start + cluster_length));
3074 vs_offset = (dp_offset_t) cluster_start;
3075 cnt = (dp_size_t) cluster_length;
3076 } else {
3077 cluster_length = PAGE_SIZE;
3078 cnt = PAGE_SIZE;
3079 }
3080
3081 if (io_streaming)
3082 io_flags |= UPL_IOSTREAMING;
3083
3084 last_start = cluster_start;
3085 last_length = cluster_length;
3086
3087 /*
3088 * This loop will be executed multiple times until the entire
3089 * range has been looked at or we issue an I/O... if the request spans cluster
3090 * boundaries, the clusters will be checked for logical continuity,
3091 * if contiguous the I/O request will span multiple clusters...
3092 * at most only 1 I/O will be issued... it will encompass the original offset
3093 */
3094 while (cnt && error == KERN_SUCCESS) {
3095 int ps_info_valid;
3096
3097 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
3098 size = VM_SUPER_CLUSTER;
3099 size -= vs_offset & cl_mask;
3100 } else if (cnt > VM_SUPER_CLUSTER)
3101 size = VM_SUPER_CLUSTER;
3102 else
3103 size = cnt;
3104
3105 cnt -= size;
3106
3107 ps_info_valid = 0;
3108 seg_index = 0;
3109
3110 while (size > 0 && error == KERN_SUCCESS) {
3111 unsigned int abort_size;
3112 unsigned int lsize;
3113 int failed_size;
3114 int beg_pseg;
3115 int beg_indx;
3116 dp_offset_t cur_offset;
3117
3118 if ( !ps_info_valid) {
3119 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3120 psp[seg_index] = CLMAP_PS(clmap);
3121 ps_info_valid = 1;
3122 }
3123 /*
3124 * skip over unallocated physical segments
3125 */
3126 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3127 abort_size = cl_size - (vs_offset & cl_mask);
3128 abort_size = MIN(abort_size, size);
3129
3130 size -= abort_size;
3131 vs_offset += abort_size;
3132
3133 seg_index++;
3134 ps_info_valid = 0;
3135
3136 continue;
3137 }
3138 cl_index = (vs_offset & cl_mask) / vm_page_size;
3139
3140 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
3141 /*
3142 * skip over unallocated pages
3143 */
3144 if (CLMAP_ISSET(clmap, cl_index))
3145 break;
3146 abort_size += vm_page_size;
3147 }
3148 if (abort_size) {
3149 size -= abort_size;
3150 vs_offset += abort_size;
3151
3152 if (cl_index == pages_in_cl) {
3153 /*
3154 * if we're at the end of this physical cluster
3155 * then bump to the next one and continue looking
3156 */
3157 seg_index++;
3158 ps_info_valid = 0;
3159
3160 continue;
3161 }
3162 if (size == 0)
3163 break;
3164 }
3165 /*
3166 * remember the starting point of the first allocated page
3167 * for the I/O we're about to issue
3168 */
3169 beg_pseg = seg_index;
3170 beg_indx = cl_index;
3171 cur_offset = vs_offset;
3172
3173 /*
3174 * calculate the size of the I/O that we can do...
3175 * this may span multiple physical segments if
3176 * they are contiguous
3177 */
3178 for (xfer_size = 0; xfer_size < size; ) {
3179
3180 while (cl_index < pages_in_cl && xfer_size < size) {
3181 /*
3182 * accumulate allocated pages within
3183 * a physical segment
3184 */
3185 if (CLMAP_ISSET(clmap, cl_index)) {
3186 xfer_size += vm_page_size;
3187 cur_offset += vm_page_size;
3188 cl_index++;
3189
3190 BS_STAT(psp[seg_index]->ps_bs,
3191 psp[seg_index]->ps_bs->bs_pages_in++);
3192 } else
3193 break;
3194 }
3195 if (cl_index < pages_in_cl || xfer_size >= size) {
3196 /*
3197 * we've hit an unallocated page or
3198 * the end of this request... see if
3199 * it's time to fire the I/O
3200 */
3201 break;
3202 }
3203 /*
3204 * we've hit the end of the current physical
3205 * segment and there's more to do, so try
3206 * moving to the next one
3207 */
3208 seg_index++;
3209
3210 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3211 psp[seg_index] = CLMAP_PS(clmap);
3212 ps_info_valid = 1;
3213
3214 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
3215 /*
3216 * if the physical segment we're about
3217 * to step into is not contiguous to
3218 * the one we're currently in, or it's
3219 * in a different paging file, or
3220 * it hasn't been allocated....
3221 * we stop this run and go check
3222 * to see if it's time to fire the I/O
3223 */
3224 break;
3225 }
3226 /*
3227 * start with first page of the next physical
3228 * segment
3229 */
3230 cl_index = 0;
3231 }
3232 if (xfer_size == 0) {
3233 /*
3234 * no I/O to generate for this segment
3235 */
3236 continue;
3237 }
3238 if (cur_offset <= orig_vs_offset) {
3239 /*
3240 * we've hit a hole in our speculative cluster
3241 * before the offset that we're really after...
3242 * don't issue the I/O since it doesn't encompass
3243 * the original offset and we're looking to only
3244 * pull in the speculative pages if they can be
3245 * made part of a single I/O
3246 */
3247 size -= xfer_size;
3248 vs_offset += xfer_size;
3249
3250 continue;
3251 }
3252 /*
3253 * we have a contiguous range of allocated pages
3254 * to read from that encompasses the original offset
3255 */
3256 page_list_count = 0;
3257 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3258 xfer_size, xfer_size,
3259 &upl, NULL, &page_list_count,
3260 request_flags | UPL_SET_INTERNAL);
3261
3262 error = ps_read_file(psp[beg_pseg],
3263 upl, (upl_offset_t) 0,
3264 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
3265 xfer_size, &residual, io_flags);
3266
3267
3268 /*
3269 * Adjust counts and send response to VM. Optimize
3270 * for the common case, i.e. no error and/or partial
3271 * data. If there was an error, then we need to error
3272 * the entire range, even if some data was successfully
3273 * read. If there was a partial read we may supply some
3274 * data and may error some as well. In all cases the
3275 * VM must receive some notification for every page
3276 * in the range.
3277 */
3278 if ((error == KERN_SUCCESS) && (residual == 0)) {
3279 /*
3280 * Got everything we asked for, supply the data
3281 * to the VM. Note that as a side effect of
3282 * supplying the data, the buffer holding the
3283 * supplied data is deallocated from the pager's
3284 * address space.
3285 */
3286 lsize = xfer_size;
3287 failed_size = 0;
3288 } else {
3289 lsize = 0;
3290 failed_size = xfer_size;
3291
3292 if (error == KERN_SUCCESS) {
3293 if (residual == xfer_size) {
3294 /*
3295 * If a read operation returns no error
3296 * and no data moved, we turn it into
3297 * an error, assuming we're reading at
3298 * or beyond EOF.
3299 * Fall through and error the entire range.
3300 */
3301 error = KERN_FAILURE;
3302 } else {
3303 /*
3304 * Otherwise, we have partial read. If
3305 * the part read is an integral number
3306 * of pages supply it. Otherwise round
3307 * it up to a page boundary, zero fill
3308 * the unread part, and supply it.
3309 * Fall through and error the remainder
3310 * of the range, if any.
3311 */
3312 int fill;
3313
3314 fill = residual & (vm_page_size - 1);
3315 lsize = (xfer_size - residual) + fill;
3316
3317 if (lsize < xfer_size)
3318 failed_size = xfer_size - lsize;
3319
3320 if (reclaim_all == FALSE)
3321 error = KERN_FAILURE;
3322 }
3323 }
3324 }
3325 pvs_object_data_provided(vs, upl, vs_offset, lsize);
3326
3327 if (failed_size) {
3328 /*
3329 * There was an error in some part of the range, tell
3330 * the VM. Note that error is explicitly checked again
3331 * since it can be modified above.
3332 */
3333 BS_STAT(psp[beg_pseg]->ps_bs,
3334 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
3335 }
3336 /*
3337 * we've issued a single I/O that encompassed the original offset
3338 * at this point we either met our speculative request length or
3339 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3340 * not present or not physically contiguous to the previous one), so
3341 * we're done issuing I/O at this point
3342 */
3343 return (error);
3344 }
3345 }
3346 return error;
3347 }
3348
3349 int vs_do_async_write = 1;
3350
3351 kern_return_t
3352 vs_cluster_write(
3353 vstruct_t vs,
3354 upl_t internal_upl,
3355 upl_offset_t offset,
3356 upl_size_t cnt,
3357 boolean_t dp_internal,
3358 int flags)
3359 {
3360 upl_size_t transfer_size;
3361 int error = 0;
3362 struct clmap clmap;
3363
3364 dp_offset_t actual_offset; /* Offset within paging segment */
3365 paging_segment_t ps;
3366 dp_offset_t mobj_base_addr;
3367 dp_offset_t mobj_target_addr;
3368
3369 upl_t upl;
3370 upl_page_info_t *pl;
3371 int page_index;
3372 unsigned int page_max_index;
3373 int list_size;
3374 int pages_in_cl;
3375 unsigned int cl_size;
3376 int base_index;
3377 unsigned int seg_size;
3378 unsigned int upl_offset_in_object;
3379 boolean_t minimal_clustering = FALSE;
3380 boolean_t found_dirty;
3381
3382 if (!dp_encryption_inited) {
3383 /*
3384 * ENCRYPTED SWAP:
3385 * Once we've started using swap, we
3386 * can't change our mind on whether
3387 * it needs to be encrypted or
3388 * not.
3389 */
3390 dp_encryption_inited = TRUE;
3391 }
3392 if (dp_encryption) {
3393 /*
3394 * ENCRYPTED SWAP:
3395 * the UPL will need to be encrypted...
3396 */
3397 flags |= UPL_PAGING_ENCRYPTED;
3398 }
3399
3400 pages_in_cl = 1 << vs->vs_clshift;
3401 cl_size = pages_in_cl * vm_page_size;
3402
3403 #if CONFIG_FREEZE
3404 minimal_clustering = TRUE;
3405 #else
3406 if (dp_isssd == TRUE)
3407 minimal_clustering = TRUE;
3408 #endif
3409 if (!dp_internal) {
3410 unsigned int page_list_count;
3411 int request_flags;
3412 unsigned int super_size;
3413 int first_dirty;
3414 int num_dirty;
3415 int num_of_pages;
3416 int seg_index;
3417 upl_offset_t upl_offset;
3418 upl_offset_t upl_offset_aligned;
3419 dp_offset_t seg_offset;
3420 dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3421 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3422
3423
3424 if (bs_low)
3425 super_size = cl_size;
3426 else
3427 super_size = VM_SUPER_CLUSTER;
3428
3429 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3430 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3431 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3432
3433 if (dp_encryption) {
3434 /*
3435 * ENCRYPTED SWAP:
3436 * request that the UPL be prepared for
3437 * encryption.
3438 */
3439 request_flags |= UPL_ENCRYPT;
3440 flags |= UPL_PAGING_ENCRYPTED;
3441 }
3442
3443 page_list_count = 0;
3444 memory_object_super_upl_request(vs->vs_control,
3445 (memory_object_offset_t)offset,
3446 cnt, super_size,
3447 &upl, NULL, &page_list_count,
3448 request_flags | UPL_FOR_PAGEOUT);
3449
3450 /*
3451 * The default pager does not handle objects larger than
3452 * 4GB, so it does not deal with offsets that don't fit in
3453 * 32 bits. Cast down upl->offset now and make sure we
3454 * did not lose any valuable bits.
3455 */
3456 upl_offset_in_object = (unsigned int) upl->offset;
3457 assert(upl->offset == upl_offset_in_object);
3458
3459 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3460
3461 seg_size = cl_size - (upl_offset_in_object % cl_size);
3462 upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
3463 page_index = 0;
3464 page_max_index = upl->size / PAGE_SIZE;
3465 found_dirty = TRUE;
3466
3467 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
3468
3469 unsigned int seg_pgcnt;
3470
3471 seg_pgcnt = seg_size / PAGE_SIZE;
3472
3473 if (minimal_clustering == TRUE) {
3474 unsigned int non_dirty;
3475
3476 non_dirty = 0;
3477 found_dirty = FALSE;
3478
3479 for (; non_dirty < seg_pgcnt; non_dirty++) {
3480 if ((page_index + non_dirty) >= page_max_index)
3481 break;
3482
3483 if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
3484 UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
3485 found_dirty = TRUE;
3486 break;
3487 }
3488 }
3489 }
3490 if (found_dirty == TRUE) {
3491 ps_offset[seg_index] =
3492 ps_clmap(vs,
3493 upl_offset_aligned,
3494 &clmap, CL_ALLOC,
3495 cl_size, 0);
3496
3497 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3498 upl_abort(upl, 0);
3499 upl_deallocate(upl);
3500
3501 return KERN_FAILURE;
3502 }
3503 psp[seg_index] = CLMAP_PS(clmap);
3504 }
3505 if (transfer_size > seg_size) {
3506 page_index += seg_pgcnt;
3507 transfer_size -= seg_size;
3508 upl_offset_aligned += cl_size;
3509 seg_size = cl_size;
3510 seg_index++;
3511 } else
3512 transfer_size = 0;
3513 }
3514 /*
3515 * Ignore any non-present pages at the end of the
3516 * UPL.
3517 */
3518 for (page_index = upl->size / vm_page_size; page_index > 0;) {
3519 if (UPL_PAGE_PRESENT(pl, --page_index)) {
3520 page_index++;
3521 break;
3522 }
3523 }
3524 if (page_index == 0) {
3525 /*
3526 * no pages in the UPL
3527 * abort and return
3528 */
3529 upl_abort(upl, 0);
3530 upl_deallocate(upl);
3531
3532 return KERN_SUCCESS;
3533 }
3534 num_of_pages = page_index;
3535
3536 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
3537
3538 for (page_index = 0; page_index < num_of_pages; ) {
3539 /*
3540 * skip over non-dirty pages
3541 */
3542 for ( ; page_index < num_of_pages; page_index++) {
3543 if (UPL_DIRTY_PAGE(pl, page_index)
3544 || UPL_PRECIOUS_PAGE(pl, page_index))
3545 /*
3546 * this is a page we need to write
3547 * go see if we can buddy it up with
3548 * others that are contiguous to it
3549 */
3550 break;
3551 /*
3552 * if the page is not dirty, but present, we
3553 * need to commit it... This is an unusual
3554 * case since we only asked for dirty pages
3555 */
3556 if (UPL_PAGE_PRESENT(pl, page_index)) {
3557 boolean_t empty = FALSE;
3558 upl_commit_range(upl,
3559 page_index * vm_page_size,
3560 vm_page_size,
3561 UPL_COMMIT_NOTIFY_EMPTY,
3562 pl,
3563 page_list_count,
3564 &empty);
3565 if (empty) {
3566 assert(page_index ==
3567 num_of_pages - 1);
3568 upl_deallocate(upl);
3569 }
3570 }
3571 }
3572 if (page_index == num_of_pages)
3573 /*
3574 * no more pages to look at, we're out of here
3575 */
3576 break;
3577
3578 /*
3579 * gather up contiguous dirty pages... we have at
3580 * least 1, otherwise we would have bailed above...
3581 * make sure that each physical segment that we step
3582 * into is contiguous to the one we're currently in
3583 * if it's not, we have to stop and write what we have
3584 */
3585 for (first_dirty = page_index;
3586 page_index < num_of_pages; ) {
3587 if ( !UPL_DIRTY_PAGE(pl, page_index)
3588 && !UPL_PRECIOUS_PAGE(pl, page_index))
3589 break;
3590 page_index++;
3591 /*
3592 * if we just looked at the last page in the UPL
3593 * we don't need to check for physical segment
3594 * continuity
3595 */
3596 if (page_index < num_of_pages) {
3597 int cur_seg;
3598 int nxt_seg;
3599
3600 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3601 nxt_seg = (base_index + page_index)/pages_in_cl;
3602
3603 if (cur_seg != nxt_seg) {
3604 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3605 /*
3606 * if the segment we're about
3607 * to step into is not
3608 * contiguous to the one we're
3609 * currently in, or it's in a
3610 * different paging file....
3611 * we stop here and generate
3612 * the I/O
3613 */
3614 break;
3615 }
3616 }
3617 }
3618 num_dirty = page_index - first_dirty;
3619
3620 if (num_dirty) {
3621 upl_offset = first_dirty * vm_page_size;
3622 transfer_size = num_dirty * vm_page_size;
3623
3624 while (transfer_size) {
3625
3626 if ((seg_size = cl_size -
3627 ((upl_offset_in_object +
3628 upl_offset) % cl_size))
3629 > transfer_size)
3630 seg_size = transfer_size;
3631
3632 ps_vs_write_complete(
3633 vs,
3634 (upl_offset_in_object +
3635 upl_offset),
3636 seg_size, error);
3637
3638 transfer_size -= seg_size;
3639 upl_offset += seg_size;
3640 }
3641 upl_offset = first_dirty * vm_page_size;
3642 transfer_size = num_dirty * vm_page_size;
3643
3644 seg_index = (base_index + first_dirty) / pages_in_cl;
3645 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
3646
3647 error = ps_write_file(psp[seg_index],
3648 upl, upl_offset,
3649 ps_offset[seg_index]
3650 + seg_offset,
3651 transfer_size, flags);
3652 }
3653 }
3654
3655 } else {
3656 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
3657 list_size = cnt;
3658
3659 page_index = 0;
3660 /* The caller provides a mapped_data which is derived */
3661 /* from a temporary object. The targeted pages are */
3662 /* guaranteed to be set at offset 0 in the mapped_data */
3663 /* The actual offset however must still be derived */
3664 /* from the offset in the vs in question */
3665 mobj_base_addr = offset;
3666 mobj_target_addr = mobj_base_addr;
3667
3668 for (transfer_size = list_size; transfer_size != 0;) {
3669 actual_offset = ps_clmap(vs, mobj_target_addr,
3670 &clmap, CL_ALLOC,
3671 transfer_size < cl_size ?
3672 transfer_size : cl_size, 0);
3673 if(actual_offset == (dp_offset_t) -1) {
3674 error = 1;
3675 break;
3676 }
3677 cnt = MIN(transfer_size,
3678 (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
3679 ps = CLMAP_PS(clmap);
3680 /* Assume that the caller has given us contiguous */
3681 /* pages */
3682 if(cnt) {
3683 ps_vs_write_complete(vs, mobj_target_addr,
3684 cnt, error);
3685 error = ps_write_file(ps, internal_upl,
3686 0, actual_offset,
3687 cnt, flags);
3688 if (error)
3689 break;
3690 }
3691 if (error)
3692 break;
3693 actual_offset += cnt;
3694 mobj_target_addr += cnt;
3695 transfer_size -= cnt;
3696 cnt = 0;
3697
3698 if (error)
3699 break;
3700 }
3701 }
3702 if(error)
3703 return KERN_FAILURE;
3704 else
3705 return KERN_SUCCESS;
3706 }
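
/*
 * Sketch of the external (dp_internal == FALSE) calling convention
 * assumed for the pageout path; the UPL is built inside this routine,
 * so no internal_upl is supplied.  This caller-side form is an
 * assumption based on the parameter handling above, not a definitive
 * interface:
 *
 *	kr = vs_cluster_write(vs,
 *			      (upl_t) NULL,
 *			      (upl_offset_t) offset,
 *			      (upl_size_t) size,
 *			      FALSE,
 *			      0);
 */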
3707
3708 vm_size_t
3709 ps_vstruct_allocated_size(
3710 vstruct_t vs)
3711 {
3712 int num_pages;
3713 struct vs_map *vsmap;
3714 unsigned int i, j, k;
3715
3716 num_pages = 0;
3717 if (vs->vs_indirect) {
3718 /* loop on indirect maps */
3719 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3720 vsmap = vs->vs_imap[i];
3721 if (vsmap == NULL)
3722 continue;
3723 /* loop on clusters in this indirect map */
3724 for (j = 0; j < CLMAP_ENTRIES; j++) {
3725 if (VSM_ISCLR(vsmap[j]) ||
3726 VSM_ISERR(vsmap[j]))
3727 continue;
3728 /* loop on pages in this cluster */
3729 for (k = 0; k < VSCLSIZE(vs); k++) {
3730 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3731 num_pages++;
3732 }
3733 }
3734 }
3735 } else {
3736 vsmap = vs->vs_dmap;
3737 if (vsmap == NULL)
3738 return 0;
3739 /* loop on clusters in the direct map */
3740 for (j = 0; j < CLMAP_ENTRIES; j++) {
3741 if (VSM_ISCLR(vsmap[j]) ||
3742 VSM_ISERR(vsmap[j]))
3743 continue;
3744 /* loop on pages in this cluster */
3745 for (k = 0; k < VSCLSIZE(vs); k++) {
3746 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3747 num_pages++;
3748 }
3749 }
3750 }
3751
3752 return ptoa_32(num_pages);
3753 }
3754
3755 unsigned int
3756 ps_vstruct_allocated_pages(
3757 vstruct_t vs,
3758 default_pager_page_t *pages,
3759 unsigned int pages_size)
3760 {
3761 unsigned int num_pages;
3762 struct vs_map *vsmap;
3763 dp_offset_t offset;
3764 unsigned int i, j, k;
3765
3766 num_pages = 0;
3767 offset = 0;
3768 if (vs->vs_indirect) {
3769 /* loop on indirect maps */
3770 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3771 vsmap = vs->vs_imap[i];
3772 if (vsmap == NULL) {
3773 offset += (vm_page_size * CLMAP_ENTRIES *
3774 VSCLSIZE(vs));
3775 continue;
3776 }
3777 /* loop on clusters in this indirect map */
3778 for (j = 0; j < CLMAP_ENTRIES; j++) {
3779 if (VSM_ISCLR(vsmap[j]) ||
3780 VSM_ISERR(vsmap[j])) {
3781 offset += vm_page_size * VSCLSIZE(vs);
3782 continue;
3783 }
3784 /* loop on pages in this cluster */
3785 for (k = 0; k < VSCLSIZE(vs); k++) {
3786 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3787 num_pages++;
3788 if (num_pages < pages_size)
3789 pages++->dpp_offset =
3790 offset;
3791 }
3792 offset += vm_page_size;
3793 }
3794 }
3795 }
3796 } else {
3797 vsmap = vs->vs_dmap;
3798 if (vsmap == NULL)
3799 return 0;
3800 /* loop on clusters in the direct map */
3801 for (j = 0; j < CLMAP_ENTRIES; j++) {
3802 if (VSM_ISCLR(vsmap[j]) ||
3803 VSM_ISERR(vsmap[j])) {
3804 offset += vm_page_size * VSCLSIZE(vs);
3805 continue;
3806 }
3807 /* loop on pages in this cluster */
3808 for (k = 0; k < VSCLSIZE(vs); k++) {
3809 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3810 num_pages++;
3811 if (num_pages < pages_size)
3812 pages++->dpp_offset = offset;
3813 }
3814 offset += vm_page_size;
3815 }
3816 }
3817 }
3818
3819 return num_pages;
3820 }
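
/*
 * Caller-side sketch (hypothetical): the return value is the total
 * count of allocated pages, which may exceed pages_size; in that case
 * only the leading entries of "pages" were filled and the caller is
 * expected to retry with a larger array.
 *
 *	unsigned int actual;
 *
 *	actual = ps_vstruct_allocated_pages(vs, pages, pages_size);
 *	if (actual > pages_size)
 *		... grow "pages" to "actual" entries and call again ...
 */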
3821
3822
3823 kern_return_t
3824 ps_vstruct_transfer_from_segment(
3825 vstruct_t vs,
3826 paging_segment_t segment,
3827 upl_t upl)
3828 {
3829 struct vs_map *vsmap;
3830 // struct vs_map old_vsmap;
3831 // struct vs_map new_vsmap;
3832 unsigned int i, j;
3833
3834 VS_LOCK(vs); /* block all work on this vstruct */
3835 /* can't allow the normal multiple write */
3836 /* semantic because writes may conflict */
3837 vs->vs_xfer_pending = TRUE;
3838 vs_wait_for_sync_writers(vs);
3839 vs_start_write(vs);
3840 vs_wait_for_readers(vs);
3841 /* we will unlock the vs to allow other writes while transferring */
3842 /* and will be guaranteed the persistence of the vs struct */
3843 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3844 /* vs_async_pending */
3845 /* OK we now have guaranteed no other parties are accessing this */
3846 /* vs. Now that we are also supporting simple lock versions of */
3847 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3848 /* our purpose in holding it before was the multiple write case */
3849 /* we now use the boolean xfer_pending to do that. We can use */
3850 /* a boolean instead of a count because we have guaranteed single */
3851 /* file access to this code in its caller */
3852 VS_UNLOCK(vs);
3853 vs_changed:
3854 if (vs->vs_indirect) {
3855 unsigned int vsmap_size;
3856 int clmap_off;
3857 /* loop on indirect maps */
3858 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3859 vsmap = vs->vs_imap[i];
3860 if (vsmap == NULL)
3861 continue;
3862 /* loop on clusters in this indirect map */
3863 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3864 VSCLSIZE(vs) * i);
3865 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3866 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3867 else
3868 vsmap_size = CLMAP_ENTRIES;
3869 for (j = 0; j < vsmap_size; j++) {
3870 if (VSM_ISCLR(vsmap[j]) ||
3871 VSM_ISERR(vsmap[j]) ||
3872 (VSM_PS(vsmap[j]) != segment))
3873 continue;
3874 if(vs_cluster_transfer(vs,
3875 (vm_page_size * (j << vs->vs_clshift))
3876 + clmap_off,
3877 vm_page_size << vs->vs_clshift,
3878 upl)
3879 != KERN_SUCCESS) {
3880 VS_LOCK(vs);
3881 vs->vs_xfer_pending = FALSE;
3882 VS_UNLOCK(vs);
3883 vs_finish_write(vs);
3884 return KERN_FAILURE;
3885 }
3886 /* allow other readers/writers during transfer*/
3887 VS_LOCK(vs);
3888 vs->vs_xfer_pending = FALSE;
3889 VS_UNLOCK(vs);
3890 vs_finish_write(vs);
3891
3892 if (backing_store_abort_compaction || backing_store_stop_compaction) {
3893 backing_store_abort_compaction = FALSE;
3894 dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n"));
3895 return KERN_FAILURE;
3896 }
3897 vnode_pager_throttle();
3898
3899 VS_LOCK(vs);
3900 vs->vs_xfer_pending = TRUE;
3901 vs_wait_for_sync_writers(vs);
3902 vs_start_write(vs);
3903 vs_wait_for_readers(vs);
3904 VS_UNLOCK(vs);
3905 if (!(vs->vs_indirect)) {
3906 goto vs_changed;
3907 }
3908 }
3909 }
3910 } else {
3911 vsmap = vs->vs_dmap;
3912 if (vsmap == NULL) {
3913 VS_LOCK(vs);
3914 vs->vs_xfer_pending = FALSE;
3915 VS_UNLOCK(vs);
3916 vs_finish_write(vs);
3917 return KERN_SUCCESS;
3918 }
3919 /* loop on clusters in the direct map */
3920 for (j = 0; j < vs->vs_size; j++) {
3921 if (VSM_ISCLR(vsmap[j]) ||
3922 VSM_ISERR(vsmap[j]) ||
3923 (VSM_PS(vsmap[j]) != segment))
3924 continue;
3925 if(vs_cluster_transfer(vs,
3926 vm_page_size * (j << vs->vs_clshift),
3927 vm_page_size << vs->vs_clshift,
3928 upl) != KERN_SUCCESS) {
3929 VS_LOCK(vs);
3930 vs->vs_xfer_pending = FALSE;
3931 VS_UNLOCK(vs);
3932 vs_finish_write(vs);
3933 return KERN_FAILURE;
3934 }
3935 /* allow other readers/writers during transfer*/
3936 VS_LOCK(vs);
3937 vs->vs_xfer_pending = FALSE;
3938 VS_UNLOCK(vs);
3939 vs_finish_write(vs);
3940 VS_LOCK(vs);
3941 vs->vs_xfer_pending = TRUE;
3942 vs_wait_for_sync_writers(vs);
3943 vs_start_write(vs);
3944 vs_wait_for_readers(vs);
3945 VS_UNLOCK(vs);
3946 if (vs->vs_indirect) {
3947 goto vs_changed;
3948 }
3949 }
3950 }
3951
3952 VS_LOCK(vs);
3953 vs->vs_xfer_pending = FALSE;
3954 VS_UNLOCK(vs);
3955 vs_finish_write(vs);
3956 return KERN_SUCCESS;
3957 }
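
/*
 * The quiesce/resume protocol repeated above, condensed into a sketch
 * ("cluster_offset" stands for the byte offset of the cluster being
 * moved): every vs_cluster_transfer() call is bracketed by marking the
 * vstruct transfer-pending and waiting out writers and readers, then
 * the brackets are dropped so other activity can proceed between
 * clusters.
 *
 *	VS_LOCK(vs);
 *	vs->vs_xfer_pending = TRUE;
 *	vs_wait_for_sync_writers(vs);
 *	vs_start_write(vs);
 *	vs_wait_for_readers(vs);
 *	VS_UNLOCK(vs);
 *
 *	kr = vs_cluster_transfer(vs, cluster_offset,
 *				 vm_page_size << vs->vs_clshift, upl);
 *
 *	VS_LOCK(vs);
 *	vs->vs_xfer_pending = FALSE;
 *	VS_UNLOCK(vs);
 *	vs_finish_write(vs);
 */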
3958
3959
3960
3961 vs_map_t
3962 vs_get_map_entry(
3963 vstruct_t vs,
3964 dp_offset_t offset)
3965 {
3966 struct vs_map *vsmap;
3967 dp_offset_t cluster;
3968
3969 cluster = atop_32(offset) >> vs->vs_clshift;
3970 if (vs->vs_indirect) {
3971 long ind_block = cluster/CLMAP_ENTRIES;
3972
3973 /* Is the indirect block allocated? */
3974 vsmap = vs->vs_imap[ind_block];
3975 if(vsmap == (vs_map_t) NULL)
3976 return vsmap;
3977 } else
3978 vsmap = vs->vs_dmap;
3979 vsmap += cluster%CLMAP_ENTRIES;
3980 return vsmap;
3981 }
3982
3983 kern_return_t
3984 vs_cluster_transfer(
3985 vstruct_t vs,
3986 dp_offset_t offset,
3987 dp_size_t cnt,
3988 upl_t upl)
3989 {
3990 dp_offset_t actual_offset;
3991 paging_segment_t ps;
3992 struct clmap clmap;
3993 kern_return_t error = KERN_SUCCESS;
3994 unsigned int size, size_wanted;
3995 int i;
3996 unsigned int residual = 0;
3997 unsigned int unavail_size;
3998 // default_pager_thread_t *dpt;
3999 // boolean_t dealloc;
4000 struct vs_map *vsmap_ptr = NULL;
4001 struct vs_map read_vsmap;
4002 struct vs_map original_read_vsmap;
4003 struct vs_map write_vsmap;
4004 // upl_t sync_upl;
4005 // vm_offset_t ioaddr;
4006
4007 /* vs_cluster_transfer reads in the pages of a cluster and
4008 * then writes these pages back to new backing store. The
4009 * segment the pages are being read from is assumed to have
4010 * been taken off-line and is no longer considered for new
4011 * space requests.
4012 */
4013
4014 /*
4015 * This loop will be executed once per cluster referenced.
4016 * Typically this means once, since it's unlikely that the
4017 * VM system will ask for anything spanning cluster boundaries.
4018 *
4019 * If there are holes in a cluster (in a paging segment), we stop
4020 * reading at the hole, then loop again, hoping to
4021 * find valid pages later in the cluster. This continues until
4022 * the entire range has been examined and read, if present. The
4023 * pages are written as they are read. If a failure occurs after
4024 * some pages are written, the unmap call at the bottom of the loop
4025 * releases the new backing store and the old backing store remains
4026 * in effect.
4027 */
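/*
 * Illustrative sketch of the cluster arithmetic the loop below relies on
 * (the concrete numbers assume vm_page_size == 4096 and vs_clshift == 2):
 *
 *	clsize     = vm_page_size << vs_clshift;	16384 bytes per cluster
 *	clmask     = clsize - 1;			0x3fff
 *	within     = offset & clmask;			byte position inside the cluster
 *	local_size = clsize - within;			bytes left to the cluster boundary
 *
 * When (offset & clmask) == 0 the loop has crossed a cluster boundary, which
 * is the point at which write_vsmap/original_read_vsmap are flushed and reset.
 */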
4028
4029 VSM_CLR(write_vsmap);
4030 VSM_CLR(original_read_vsmap);
4031 /* grab the actual object's pages to sync with I/O */
4032 while (cnt && (error == KERN_SUCCESS)) {
4033 vsmap_ptr = vs_get_map_entry(vs, offset);
4034 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
4035
4036 if (actual_offset == (dp_offset_t) -1) {
4037
4038 /*
4039 * Nothing left to write in this cluster; at least
4040 * record the write cluster information for any previous
4041 * write and clear it for the next cluster, if there is one.
4042 */
4043 unsigned int local_size, clmask, clsize;
4044
4045 clsize = vm_page_size << vs->vs_clshift;
4046 clmask = clsize - 1;
4047 local_size = clsize - (offset & clmask);
4048 ASSERT(local_size);
4049 local_size = MIN(local_size, cnt);
4050
4051 /* This cluster has no data in it beyond what may */
4052 /* have been found on a previous iteration through */
4053 /* the loop and saved in "write_vsmap". */
4054 *vsmap_ptr = write_vsmap;
4055 VSM_CLR(write_vsmap);
4056 VSM_CLR(original_read_vsmap);
4057
4058 cnt -= local_size;
4059 offset += local_size;
4060 continue;
4061 }
4062
4063 /*
4064 * Count up contiguous available or unavailable
4065 * pages.
4066 */
4067 ps = CLMAP_PS(clmap);
4068 ASSERT(ps);
4069 size = 0;
4070 unavail_size = 0;
4071 for (i = 0;
4072 (size < cnt) && (unavail_size < cnt) &&
4073 (i < CLMAP_NPGS(clmap)); i++) {
4074 if (CLMAP_ISSET(clmap, i)) {
4075 if (unavail_size != 0)
4076 break;
4077 size += vm_page_size;
4078 BS_STAT(ps->ps_bs,
4079 ps->ps_bs->bs_pages_in++);
4080 } else {
4081 if (size != 0)
4082 break;
4083 unavail_size += vm_page_size;
4084 }
4085 }
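/*
 * At this point at most one of the two counters is non-zero: the scan above
 * stops at the first transition between present and absent pages, so either
 * "size" covers a leading run of pages backed in this cluster starting at
 * actual_offset, or "unavail_size" covers a leading run of holes, which are
 * simply unmapped and skipped below.
 */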
4086
4087 if (size == 0) {
4088 ASSERT(unavail_size);
4089 ps_clunmap(vs, offset, unavail_size);
4090 cnt -= unavail_size;
4091 offset += unavail_size;
4092 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
4093 == 0) {
4094 /* There is no more to transfer in this
4095 cluster
4096 */
4097 *vsmap_ptr = write_vsmap;
4098 VSM_CLR(write_vsmap);
4099 VSM_CLR(original_read_vsmap);
4100 }
4101 continue;
4102 }
4103
4104 if(VSM_ISCLR(original_read_vsmap))
4105 original_read_vsmap = *vsmap_ptr;
4106
4107 if(ps->ps_segtype == PS_PARTITION) {
4108 panic("swap partition not supported\n");
4109 /*NOTREACHED*/
4110 error = KERN_FAILURE;
4111 residual = size;
4112 /*
4113 NEED TO ISSUE WITH SYNC & NO COMMIT
4114 error = ps_read_device(ps, actual_offset, &buffer,
4115 size, &residual, flags);
4116 */
4117 } else {
4118 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
4119 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
4120 size, &residual,
4121 (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0)));
4122 }
4123
4124 read_vsmap = *vsmap_ptr;
4125
4126
4127 /*
4128 * Adjust counts and put the data in the new backing store. Optimize
4129 * for the common case, i.e. no error and no partial data.
4130 * If there was an error, then we need to error the entire
4131 * range, even if some data was successfully read.
4132 *
4133 */
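/*
 * The three outcomes handled below:
 *  1. error == KERN_SUCCESS and residual == 0: the whole run was read and is
 *     pushed to the new backing store with vs_cluster_write().
 *  2. error == KERN_SUCCESS but residual == size: nothing was read (treated
 *     as reading at or beyond EOF); the map entry is restored from
 *     original_read_vsmap and the range is errored.
 *  3. any other error or a partial read: also errored; the new backing store
 *     space is unmapped and the original map entry restored.
 */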
4134 if ((error == KERN_SUCCESS) && (residual == 0)) {
4135
4136 /*
4137 * Got everything we asked for, supply the data to
4138 * the new BS. Note that as a side effect of supplying
4139 * the data, the buffer holding the supplied data is
4140 * deallocated from the pager's address space unless
4141 * the write is unsuccessful.
4142 */
4143
4144 /* note: the buffer is cleaned up in all cases, either by */
4145 /* internal_cluster_write or, if the write fails, by the */
4146 /* vm_map_copy_page_discard call */
4147 *vsmap_ptr = write_vsmap;
4148
4149 if(vs_cluster_write(vs, upl, offset,
4150 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
4151 error = KERN_FAILURE;
4152 if(!(VSM_ISCLR(*vsmap_ptr))) {
4153 /* unmap the new backing store object */
4154 ps_clunmap(vs, offset, size);
4155 }
4156 /* original vsmap */
4157 *vsmap_ptr = original_read_vsmap;
4158 VSM_CLR(write_vsmap);
4159 } else {
4160 if((offset + size) &
4161 ((vm_page_size << vs->vs_clshift)
4162 - 1)) {
4163 /* There is more to transfer in this
4164 cluster
4165 */
4166 write_vsmap = *vsmap_ptr;
4167 *vsmap_ptr = read_vsmap;
4168 ps_clunmap(vs, offset, size);
4169 } else {
4170 /* discard the old backing object */
4171 write_vsmap = *vsmap_ptr;
4172 *vsmap_ptr = read_vsmap;
4173 ps_clunmap(vs, offset, size);
4174 *vsmap_ptr = write_vsmap;
4175 VSM_CLR(write_vsmap);
4176 VSM_CLR(original_read_vsmap);
4177 }
4178 }
4179 } else {
4180 size_wanted = size;
4181 if (error == KERN_SUCCESS) {
4182 if (residual == size) {
4183 /*
4184 * If a read operation returns no error
4185 * and no data moved, we turn it into
4186 * an error, assuming we're reading at
4187 * or beyond EOF.
4188 * Fall through and error the entire
4189 * range.
4190 */
4191 error = KERN_FAILURE;
4192 *vsmap_ptr = write_vsmap;
4193 if(!(VSM_ISCLR(*vsmap_ptr))) {
4194 /* unmap the new backing store object */
4195 ps_clunmap(vs, offset, size);
4196 }
4197 *vsmap_ptr = original_read_vsmap;
4198 VSM_CLR(write_vsmap);
4199 continue;
4200 } else {
4201 /*
4202 * Otherwise, we have a partial read.
4203 * This is also considered an error
4204 * for the purposes of cluster transfer.
4205 */
4206 error = KERN_FAILURE;
4207 *vsmap_ptr = write_vsmap;
4208 if(!(VSM_ISCLR(*vsmap_ptr))) {
4209 /* unmap the new backing store object */
4210 ps_clunmap(vs, offset, size);
4211 }
4212 *vsmap_ptr = original_read_vsmap;
4213 VSM_CLR(write_vsmap);
4214 continue;
4215 }
4216 }
4217
4218 }
4219 cnt -= size;
4220 offset += size;
4221
4222 } /* END while (cnt && (error == KERN_SUCCESS)) */
4223 if(!VSM_ISCLR(write_vsmap))
4224 *vsmap_ptr = write_vsmap;
4225
4226 return error;
4227 }
4228
4229 kern_return_t
4230 default_pager_add_file(
4231 MACH_PORT_FACE backing_store,
4232 vnode_ptr_t vp,
4233 int record_size,
4234 vm_size_t size)
4235 {
4236 backing_store_t bs;
4237 paging_segment_t ps;
4238 int i;
4239 unsigned int j;
4240 int error;
4241
4242 if ((bs = backing_store_lookup(backing_store))
4243 == BACKING_STORE_NULL)
4244 return KERN_INVALID_ARGUMENT;
4245
4246 PSL_LOCK();
4247 for (i = 0; i <= paging_segment_max; i++) {
4248 ps = paging_segments[i];
4249 if (ps == PAGING_SEGMENT_NULL)
4250 continue;
4251 if (ps->ps_segtype != PS_FILE)
4252 continue;
4253
4254 /*
4255 * Check that this vnode does not already back a paging segment.
4256 */
4257 if (ps->ps_vnode == (struct vnode *)vp) {
4258 PSL_UNLOCK();
4259 BS_UNLOCK(bs);
4260 return KERN_INVALID_ARGUMENT;
4261 }
4262 }
4263 PSL_UNLOCK();
4264
4265 /*
4266 * Set up the paging segment
4267 */
4268 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
4269 if (ps == PAGING_SEGMENT_NULL) {
4270 BS_UNLOCK(bs);
4271 return KERN_RESOURCE_SHORTAGE;
4272 }
4273
4274 ps->ps_segtype = PS_FILE;
4275 ps->ps_vnode = (struct vnode *)vp;
4276 ps->ps_offset = 0;
4277 ps->ps_record_shift = local_log2(vm_page_size / record_size);
4278 assert((dp_size_t) size == size);
4279 ps->ps_recnum = (dp_size_t) size;
4280 ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
4281
4282 ps->ps_pgcount = ps->ps_pgnum;
4283 ps->ps_clshift = local_log2(bs->bs_clsize);
4284 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
4285 ps->ps_special_clusters = 0;
4286 ps->ps_hint = 0;
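/*
 * Worked example of the geometry above (illustrative numbers only: a 4 KB
 * vm_page_size, 512-byte records and bs_clsize == 4 pages per cluster):
 *
 *	ps_record_shift = local_log2(4096 / 512) = 3   (8 records per page)
 *	ps_pgnum        = size >> 3
 *	ps_clshift      = local_log2(4) = 2
 *	ps_clcount      = ps_pgcount >> 2
 *
 * so a swap file presented as 262144 512-byte records (128 MB) yields
 * 32768 pages and 8192 clusters.
 */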
4287
4288 PS_LOCK_INIT(ps);
4289 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
4290 if (!ps->ps_bmap) {
4291 PS_LOCK_DESTROY(ps);
4292 kfree(ps, sizeof *ps);
4293 BS_UNLOCK(bs);
4294 return KERN_RESOURCE_SHORTAGE;
4295 }
4296 for (j = 0; j < ps->ps_ncls; j++) {
4297 clrbit(ps->ps_bmap, j);
4298 }
4299
4300 if(paging_segment_count == 0) {
4301 ps->ps_state = PS_EMERGENCY_SEGMENT;
4302 if(use_emergency_swap_file_first) {
4303 ps->ps_state |= PS_CAN_USE;
4304 }
4305 emergency_segment_backing_store = backing_store;
4306 } else {
4307 ps->ps_state = PS_CAN_USE;
4308 }
4309
4310 ps->ps_bs = bs;
4311
4312 if ((error = ps_enter(ps)) != 0) {
4313 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
4314 PS_LOCK_DESTROY(ps);
4315 kfree(ps, sizeof *ps);
4316 BS_UNLOCK(bs);
4317 return KERN_RESOURCE_SHORTAGE;
4318 }
4319
4320 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
4321 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
4322 PSL_LOCK();
4323 if(IS_PS_OK_TO_USE(ps)) {
4324 dp_pages_free += ps->ps_pgcount;
4325 } else {
4326 dp_pages_reserve += ps->ps_pgcount;
4327 }
4328 PSL_UNLOCK();
4329
4330 BS_UNLOCK(bs);
4331
4332 bs_more_space(ps->ps_clcount);
4333
4334 /*
4335 * If the paging segment being activated is not the emergency
4336 * segment and we notice that the emergency segment is being
4337 * used, then we help recover it. If all goes well, the
4338 * emergency segment will be back to its original state of
4339 * online but not activated (until it is needed again).
4340 */
4341 #if CONFIG_FREEZE
4342 if (!memorystatus_freeze_enabled)
4343 #endif
4344 {
4345 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4346 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4347 if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4348 dprintf(("Failed to recover emergency paging segment\n"));
4349 } else {
4350 dprintf(("Recovered emergency paging segment\n"));
4351 }
4352 }
4353 }
4354
4355 DP_DEBUG(DEBUG_BS_INTERNAL,
4356 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
4357 device, offset, (dp_size_t) size, record_size,
4358 ps->ps_record_shift, ps->ps_pgnum));
4359
4360 return KERN_SUCCESS;
4361 }
4362
4363
4364
4365 kern_return_t
4366 ps_read_file(
4367 paging_segment_t ps,
4368 upl_t upl,
4369 upl_offset_t upl_offset,
4370 dp_offset_t offset,
4371 upl_size_t size,
4372 unsigned int *residualp,
4373 int flags)
4374 {
4375 vm_object_offset_t f_offset;
4376 int error = 0;
4377 int result;
4378
4379 assert(dp_encryption_inited);
4380
4381 clustered_reads[atop_32(size)]++;
4382
4383 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4384
4385 /*
4386 * for transfer case we need to pass uploffset and flags
4387 */
4388 assert((upl_size_t) size == size);
4389 error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
4390
4391 /* The vnode_pagein semantic is somewhat at odds with the existing */
4392 /* device_read semantic. Partial reads are not experienced at this */
4393 /* level. It is up to the bit map code and cluster read code to */
4394 /* check that requested data locations are actually backed, and the */
4395 /* pagein code to either read all of the requested data or return an */
4396 /* error. */
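/*
 * Illustrative caller-side sketch (cf. vs_cluster_transfer above): because
 * vnode_pagein is all-or-nothing at this level, ps_read_file reports either
 * complete success with *residualp set to 0, or KERN_FAILURE with the
 * residual left as the caller initialized it.  The partial-read branches in
 * vs_cluster_transfer are therefore defensive:
 *
 *	error = ps_read_file(ps, upl, 0, actual_offset, size, &residual, flags);
 *	if (error == KERN_SUCCESS && residual == 0)
 *		push the run to the new backing store with vs_cluster_write
 *	else
 *		error the range and restore the original map entry
 */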
4397
4398 if (error)
4399 result = KERN_FAILURE;
4400 else {
4401 *residualp = 0;
4402 result = KERN_SUCCESS;
4403 }
4404 return result;
4405 }
4406
4407 kern_return_t
4408 ps_write_file(
4409 paging_segment_t ps,
4410 upl_t upl,
4411 upl_offset_t upl_offset,
4412 dp_offset_t offset,
4413 unsigned int size,
4414 int flags)
4415 {
4416 vm_object_offset_t f_offset;
4417 kern_return_t result;
4418
4419 assert(dp_encryption_inited);
4420
4421 clustered_writes[atop_32(size)]++;
4422 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4423
4424 if (flags & UPL_PAGING_ENCRYPTED) {
4425 /*
4426 * ENCRYPTED SWAP:
4427 * encrypt all the pages that we're going
4428 * to pageout.
4429 */
4430 upl_encrypt(upl, upl_offset, size);
4431 }
4432 assert((upl_size_t) size == size);
4433 if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
4434 result = KERN_FAILURE;
4435 else
4436 result = KERN_SUCCESS;
4437
4438 return result;
4439 }
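/*
 * Encrypted swap flow, as a sketch: callers derive the UPL_PAGING_ENCRYPTED
 * flag from the global dp_encryption switch (toggled via default_pager_triggers
 * with SWAP_ENCRYPT_ON / SWAP_ENCRYPT_OFF below), e.g.
 *
 *	flags = UPL_IOSYNC | UPL_NOCOMMIT |
 *		(dp_encryption ? UPL_PAGING_ENCRYPTED : 0);
 *
 * ps_write_file then encrypts the UPL contents with upl_encrypt() before
 * handing them to vnode_pageout(), and reads pass the same flag to mark the
 * on-disk data as encrypted.
 */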
4440
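/*
 * TRIM bookkeeping hooks.  In this configuration they are intentionally empty
 * stubs (every parameter is marked unused), kept so that callers can issue the
 * trim init/more/now sequence unconditionally without any effect on the vnode.
 */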
4441 static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data)
4442 {
4443 #pragma unused(data)
4444 }
4445
4446 static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data)
4447 {
4448 #pragma unused(data)
4449 }
4450
4451 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
4452 {
4453 #pragma unused(data, map, shift, length)
4454 }
4455
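/*
 * default_pager_triggers() is the control interface through which a privileged
 * client registers notification ports and toggles pager-wide behaviour.
 * Summary of the flag protocol implemented below:
 *
 *	SWAP_ENCRYPT_ON / SWAP_ENCRYPT_OFF
 *		latch dp_encryption once; fails if the encryption state was
 *		already initialized.
 *	HI_WAT_ALERT
 *		register min_pages_trigger_port; hi_wat is given in bytes and
 *		converted to pages (rejected when memorystatus freeze is enabled).
 *	LO_WAT_ALERT
 *		register max_pages_trigger_port; lo_wat is likewise in bytes
 *		(also rejected when memorystatus freeze is enabled).
 *	USE_EMERGENCY_SWAP_FILE_FIRST
 *		set use_emergency_swap_file_first so the emergency segment is
 *		marked usable when it is created (see default_pager_add_file).
 *	SWAP_FILE_CREATION_ERROR
 *		note a swap file creation failure and run
 *		no_paging_space_action().
 *
 * A send right is released on the way out in every case: either trigger_port
 * itself when it is not retained, or the previously registered port that it
 * replaces.
 */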
4456 kern_return_t
4457 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
4458 int hi_wat,
4459 int lo_wat,
4460 int flags,
4461 MACH_PORT_FACE trigger_port)
4462 {
4463 MACH_PORT_FACE release = IPC_PORT_NULL;
4464 kern_return_t kr;
4465 clock_sec_t now;
4466 clock_nsec_t nanoseconds_dummy;
4467 static clock_sec_t error_notify = 0;
4468
4469 PSL_LOCK();
4470 if (flags == SWAP_ENCRYPT_ON) {
4471 /* ENCRYPTED SWAP: turn encryption on */
4472 release = trigger_port;
4473 if (!dp_encryption_inited) {
4474 dp_encryption_inited = TRUE;
4475 dp_encryption = TRUE;
4476 kr = KERN_SUCCESS;
4477 } else {
4478 kr = KERN_FAILURE;
4479 }
4480 } else if (flags == SWAP_ENCRYPT_OFF) {
4481 /* ENCRYPTED SWAP: turn encryption off */
4482 release = trigger_port;
4483 if (!dp_encryption_inited) {
4484 dp_encryption_inited = TRUE;
4485 dp_encryption = FALSE;
4486 kr = KERN_SUCCESS;
4487 } else {
4488 kr = KERN_FAILURE;
4489 }
4490 } else if (flags == HI_WAT_ALERT) {
4491 release = min_pages_trigger_port;
4492 #if CONFIG_FREEZE
4493 /* High and low water signals aren't applicable when freeze is */
4494 /* enabled, so release the trigger ports here and return */
4495 /* KERN_FAILURE. */
4496 if (memorystatus_freeze_enabled) {
4497 if (IP_VALID( trigger_port )){
4498 ipc_port_release_send( trigger_port );
4499 }
4500 min_pages_trigger_port = IPC_PORT_NULL;
4501 kr = KERN_FAILURE;
4502 }
4503 else
4504 #endif
4505 {
4506 min_pages_trigger_port = trigger_port;
4507 minimum_pages_remaining = hi_wat/vm_page_size;
4508 bs_low = FALSE;
4509 kr = KERN_SUCCESS;
4510 }
4511 } else if (flags == LO_WAT_ALERT) {
4512 release = max_pages_trigger_port;
4513 #if CONFIG_FREEZE
4514 if (memorystatus_freeze_enabled) {
4515 if (IP_VALID( trigger_port )){
4516 ipc_port_release_send( trigger_port );
4517 }
4518 max_pages_trigger_port = IPC_PORT_NULL;
4519 kr = KERN_FAILURE;
4520 }
4521 else
4522 #endif
4523 {
4524 max_pages_trigger_port = trigger_port;
4525 maximum_pages_free = lo_wat/vm_page_size;
4526 kr = KERN_SUCCESS;
4527 }
4528 } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4529 use_emergency_swap_file_first = TRUE;
4530 release = trigger_port;
4531 kr = KERN_SUCCESS;
4532 } else if (flags == SWAP_FILE_CREATION_ERROR) {
4533 release = trigger_port;
4534 kr = KERN_SUCCESS;
4535 if( paging_segment_count == 1) {
4536 use_emergency_swap_file_first = TRUE;
4537 }
4538 no_paging_space_action();
4539 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4540 if (now > error_notify + 5) {
4541 dprintf(("Swap File Error.\n"));
4542 error_notify = now;
4543 }
4544 } else {
4545 release = trigger_port;
4546 kr = KERN_INVALID_ARGUMENT;
4547 }
4548 PSL_UNLOCK();
4549
4550 if (IP_VALID(release))
4551 ipc_port_release_send(release);
4552
4553 return kr;
4554 }
4555
4556 /*
4557 * Monitor the amount of available backing store vs. the amount of
4558 * required backing store, and notify a listener (if present) when
4559 * backing store may safely be removed.
4560 *
4561 * We attempt to avoid the situation where backing store is
4562 * discarded en masse, as this can lead to thrashing as the
4563 * backing store is compacted.
4564 */
4565
4566 #define PF_INTERVAL 3 /* time between free level checks */
4567 #define PF_LATENCY 10 /* number of intervals before release */
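/*
 * Worked example of the release rate these constants impose: the monitor
 * callout runs every PF_INTERVAL (3) seconds and only raises LO_WAT_ALERT
 * after more than PF_LATENCY (10) consecutive intervals with dp_pages_free
 * above maximum_pages_free, i.e. after roughly 30+ seconds of sustained
 * surplus; the low count is reset after each notification, so at most one
 * backing store object is released per such window.
 */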
4568
4569 static int dp_pages_free_low_count = 0;
4570 thread_call_t default_pager_backing_store_monitor_callout;
4571
4572 void
4573 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4574 __unused thread_call_param_t p2)
4575 {
4576 // unsigned long long average;
4577 ipc_port_t trigger;
4578 uint64_t deadline;
4579
4580 /*
4581 * We determine whether it will be safe to release some
4582 * backing store by watching the free page level. If
4583 * it remains above the maximum_pages_free threshold for
4584 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
4585 * then we deem it safe.
4586 *
4587 * Note that this establishes a maximum rate at which backing
4588 * store will be released, as each notification (currently)
4589 * only results in a single backing store object being
4590 * released.
4591 */
4592 if (dp_pages_free > maximum_pages_free) {
4593 dp_pages_free_low_count++;
4594 } else {
4595 dp_pages_free_low_count = 0;
4596 }
4597
4598 /* decide whether to send notification */
4599 trigger = IP_NULL;
4600 if (max_pages_trigger_port &&
4601 (backing_store_release_trigger_disable == 0) &&
4602 (dp_pages_free_low_count > PF_LATENCY)) {
4603 trigger = max_pages_trigger_port;
4604 max_pages_trigger_port = NULL;
4605 }
4606
4607 /* send notification */
4608 if (trigger != IP_NULL) {
4609 VSL_LOCK();
4610 if(backing_store_release_trigger_disable != 0) {
4611 assert_wait((event_t)
4612 &backing_store_release_trigger_disable,
4613 THREAD_UNINT);
4614 VSL_UNLOCK();
4615 thread_block(THREAD_CONTINUE_NULL);
4616 } else {
4617 VSL_UNLOCK();
4618 }
4619 dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));
4620
4621 default_pager_space_alert(trigger, LO_WAT_ALERT);
4622 ipc_port_release_send(trigger);
4623 dp_pages_free_low_count = 0;
4624 }
4625
4626 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4627 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4628 }
4629
4630 #if CONFIG_FREEZE
4631 unsigned int default_pager_swap_pages_free(void) {
4632 return dp_pages_free;
4633 }
4634 #endif