osfmk/default_pager/dp_backing_store.c

   1 /*
   2  * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56
  57 /*
  58  *      Default Pager.
  59  *              Paging File Management.
  60  */
  61
  62 #include <mach/host_priv.h>
  63 #include <mach/memory_object_control.h>
  64 #include <mach/memory_object_server.h>
  65 #include <mach/upl.h>
  66 #include <default_pager/default_pager_internal.h>
  67 #include <default_pager/default_pager_alerts.h>
  68 #include <default_pager/default_pager_object_server.h>
  69
  70 #include <ipc/ipc_types.h>
  71 #include <ipc/ipc_port.h>
  72 #include <ipc/ipc_space.h>
  73
  74 #include <kern/kern_types.h>
  75 #include <kern/host.h>
  76 #include <kern/queue.h>
  77 #include <kern/counters.h>
  78 #include <kern/sched_prim.h>
  79
  80 #include <vm/vm_kern.h>
  81 #include <vm/vm_pageout.h>
  82 #include <vm/vm_map.h>
  83 #include <vm/vm_object.h>
  84 #include <vm/vm_protos.h>
  85
  86
  87 /* todo - need large internal object support */
  88
   89 /*
   90  * ALLOC_STRIDE... the maximum number of bytes allocated from
   91  * a swap file before moving on to the next swap file... if
   92  * all swap files reside on a single disk, this value should
   93  * be very large (this is the default assumption)... if the
   94  * swap files are spread across multiple disks, then this value
   95  * should be small (128 * 1024)...
   96  *
   97  * This should be determined dynamically in the future
   98  */
   99
      /* 1024^3 bytes = 1 GiB, i.e. the "single disk" setting described above. */
  100 #define ALLOC_STRIDE  (1024 * 1024 * 1024)
      /* Not referenced in the visible part of this file; presumably a counter/tunable -- TODO confirm. */
  101 int physical_transfer_cluster_count = 0;
  102
      /*
       * VM_SUPER_CLUSTER is a byte count (0x40000 = 256 KiB); ps_delete()
       * uses it as the size of its transfer object and UPL.
       * VM_SUPER_PAGES is the same amount expressed in pages and sizes the
       * clustered_writes[]/clustered_reads[] tables below.
       */
  103 #define VM_SUPER_CLUSTER        0x40000
  104 #define VM_SUPER_PAGES          (VM_SUPER_CLUSTER / PAGE_SIZE)
  105
  106 /*
  107  * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
  108  * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
  109  */
  110 #define VSTRUCT_MIN_CLSHIFT     0
  111
      /* Compiled-in default shift: 2, i.e. 4 pages per cluster. */
  112 #define VSTRUCT_DEF_CLSHIFT     2
      /*
       * Cluster size in pages.  Stays 0 until bs_get_global_clsize() fixes it
       * to (1 << vstruct_def_clshift); after that it is never changed there.
       */
  113 int default_pager_clsize = 0;
  114
      /*
       * Shift currently used as the default.  bs_set_default_clsize() and
       * bs_get_global_clsize() may still change it while default_pager_clsize
       * is 0.
       */
  115 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
  116
  117 /* statistics */
      /*
       * Both tables have VM_SUPER_PAGES + 1 slots and are zeroed by
       * bs_initialize().  Presumably indexed by the number of pages in a
       * clustered transfer -- confirm against the code that updates them.
       */
  118 unsigned int clustered_writes[VM_SUPER_PAGES+1];
  119 unsigned int clustered_reads[VM_SUPER_PAGES+1];
 120
  121 /*
  122  * Globals used for asynchronous paging operations:
  123  *      vs_async_list:  head of list of to-be-completed I/O ops
  124  *      async_num_queued: number of pages completed, but not yet
  125  *              processed by async thread.
  126  *      async_requests_out: number of pages of requests not completed.
       *
       * NOTE: the three globals listed here are currently compiled out by
       * the "#if 0" that follows; only the free list, the lock and the
       * statistics counters below are live.
  127  */
  128
  129 #if 0
  130 struct vs_async *vs_async_list;
  131 int     async_num_queued;
  132 int     async_requests_out;
  133 #endif
  134
  135
      /*
       * VS_ASYNC_REUSE gates the vs_async_free_list reset in bs_initialize().
       * Presumably it also selects recycling of vs_async records through the
       * free list in vs_alloc_async()/vs_free_async() -- those definitions
       * are outside the visible chunk, so confirm there.
       */
  136 #define VS_ASYNC_REUSE 1
  137 struct vs_async *vs_async_free_list;
  138
  139 lck_mtx_t       default_pager_async_lock;       /* Protects globals above */
  140
  141
  142 int vs_alloc_async_failed = 0;                  /* statistics */
  143 int vs_alloc_async_count = 0;                   /* statistics */
  144 struct vs_async *vs_alloc_async(void);          /* forward */
  145 void vs_free_async(struct vs_async *vsa);       /* forward */
  146
  147
      /* Thin aliases for the allocator/free routines declared just above. */
  148 #define VS_ALLOC_ASYNC()        vs_alloc_async()
  149 #define VS_FREE_ASYNC(vsa)      vs_free_async(vsa)
  150
      /*
       * Wrappers around default_pager_async_lock.  INIT and DESTROY use the
       * pager-wide lock group (and, for INIT, the lock attributes);
       * bs_initialize() calls VS_ASYNC_LOCK_INIT() once.
       */
  151 #define VS_ASYNC_LOCK()         lck_mtx_lock(&default_pager_async_lock)
  152 #define VS_ASYNC_UNLOCK()       lck_mtx_unlock(&default_pager_async_lock)
  153 #define VS_ASYNC_LOCK_INIT()    lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
  154 #define VS_ASYNC_LOCK_DESTROY() lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp)
  155 #define VS_ASYNC_LOCK_ADDR()    (&default_pager_async_lock)
  156 /*
  157  *  Paging Space Hysteresis triggers and the target notification port
  158  *
       *  All counters start at 0 and both ports start as NULL.  The code that
       *  arms and fires the triggers is outside the visible chunk; presumably
       *  minimum_pages_remaining / maximum_pages_free are the low and high
       *  water marks paired with min_/max_pages_trigger_port -- TODO confirm.
  159  */
  160 unsigned int    dp_pages_free_drift_count = 0;
  161 unsigned int    dp_pages_free_drifted_max = 0;
  162 unsigned int    minimum_pages_remaining = 0;
  163 unsigned int    maximum_pages_free = 0;
  164 ipc_port_t      min_pages_trigger_port = NULL;
  165 ipc_port_t      max_pages_trigger_port = NULL;
  166
      /* Default depends on the build: TRUE only when CONFIG_FREEZE is set. */
  167 #if CONFIG_FREEZE
  168 boolean_t       use_emergency_swap_file_first = TRUE;
  169 #else
  170 boolean_t       use_emergency_swap_file_first = FALSE;
  171 #endif
      /* While bs_low is FALSE, ps_delete() clears backing_store_abort_compaction. */
  172 boolean_t       bs_low = FALSE;
      /*
       * ps_delete() sleeps (holding the vstruct list lock via VSL_SLEEP) for
       * as long as this is non-zero.
       */
  173 int             backing_store_release_trigger_disable = 0;
      /* Not read in the visible chunk; presumably polled by the compaction path -- confirm. */
  174 boolean_t       backing_store_stop_compaction = FALSE;
  175 boolean_t       backing_store_abort_compaction = FALSE;
  176
  177 /* Have we decided if swap needs to be encrypted yet ? */
  178 boolean_t       dp_encryption_inited = FALSE;
  179 /* Should we encrypt swap ? */
      /* When set, ps_delete() adds UPL_ENCRYPT to the flags of its transfer UPL. */
  180 boolean_t       dp_encryption = FALSE;
  181
      /* Not used in the visible chunk; by its name presumably "backing device is an SSD" -- confirm. */
  182 boolean_t       dp_isssd = FALSE;
 183
  184 /*
  185  * Object sizes are rounded up to the next power of 2,
  186  * unless they are bigger than a given maximum size.
  187  */
  188 vm_size_t       max_doubled_size = 4 * 1024 * 1024;     /* 4 meg */
  189
  190 /*
  191  * List of all backing store and segments.
  192  */
  193 MACH_PORT_FACE          emergency_segment_backing_store;
      /* bsl_queue is set up by bs_initialize() and walked by backing_store_lookup(). */
  194 struct backing_store_list_head backing_store_list;
      /*
       * Segment table.  Walkers in this file take PSL_LOCK(), scan indices
       * 0..paging_segment_max inclusive and skip PAGING_SEGMENT_NULL slots.
       */
  195 paging_segment_t        paging_segments[MAX_NUM_PAGING_SEGMENTS];
  196 lck_mtx_t                       paging_segments_lock;
  197 int                     paging_segment_max = 0;
  198 int                     paging_segment_count = 0;
      /*
       * NOTE(review): exactly five -1 initialisers are spelled out, which
       * only covers the whole array if BS_MAXPRI + 1 == 5; any further
       * elements would be zero-initialised instead -- confirm BS_MAXPRI.
       */
  199 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
  200
  201
  202 /*
  203  * Total pages free in system
  204  * This differs from clusters committed/avail which is a measure of the
  205  * over commitment of paging segments to backing store.  An idea which is
  206  * likely to be deprecated.
  207  */
  208 unsigned  int   dp_pages_free = 0;
  209 unsigned  int   dp_pages_reserve = 0;
      /*
       * ps_delete() refuses to move data (KERN_FAILURE) once dp_pages_free
       * drops below this many pages.
       */
  210 unsigned  int   cluster_transfer_minimum = 100;
 211
  212 /*
  213  * Trim state
       *
       * Passed by pointer to the ps_vnode_trim_init/_now/_more helpers that
       * are declared further down in this file.  Their definitions are not
       * in the visible chunk, so the field notes below are assumptions to be
       * confirmed against them.
  214  */
  215 struct ps_vnode_trim_data {
  216         struct vnode *vp;       /* presumably the vnode being trimmed -- confirm */
  217         dp_offset_t   offset;   /* presumably start of the pending range -- confirm */
  218         dp_size_t     length;   /* presumably size of the pending range -- confirm */
  219 };
 220
  221 /* forward declarations */
      /*
       * Prototypes for routines used before their definition.  Apart from
       * get_read_buffer(), their definitions lie outside the visible chunk.
       */
  222 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int);     /* forward */
  223 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int);     /* forward */
  224 default_pager_thread_t *get_read_buffer( void );
      /* Called by ps_delete() once per vstruct, with a freshly requested transfer UPL. */
  225 kern_return_t ps_vstruct_transfer_from_segment(
  226         vstruct_t        vs,
  227         paging_segment_t segment,
  228         upl_t            upl);
  229 kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);  /* forward */
  230 kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *);     /* forward */
  231 kern_return_t vs_cluster_transfer(
  232         vstruct_t       vs,
  233         dp_offset_t     offset,
  234         dp_size_t       cnt,
  235         upl_t           upl);
  236 vs_map_t vs_get_map_entry(
  237         vstruct_t       vs,
  238         dp_offset_t     offset);
  239
  240 kern_return_t
  241 default_pager_backing_store_delete_internal( MACH_PORT_FACE );
  242
      /* Helpers operating on struct ps_vnode_trim_data (the trim state declared above). */
  243 static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
  244 static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
  245 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);
 246
 247 default_pager_thread_t *
 248 get_read_buffer( void )
 249 {
 250         int     i;
 251
 252         DPT_LOCK(dpt_lock);
 253         while(TRUE) {
 254                 for (i=0; i<default_pager_internal_count; i++) {
 255                         if(dpt_array[i]->checked_out == FALSE) {
 256                           dpt_array[i]->checked_out = TRUE;
 257                           DPT_UNLOCK(dpt_lock);
 258                           return  dpt_array[i];
 259                         }
 260                 }
 261                 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
 262         }
 263 }
 264
 265 void
 266 bs_initialize(void)
 267 {
 268         int i;
 269
 270         /*
 271          * List of all backing store.
 272          */
 273         BSL_LOCK_INIT();
 274         queue_init(&backing_store_list.bsl_queue);
 275         PSL_LOCK_INIT();
 276
 277         VS_ASYNC_LOCK_INIT();
 278 #if     VS_ASYNC_REUSE
 279         vs_async_free_list = NULL;
 280 #endif  /* VS_ASYNC_REUSE */
 281
 282         for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
 283                 clustered_writes[i] = 0;
 284                 clustered_reads[i] = 0;
 285         }
 286
 287 }
 288
 289 /*
 290  * When things do not quite workout...
 291  */
 292 void bs_no_paging_space(boolean_t);     /* forward */
 293
 294 void
 295 bs_no_paging_space(
 296         boolean_t out_of_memory)
 297 {
 298
 299         if (out_of_memory)
 300                 dprintf(("*** OUT OF MEMORY ***\n"));
 301         panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
 302 }
 303
 304 void bs_more_space(int);        /* forward */
 305 void bs_commit(int);            /* forward */
 306
 307 boolean_t       user_warned = FALSE;
 308 unsigned int    clusters_committed = 0;
 309 unsigned int    clusters_available = 0;
 310 unsigned int    clusters_committed_peak = 0;
 311
 312 void
 313 bs_more_space(
 314         int     nclusters)
 315 {
 316         BSL_LOCK();
 317         /*
 318          * Account for new paging space.
 319          */
 320         clusters_available += nclusters;
 321
 322         if (clusters_available >= clusters_committed) {
 323                 if (verbose && user_warned) {
 324                         printf("%s%s - %d excess clusters now.\n",
 325                                my_name,
 326                                "paging space is OK now",
 327                                clusters_available - clusters_committed);
 328                         user_warned = FALSE;
 329                         clusters_committed_peak = 0;
 330                 }
 331         } else {
 332                 if (verbose && user_warned) {
 333                         printf("%s%s - still short of %d clusters.\n",
 334                                my_name,
 335                                "WARNING: paging space over-committed",
 336                                clusters_committed - clusters_available);
 337                         clusters_committed_peak -= nclusters;
 338                 }
 339         }
 340         BSL_UNLOCK();
 341
 342         return;
 343 }
 344
 345 void
 346 bs_commit(
 347         int     nclusters)
 348 {
 349         BSL_LOCK();
 350         clusters_committed += nclusters;
 351         if (clusters_committed > clusters_available) {
 352                 if (verbose && !user_warned) {
 353                         user_warned = TRUE;
 354                         printf("%s%s - short of %d clusters.\n",
 355                                my_name,
 356                                "WARNING: paging space over-committed",
 357                                clusters_committed - clusters_available);
 358                 }
 359                 if (clusters_committed > clusters_committed_peak) {
 360                         clusters_committed_peak = clusters_committed;
 361                 }
 362         } else {
 363                 if (verbose && user_warned) {
 364                         printf("%s%s - was short of up to %d clusters.\n",
 365                                my_name,
 366                                "paging space is OK now",
 367                                clusters_committed_peak - clusters_available);
 368                         user_warned = FALSE;
 369                         clusters_committed_peak = 0;
 370                 }
 371         }
 372         BSL_UNLOCK();
 373
 374         return;
 375 }
 376
 377 int default_pager_info_verbose = 1;
 378
 379 void
 380 bs_global_info(
 381         uint64_t        *totalp,
 382         uint64_t        *freep)
 383 {
 384         uint64_t                pages_total, pages_free;
 385         paging_segment_t        ps;
 386         int                     i;
 387
 388         PSL_LOCK();
 389         pages_total = pages_free = 0;
 390         for (i = 0; i <= paging_segment_max; i++) {
 391                 ps = paging_segments[i];
 392                 if (ps == PAGING_SEGMENT_NULL)
 393                         continue;
 394
 395                 /*
 396                  * no need to lock: by the time this data
 397                  * gets back to any remote requestor it
 398                  * will be obsolete anyways
 399                  */
 400                 pages_total += ps->ps_pgnum;
 401                 pages_free += ps->ps_clcount << ps->ps_clshift;
 402                 DP_DEBUG(DEBUG_BS_INTERNAL,
 403                          ("segment #%d: %d total, %d free\n",
 404                           i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
 405         }
 406         *totalp = pages_total;
 407         *freep = pages_free;
 408         if (verbose && user_warned && default_pager_info_verbose) {
 409                 if (clusters_available < clusters_committed) {
 410                         printf("%s %d clusters committed, %d available.\n",
 411                                my_name,
 412                                clusters_committed,
 413                                clusters_available);
 414                 }
 415         }
 416         PSL_UNLOCK();
 417 }
 418
 419 backing_store_t backing_store_alloc(void);      /* forward */
 420
 421 backing_store_t
 422 backing_store_alloc(void)
 423 {
 424         backing_store_t bs;
 425
 426         bs = (backing_store_t) kalloc(sizeof (struct backing_store));
 427         if (bs == BACKING_STORE_NULL)
 428                 panic("backing_store_alloc: no memory");
 429
 430         BS_LOCK_INIT(bs);
 431         bs->bs_port = MACH_PORT_NULL;
 432         bs->bs_priority = 0;
 433         bs->bs_clsize = 0;
 434         bs->bs_pages_total = 0;
 435         bs->bs_pages_in = 0;
 436         bs->bs_pages_in_fail = 0;
 437         bs->bs_pages_out = 0;
 438         bs->bs_pages_out_fail = 0;
 439
 440         return bs;
 441 }
 442
 443 backing_store_t backing_store_lookup(MACH_PORT_FACE);   /* forward */
 444
 445 /* Even in both the component space and external versions of this pager, */
 446 /* backing_store_lookup will be called from tasks in the application space */
 447 backing_store_t
 448 backing_store_lookup(
 449         MACH_PORT_FACE port)
 450 {
 451         backing_store_t bs;
 452
 453 /*
 454         port is currently backed with a vs structure in the alias field
 455         we could create an ISBS alias and a port_is_bs call but frankly
 456         I see no reason for the test, the bs->port == port check below
 457         will work properly on junk entries.
 458
 459         if ((port == MACH_PORT_NULL) || port_is_vs(port))
 460 */
 461         if (port == MACH_PORT_NULL)
 462                 return BACKING_STORE_NULL;
 463
 464         BSL_LOCK();
 465         queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
 466                       bs_links) {
 467                 BS_LOCK(bs);
 468                 if (bs->bs_port == port) {
 469                         BSL_UNLOCK();
 470                         /* Success, return it locked. */
 471                         return bs;
 472                 }
 473                 BS_UNLOCK(bs);
 474         }
 475         BSL_UNLOCK();
 476         return BACKING_STORE_NULL;
 477 }
 478
 479 void backing_store_add(backing_store_t);        /* forward */
 480
 481 void
 482 backing_store_add(
 483         __unused backing_store_t bs)
 484 {
 485 //      MACH_PORT_FACE          port = bs->bs_port;
 486 //      MACH_PORT_FACE          pset = default_pager_default_set;
 487         kern_return_t           kr = KERN_SUCCESS;
 488
 489         if (kr != KERN_SUCCESS)
 490                 panic("backing_store_add: add to set");
 491
 492 }
 493
 494 /*
 495  * Set up default page shift, but only if not already
 496  * set and argument is within range.
 497  */
 498 boolean_t
 499 bs_set_default_clsize(unsigned int npages)
 500 {
 501         switch(npages){
 502             case 1:
 503             case 2:
 504             case 4:
 505             case 8:
 506                 if (default_pager_clsize == 0)  /* if not yet set */
 507                         vstruct_def_clshift = local_log2(npages);
 508                 return(TRUE);
 509         }
 510         return(FALSE);
 511 }
 512
 513 int bs_get_global_clsize(int clsize);   /* forward */
 514
 515 int
 516 bs_get_global_clsize(
 517         int     clsize)
 518 {
 519         int                     i;
 520         memory_object_default_t dmm;
 521         kern_return_t           kr;
 522
 523         /*
 524          * Only allow setting of cluster size once. If called
 525          * with no cluster size (default), we use the compiled-in default
 526          * for the duration. The same cluster size is used for all
 527          * paging segments.
 528          */
 529         if (default_pager_clsize == 0) {
 530                 /*
 531                  * Keep cluster size in bit shift because it's quicker
 532                  * arithmetic, and easier to keep at a power of 2.
 533                  */
 534                 if (clsize != NO_CLSIZE) {
 535                         for (i = 0; (1 << i) < clsize; i++);
 536                         if (i > MAX_CLUSTER_SHIFT)
 537                                 i = MAX_CLUSTER_SHIFT;
 538                         vstruct_def_clshift = i;
 539                 }
 540                 default_pager_clsize = (1 << vstruct_def_clshift);
 541
 542                 /*
 543                  * Let the user know the new (and definitive) cluster size.
 544                  */
 545                 if (verbose)
 546                         printf("%scluster size = %d page%s\n",
 547                                 my_name, default_pager_clsize,
 548                                 (default_pager_clsize == 1) ? "" : "s");
 549
 550                 /*
 551                  * Let the kernel know too, in case it hasn't used the
 552                  * default value provided in main() yet.
 553                  */
 554                 dmm = default_pager_object;
 555                 clsize = default_pager_clsize * vm_page_size;   /* in bytes */
 556                 kr = host_default_memory_manager(host_priv_self(),
 557                                                  &dmm,
 558                                                  clsize);
 559                 memory_object_default_deallocate(dmm);
 560
 561                 if (kr != KERN_SUCCESS) {
 562                    panic("bs_get_global_cl_size:host_default_memory_manager");
 563                 }
 564                 if (dmm != default_pager_object) {
 565                   panic("bs_get_global_cl_size:there is another default pager");
 566                 }
 567         }
 568         ASSERT(default_pager_clsize > 0 &&
 569                (default_pager_clsize & (default_pager_clsize - 1)) == 0);
 570
 571         return default_pager_clsize;
 572 }
 573
 574 kern_return_t
 575 default_pager_backing_store_create(
 576         memory_object_default_t pager,
 577         int                     priority,
 578         int                     clsize,         /* in bytes */
 579         MACH_PORT_FACE          *backing_store)
 580 {
 581         backing_store_t bs;
 582         MACH_PORT_FACE  port;
 583 //      kern_return_t   kr;
 584         struct vstruct_alias *alias_struct;
 585
 586         if (pager != default_pager_object)
 587                 return KERN_INVALID_ARGUMENT;
 588
 589         bs = backing_store_alloc();
 590         port = ipc_port_alloc_kernel();
 591         ipc_port_make_send(port);
 592         assert (port != IP_NULL);
 593
 594         DP_DEBUG(DEBUG_BS_EXTERNAL,
 595                  ("priority=%d clsize=%d bs_port=0x%x\n",
 596                   priority, clsize, (int) backing_store));
 597
 598         alias_struct = (struct vstruct_alias *)
 599                                 kalloc(sizeof (struct vstruct_alias));
 600         if(alias_struct != NULL) {
 601                 alias_struct->vs = (struct vstruct *)bs;
 602                 alias_struct->name = &default_pager_ops;
 603                 port->ip_alias = (uintptr_t) alias_struct;
 604         }
 605         else {
 606                 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
 607
 608                 BS_LOCK_DESTROY(bs);
 609                 kfree(bs, sizeof (struct backing_store));
 610
 611                 return KERN_RESOURCE_SHORTAGE;
 612         }
 613
 614         bs->bs_port = port;
 615         if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
 616                 priority = BS_MAXPRI;
 617         else if (priority == BS_NOPRI)
 618                 priority = BS_MAXPRI;
 619         else
 620                 priority = BS_MINPRI;
 621         bs->bs_priority = priority;
 622
 623         bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
 624
 625         BSL_LOCK();
 626         queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
 627                     bs_links);
 628         BSL_UNLOCK();
 629
 630         backing_store_add(bs);
 631
 632         *backing_store = port;
 633         return KERN_SUCCESS;
 634 }
 635
 636 kern_return_t
 637 default_pager_backing_store_info(
 638         MACH_PORT_FACE          backing_store,
 639         backing_store_flavor_t  flavour,
 640         backing_store_info_t    info,
 641         mach_msg_type_number_t  *size)
 642 {
 643         backing_store_t                 bs;
 644         backing_store_basic_info_t      basic;
 645         int                             i;
 646         paging_segment_t                ps;
 647
 648         if (flavour != BACKING_STORE_BASIC_INFO ||
 649             *size < BACKING_STORE_BASIC_INFO_COUNT)
 650                 return KERN_INVALID_ARGUMENT;
 651
 652         basic = (backing_store_basic_info_t)info;
 653         *size = BACKING_STORE_BASIC_INFO_COUNT;
 654
 655         VSTATS_LOCK(&global_stats.gs_lock);
 656         basic->pageout_calls    = global_stats.gs_pageout_calls;
 657         basic->pagein_calls     = global_stats.gs_pagein_calls;
 658         basic->pages_in         = global_stats.gs_pages_in;
 659         basic->pages_out        = global_stats.gs_pages_out;
 660         basic->pages_unavail    = global_stats.gs_pages_unavail;
 661         basic->pages_init       = global_stats.gs_pages_init;
 662         basic->pages_init_writes= global_stats.gs_pages_init_writes;
 663         VSTATS_UNLOCK(&global_stats.gs_lock);
 664
 665         if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
 666                 return KERN_INVALID_ARGUMENT;
 667
 668         basic->bs_pages_total   = bs->bs_pages_total;
 669         PSL_LOCK();
 670         bs->bs_pages_free = 0;
 671         for (i = 0; i <= paging_segment_max; i++) {
 672                 ps = paging_segments[i];
 673                 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
 674                         PS_LOCK(ps);
 675                         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
 676                         PS_UNLOCK(ps);
 677                 }
 678         }
 679         PSL_UNLOCK();
 680         basic->bs_pages_free    = bs->bs_pages_free;
 681         basic->bs_pages_in      = bs->bs_pages_in;
 682         basic->bs_pages_in_fail = bs->bs_pages_in_fail;
 683         basic->bs_pages_out     = bs->bs_pages_out;
 684         basic->bs_pages_out_fail= bs->bs_pages_out_fail;
 685
 686         basic->bs_priority      = bs->bs_priority;
 687         basic->bs_clsize        = ptoa_32(bs->bs_clsize);       /* in bytes */
 688
 689         BS_UNLOCK(bs);
 690
 691         return KERN_SUCCESS;
 692 }
 693
  694 int ps_delete(paging_segment_t);        /* forward */
  695 boolean_t current_thread_aborted(void);
  696
      /*
       * ps_delete:
       *
       * Walk the global vstruct list and, for each vstruct, call
       * ps_vstruct_transfer_from_segment() to move its data off paging
       * segment 'ps', using a freshly allocated VM_SUPER_CLUSTER-sized
       * transfer object and UPL for every vstruct.
       *
       * Returns KERN_SUCCESS when the list is empty or every visited
       * vstruct was processed.  Returns KERN_FAILURE as soon as
       *   - dp_pages_free is below cluster_transfer_minimum,
       *   - the UPL request or the transfer itself fails, or
       *   - current_thread_aborted() reports an abort.
       * (The function is declared int but returns kern_return_t codes.)
       *
       * A vstruct is kept alive while neither the list lock nor its own
       * lock is held by bumping vs_async_pending; every exit path that took
       * such a hold drops it again and wakes any vs_async_wait() sleeper.
       */
  697 int
  698 ps_delete(
  699         paging_segment_t ps)
  700 {
  701         vstruct_t       vs;
  702         kern_return_t   error = KERN_SUCCESS;
  703         int             vs_count;
  704
  705         VSL_LOCK();             /* get the lock on the list of vs's      */
  706
  707         /* The lock relationship and sequence is fairly complicated      */
  708         /* this code looks at a live list, locking and unlocking the list */
  709         /* as it traverses it.  It depends on the locking behavior of    */
  710         /* default_pager_no_senders.  no_senders always locks the vstruct */
  711         /* targeted for removal before locking the vstruct list.  However */
  712         /* it will remove that member of the list without locking its    */
  713         /* neighbors.  We can be sure when we hold a lock on a vstruct   */
  714         /* it cannot be removed from the list but we must hold the list  */
  715         /* lock to be sure that its pointers to its neighbors are valid. */
  716         /* Also, we can hold off destruction of a vstruct when the list  */
  717         /* lock and the vs locks are not being held by bumping the       */
  718         /* vs_async_pending count.      */
  719
  720
              /* Wait (with the list lock, via VSL_SLEEP) until release triggers are enabled. */
  721         while(backing_store_release_trigger_disable != 0) {
  722                 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
  723         }
  724
  725         /* we will choose instead to hold a send right */
              /* Snapshot the element count; it bounds the number of loop iterations below. */
  726         vs_count = vstruct_list.vsl_count;
  727         vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
              /* First element == list head means the list is empty: nothing to move. */
  728         if(vs == (vstruct_t)&vstruct_list)  {
  729                 VSL_UNLOCK();
  730                 return KERN_SUCCESS;
  731         }
  732         VS_LOCK(vs);
  733         vs_async_wait(vs);  /* wait for any pending async writes */
              /*
               * NOTE(review): vs has already been dereferenced by VS_LOCK() and
               * vs_async_wait() above, so the vs != NULL half of this test
               * cannot guard anything here.
               */
  734         if ((vs_count != 0) && (vs != NULL))
  735                 vs->vs_async_pending += 1;  /* hold parties calling  */
  736                                             /* vs_async_wait */
  737
              /* Clear a stale abort request unless backing store is currently low. */
  738         if (bs_low == FALSE)
  739                 backing_store_abort_compaction = FALSE;
  740
  741         VS_UNLOCK(vs);
  742         VSL_UNLOCK();
              /* From here on no lock is held; vs is pinned only by the vs_async_pending hold. */
  743         while((vs_count != 0) && (vs != NULL)) {
  744                 /* We take the count of AMO's before beginning the         */
  745                 /* transfer of the target segment.                         */
  746                 /* We are guaranteed that the target segment cannot get    */
  747                 /* more users.  We also know that queue entries are        */
  748                 /* made at the back of the list.  If some of the entries   */
  749                 /* we would check disappear while we are traversing the    */
  750                 /* list then we will either check new entries which        */
  751                 /* do not have any backing store in the target segment     */
  752                 /* or re-check old entries.  This might not be optimal     */
  753                 /* but it will always be correct. The alternative is to    */
  754                 /* take a snapshot of the list.                            */
  755                 vstruct_t       next_vs;
  756
                      /* Refuse to start a transfer when too few free pages remain. */
  757                 if(dp_pages_free < cluster_transfer_minimum)
  758                         error = KERN_FAILURE;
  759                 else {
  760                         vm_object_t     transfer_object;
  761                         unsigned int    count;
  762                         upl_t           upl;
  763                         int             upl_flags;
  764
                              /* Scratch object + UPL, both VM_SUPER_CLUSTER bytes, used for this vs only. */
  765                         transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
  766                         count = 0;
  767                         upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
  768                                      UPL_SET_LITE | UPL_SET_INTERNAL);
  769                         if (dp_encryption) {
  770                                 /* mark the pages as "encrypted" when they come in */
  771                                 upl_flags |= UPL_ENCRYPT;
  772                         }
  773                         error = vm_object_upl_request(transfer_object,
  774                                 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
  775                                 &upl, NULL, &count, upl_flags);
  776
  777                         if(error == KERN_SUCCESS) {
  778                                 error = ps_vstruct_transfer_from_segment(
  779                                                         vs, ps, upl);
                                      /* The UPL is committed and released whether or not the transfer worked. */
  780                                 upl_commit(upl, NULL, 0);
  781                                 upl_deallocate(upl);
  782                         } else {
                                      /* Any UPL request failure is reported as plain KERN_FAILURE. */
  783                                 error = KERN_FAILURE;
  784                         }
  785                         vm_object_deallocate(transfer_object);
  786                 }
                      /* Failure or thread abort: drop our hold on vs, wake waiters, give up. */
  787                 if(error || current_thread_aborted()) {
  788                         VS_LOCK(vs);
  789                         vs->vs_async_pending -= 1;  /* release vs_async_wait */
  790                         if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
  791                                 vs->vs_waiting_async = FALSE;
  792                                 VS_UNLOCK(vs);
  793                                 thread_wakeup(&vs->vs_async_pending);
  794                         } else {
  795                                 VS_UNLOCK(vs);
  796                         }
  797                         return KERN_FAILURE;
  798                 }
  799
                      /* Re-take the list lock so vs's link to its successor can be trusted. */
  800                 VSL_LOCK();
  801
  802                 while(backing_store_release_trigger_disable != 0) {
  803                         VSL_SLEEP(&backing_store_release_trigger_disable,
  804                                   THREAD_UNINT);
  805                 }
  806
  807                 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
                      /*
                       * Pin the successor before releasing the current vs, but only
                       * if there is a real successor and another iteration will run
                       * (vs_count is decremented at the bottom of the loop).
                       */
  808                 if((next_vs != (vstruct_t)&vstruct_list) &&
  809                                 (vs != next_vs) && (vs_count != 1)) {
  810                         VS_LOCK(next_vs);
  811                         vs_async_wait(next_vs);  /* wait for any  */
  812                                                  /* pending async writes */
  813                         next_vs->vs_async_pending += 1; /* hold parties  */
  814                                                 /* calling vs_async_wait */
  815                         VS_UNLOCK(next_vs);
  816                 }
  817                 VSL_UNLOCK();
                      /* Release the hold on the vstruct just processed and wake any waiter. */
  818                 VS_LOCK(vs);
  819                 vs->vs_async_pending -= 1;
  820                 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
  821                         vs->vs_waiting_async = FALSE;
  822                         VS_UNLOCK(vs);
  823                         thread_wakeup(&vs->vs_async_pending);
  824                 } else {
  825                         VS_UNLOCK(vs);
  826                 }
                      /* Reaching the list head (or a self-link) ends the walk. */
  827                 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
  828                         vs = NULL;
  829                 else
  830                         vs = next_vs;
  831                 vs_count--;
  832         }
  833         return KERN_SUCCESS;
  834 }
 835
 836
 837 kern_return_t
 838 default_pager_backing_store_delete_internal(
 839         MACH_PORT_FACE backing_store)
 840 {
 841         backing_store_t         bs;
 842         int                     i;
 843         paging_segment_t        ps;
 844         int                     error;
 845         int                     interim_pages_removed = 0;
 846         boolean_t               dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
 847
 848         if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
 849                 return KERN_INVALID_ARGUMENT;
 850
 851 restart:
 852         PSL_LOCK();
 853         error = KERN_SUCCESS;
 854         for (i = 0; i <= paging_segment_max; i++) {
 855                 ps = paging_segments[i];
 856                 if (ps != PAGING_SEGMENT_NULL &&
 857                     ps->ps_bs == bs &&
 858                     ! IS_PS_GOING_AWAY(ps)) {
 859                         PS_LOCK(ps);
 860
 861                         if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
 862                         /*
 863                          * Someone is already busy reclamining this paging segment.
 864                          * If it's the emergency segment we are looking at then check
 865                          * that someone has not already recovered it and set the right
 866                          * state i.e. online but not activated.
 867                          */
 868                                 PS_UNLOCK(ps);
 869                                 continue;
 870                         }
 871
 872                         /* disable access to this segment */
 873                         ps->ps_state &= ~PS_CAN_USE;
 874                         ps->ps_state |= PS_GOING_AWAY;
 875                         PS_UNLOCK(ps);
 876                         /*
 877                          * The "ps" segment is "off-line" now,
 878                          * we can try and delete it...
 879                          */
 880                         if(dp_pages_free < (cluster_transfer_minimum
 881                                                         + ps->ps_pgcount)) {
 882                                 error = KERN_FAILURE;
 883                                 PSL_UNLOCK();
 884                         }
 885                         else {
 886                                 /* remove all pages associated with the  */
 887                                 /* segment from the list of free pages   */
 888                                 /* when transfer is through, all target  */
 889                                 /* segment pages will appear to be free  */
 890
 891                                 dp_pages_free -=  ps->ps_pgcount;
 892                                 interim_pages_removed += ps->ps_pgcount;
 893                                 PSL_UNLOCK();
 894                                 error = ps_delete(ps);
 895                         }
 896                         if (error != KERN_SUCCESS) {
 897                                 /*
 898                                  * We couldn't delete the segment,
 899                                  * probably because there's not enough
 900                                  * virtual memory left.
 901                                  * Re-enable all the segments.
 902                                  */
 903                                 PSL_LOCK();
 904                                 break;
 905                         }
 906                         goto restart;
 907                 }
 908         }
 909
 910         if (error != KERN_SUCCESS) {
 911                 for (i = 0; i <= paging_segment_max; i++) {
 912                         ps = paging_segments[i];
 913                         if (ps != PAGING_SEGMENT_NULL &&
 914                             ps->ps_bs == bs &&
 915                             IS_PS_GOING_AWAY(ps)) {
 916                                 PS_LOCK(ps);
 917
 918                                 if( !IS_PS_GOING_AWAY(ps)) {
 919                                         PS_UNLOCK(ps);
 920                                         continue;
 921                                 }
 922                                 /* Handle the special clusters that came in while we let go the lock*/
 923                                 if( ps->ps_special_clusters) {
 924                                         dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
 925                                         ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
 926                                         ps->ps_clcount += ps->ps_special_clusters;
 927                                         if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
 928                                                 ps_select_array[ps->ps_bs->bs_priority] = 0;
 929                                         }
 930                                         ps->ps_special_clusters = 0;
 931                                 }
 932                                 /* re-enable access to this segment */
 933                                 ps->ps_state &= ~PS_GOING_AWAY;
 934                                 ps->ps_state |= PS_CAN_USE;
 935                                 PS_UNLOCK(ps);
 936                         }
 937                 }
 938                 dp_pages_free += interim_pages_removed;
 939                 PSL_UNLOCK();
 940                 BS_UNLOCK(bs);
 941                 return error;
 942         }
 943
 944         for (i = 0; i <= paging_segment_max; i++) {
 945                 ps = paging_segments[i];
 946                 if (ps != PAGING_SEGMENT_NULL &&
 947                     ps->ps_bs == bs) {
 948                         if(IS_PS_GOING_AWAY(ps)) {
 949                                 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
 950                                         PS_LOCK(ps);
 951                                         ps->ps_state &= ~PS_GOING_AWAY;
 952                                         ps->ps_special_clusters = 0;
 953                                         ps->ps_pgcount = ps->ps_pgnum;
 954                                         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
 955                                         dp_pages_reserve += ps->ps_pgcount;
 956                                         PS_UNLOCK(ps);
 957                                 } else {
 958                                         paging_segments[i] = PAGING_SEGMENT_NULL;
 959                                         paging_segment_count--;
 960                                         PS_LOCK(ps);
 961                                         kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
 962                                         kfree(ps, sizeof *ps);
 963                                 }
 964                         }
 965                 }
 966         }
 967
 968         /* Scan the entire ps array separately to make certain we find the */
 969         /* proper paging_segment_max                                       */
 970         for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
 971                 if(paging_segments[i] != PAGING_SEGMENT_NULL)
 972                    paging_segment_max = i;
 973         }
 974
 975         PSL_UNLOCK();
 976
 977         if( dealing_with_emergency_segment ) {
 978                 BS_UNLOCK(bs);
 979                 return KERN_SUCCESS;
 980         }
 981
 982         /*
 983          * All the segments have been deleted.
 984          * We can remove the backing store.
 985          */
 986
 987         /*
 988          * Disable lookups of this backing store.
 989          */
 990         if((void *)bs->bs_port->ip_alias != NULL)
 991                 kfree((void *) bs->bs_port->ip_alias,
 992                       sizeof (struct vstruct_alias));
 993         ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
 994         bs->bs_port = MACH_PORT_NULL;
 995         BS_UNLOCK(bs);
 996
 997         /*
 998          * Remove backing store from backing_store list.
 999          */
1000         BSL_LOCK();
1001         queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
1002                      bs_links);
1003         BSL_UNLOCK();
1004
1005         /*
1006          * Free the backing store structure.
1007          */
1008         BS_LOCK_DESTROY(bs);
1009         kfree(bs, sizeof *bs);
1010
1011         return KERN_SUCCESS;
1012 }
1013
1014 kern_return_t
1015 default_pager_backing_store_delete(
1016         MACH_PORT_FACE backing_store)
1017 {
1018         if( backing_store != emergency_segment_backing_store ) {
1019                 default_pager_backing_store_delete_internal(emergency_segment_backing_store);
1020         }
1021         return(default_pager_backing_store_delete_internal(backing_store));
1022 }
1023
1024 int     ps_enter(paging_segment_t);     /* forward */
1025
1026 int
1027 ps_enter(
1028         paging_segment_t ps)
1029 {
1030         int i;
1031
1032         PSL_LOCK();
1033
1034         for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
1035                 if (paging_segments[i] == PAGING_SEGMENT_NULL)
1036                         break;
1037         }
1038
1039         if (i < MAX_NUM_PAGING_SEGMENTS) {
1040                 paging_segments[i] = ps;
1041                 if (i > paging_segment_max)
1042                         paging_segment_max = i;
1043                 paging_segment_count++;
1044                 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1045                         (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1046                         ps_select_array[ps->ps_bs->bs_priority] = 0;
1047                 i = 0;
1048         } else {
1049                 PSL_UNLOCK();
1050                 return KERN_RESOURCE_SHORTAGE;
1051         }
1052
1053         PSL_UNLOCK();
1054         return i;
1055 }
1056
1057 #ifdef DEVICE_PAGING
1058 kern_return_t
1059 default_pager_add_segment(
1060         MACH_PORT_FACE  backing_store,
1061         MACH_PORT_FACE  device,
1062         recnum_t        offset,
1063         recnum_t        count,
1064         int             record_size)
1065 {
1066         backing_store_t         bs;
1067         paging_segment_t        ps;
1068         int                     i;
1069         int                     error;
1070
1071         if ((bs = backing_store_lookup(backing_store))
1072             == BACKING_STORE_NULL)
1073                 return KERN_INVALID_ARGUMENT;
1074
1075         PSL_LOCK();
1076         for (i = 0; i <= paging_segment_max; i++) {
1077                 ps = paging_segments[i];
1078                 if (ps == PAGING_SEGMENT_NULL)
1079                         continue;
1080
1081                 /*
1082                  * Check for overlap on same device.
1083                  */
1084                 if (!(ps->ps_device != device
1085                       || offset >= ps->ps_offset + ps->ps_recnum
1086                       || offset + count <= ps->ps_offset)) {
1087                         PSL_UNLOCK();
1088                         BS_UNLOCK(bs);
1089                         return KERN_INVALID_ARGUMENT;
1090                 }
1091         }
1092         PSL_UNLOCK();
1093
1094         /*
1095          * Set up the paging segment
1096          */
1097         ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1098         if (ps == PAGING_SEGMENT_NULL) {
1099                 BS_UNLOCK(bs);
1100                 return KERN_RESOURCE_SHORTAGE;
1101         }
1102
1103         ps->ps_segtype = PS_PARTITION;
1104         ps->ps_device = device;
1105         ps->ps_offset = offset;
1106         ps->ps_record_shift = local_log2(vm_page_size / record_size);
1107         ps->ps_recnum = count;
1108         ps->ps_pgnum = count >> ps->ps_record_shift;
1109
1110         ps->ps_pgcount = ps->ps_pgnum;
1111         ps->ps_clshift = local_log2(bs->bs_clsize);
1112         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1113         ps->ps_hint = 0;
1114
1115         PS_LOCK_INIT(ps);
1116         ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1117         if (!ps->ps_bmap) {
1118                 PS_LOCK_DESTROY(ps);
1119                 kfree(ps, sizeof *ps);
1120                 BS_UNLOCK(bs);
1121                 return KERN_RESOURCE_SHORTAGE;
1122         }
1123         for (i = 0; i < ps->ps_ncls; i++) {
1124                 clrbit(ps->ps_bmap, i);
1125         }
1126
1127         if(paging_segment_count == 0) {
1128                 ps->ps_state = PS_EMERGENCY_SEGMENT;
1129                 if(use_emergency_swap_file_first) {
1130                         ps->ps_state |= PS_CAN_USE;
1131                 }
1132         } else {
1133                 ps->ps_state = PS_CAN_USE;
1134         }
1135
1136         ps->ps_bs = bs;
1137
1138         if ((error = ps_enter(ps)) != 0) {
1139                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1140
1141                 PS_LOCK_DESTROY(ps);
1142                 kfree(ps, sizeof *ps);
1143                 BS_UNLOCK(bs);
1144                 return KERN_RESOURCE_SHORTAGE;
1145         }
1146
1147         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1148         bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1149         BS_UNLOCK(bs);
1150
1151         PSL_LOCK();
1152         if(IS_PS_OK_TO_USE(ps)) {
1153                 dp_pages_free += ps->ps_pgcount;
1154         } else {
1155                 dp_pages_reserve += ps->ps_pgcount;
1156         }
1157         PSL_UNLOCK();
1158
1159         bs_more_space(ps->ps_clcount);
1160
1161         DP_DEBUG(DEBUG_BS_INTERNAL,
1162                  ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1163                   device, offset, count, record_size,
1164                   ps->ps_record_shift, ps->ps_pgnum));
1165
1166         return KERN_SUCCESS;
1167 }
1168
1169 boolean_t
1170 bs_add_device(
1171         char            *dev_name,
1172         MACH_PORT_FACE  master)
1173 {
1174         security_token_t        null_security_token = {
1175                 { 0, 0 }
1176         };
1177         MACH_PORT_FACE  device;
1178         int             info[DEV_GET_SIZE_COUNT];
1179         mach_msg_type_number_t info_count;
1180         MACH_PORT_FACE  bs = MACH_PORT_NULL;
1181         unsigned int    rec_size;
1182         recnum_t        count;
1183         int             clsize;
1184         MACH_PORT_FACE  reply_port;
1185
1186         if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1187                         null_security_token, dev_name, &device))
1188                 return FALSE;
1189
1190         info_count = DEV_GET_SIZE_COUNT;
1191         if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1192                 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1193                 count = info[DEV_GET_SIZE_DEVICE_SIZE] /  rec_size;
1194                 clsize = bs_get_global_clsize(0);
1195                 if (!default_pager_backing_store_create(
1196                                         default_pager_object,
1197                                         DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1198                                         (clsize * vm_page_size),
1199                                         &bs)) {
1200                         if (!default_pager_add_segment(bs, device,
1201                                                        0, count, rec_size)) {
1202                                 return TRUE;
1203                         }
1204                         ipc_port_release_receive(bs);
1205                 }
1206         }
1207
1208         ipc_port_release_send(device);
1209         return FALSE;
1210 }
1211 #endif /* DEVICE_PAGING */
1212
1213 #if     VS_ASYNC_REUSE
1214
1215 struct vs_async *
1216 vs_alloc_async(void)
1217 {
1218         struct vs_async *vsa;
1219         MACH_PORT_FACE  reply_port;
1220 //      kern_return_t   kr;
1221
1222         VS_ASYNC_LOCK();
1223         if (vs_async_free_list == NULL) {
1224                 VS_ASYNC_UNLOCK();
1225                 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1226                 if (vsa != NULL) {
1227                         /*
1228                          * Try allocating a reply port named after the
1229                          * address of the vs_async structure.
1230                          */
1231                         struct vstruct_alias    *alias_struct;
1232
1233                         reply_port = ipc_port_alloc_kernel();
1234                         alias_struct = (struct vstruct_alias *)
1235                                 kalloc(sizeof (struct vstruct_alias));
1236                         if(alias_struct != NULL) {
1237                                 alias_struct->vs = (struct vstruct *)vsa;
1238                                 alias_struct->name = &default_pager_ops;
1239                                 reply_port->ip_alias = (uintptr_t) alias_struct;
1240                                 vsa->reply_port = reply_port;
1241                                 vs_alloc_async_count++;
1242                         }
1243                         else {
1244                                 vs_alloc_async_failed++;
1245                                 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1246                                                                 (reply_port));
1247                                 kfree(vsa, sizeof (struct vs_async));
1248                                 vsa = NULL;
1249                         }
1250                 }
1251         } else {
1252                 vsa = vs_async_free_list;
1253                 vs_async_free_list = vs_async_free_list->vsa_next;
1254                 VS_ASYNC_UNLOCK();
1255         }
1256
1257         return vsa;
1258 }
1259
1260 void
1261 vs_free_async(
1262         struct vs_async *vsa)
1263 {
1264         VS_ASYNC_LOCK();
1265         vsa->vsa_next = vs_async_free_list;
1266         vs_async_free_list = vsa;
1267         VS_ASYNC_UNLOCK();
1268 }
1269
1270 #else   /* VS_ASYNC_REUSE */
1271
1272 struct vs_async *
1273 vs_alloc_async(void)
1274 {
1275         struct vs_async *vsa;
1276         MACH_PORT_FACE  reply_port;
1277         kern_return_t   kr;
1278
1279         vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1280         if (vsa != NULL) {
1281                 /*
1282                  * Try allocating a reply port named after the
1283                  * address of the vs_async structure.
1284                  */
1285                         reply_port = ipc_port_alloc_kernel();
1286                         alias_struct = (vstruct_alias *)
1287                                 kalloc(sizeof (struct vstruct_alias));
1288                         if(alias_struct != NULL) {
1289                                 alias_struct->vs = reply_port;
1290                                 alias_struct->name = &default_pager_ops;
1291                                 reply_port->defpager_importance.alias = (int) vsa;
1292                                 vsa->reply_port = reply_port;
1293                                 vs_alloc_async_count++;
1294                         }
1295                         else {
1296                                 vs_alloc_async_failed++;
1297                                 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1298                                                                 (reply_port));
1299                                 kfree(vsa, sizeof (struct vs_async));
1300                                 vsa = NULL;
1301                         }
1302         }
1303
1304         return vsa;
1305 }
1306
1307 void
1308 vs_free_async(
1309         struct vs_async *vsa)
1310 {
1311         MACH_PORT_FACE  reply_port;
1312         kern_return_t   kr;
1313
1314         reply_port = vsa->reply_port;
1315         kfree(reply_port->ip_alias, sizeof (struct vstuct_alias));
1316         kfree(vsa, sizeof (struct vs_async));
1317         ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1318 #if 0
1319         VS_ASYNC_LOCK();
1320         vs_alloc_async_count--;
1321         VS_ASYNC_UNLOCK();
1322 #endif
1323 }
1324
1325 #endif  /* VS_ASYNC_REUSE */
1326
1327 zone_t  vstruct_zone;
1328
1329 vstruct_t
1330 ps_vstruct_create(
1331         dp_size_t size)
1332 {
1333         vstruct_t       vs;
1334         unsigned int    i;
1335
1336         vs = (vstruct_t) zalloc(vstruct_zone);
1337         if (vs == VSTRUCT_NULL) {
1338                 return VSTRUCT_NULL;
1339         }
1340
1341         VS_LOCK_INIT(vs);
1342
1343         /*
1344          * The following fields will be provided later.
1345          */
1346         vs->vs_pager_ops = NULL;
1347         vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1348         vs->vs_references = 1;
1349         vs->vs_seqno = 0;
1350
1351         vs->vs_waiting_seqno = FALSE;
1352         vs->vs_waiting_read = FALSE;
1353         vs->vs_waiting_write = FALSE;
1354         vs->vs_waiting_async = FALSE;
1355
1356         vs->vs_readers = 0;
1357         vs->vs_writers = 0;
1358
1359         vs->vs_errors = 0;
1360
1361         vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1362         vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1363         vs->vs_async_pending = 0;
1364
1365         /*
1366          * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1367          * depending on the size of the memory object.
1368          */
1369         if (INDIRECT_CLMAP(vs->vs_size)) {
1370                 vs->vs_imap = (struct vs_map **)
1371                         kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1372                 vs->vs_indirect = TRUE;
1373         } else {
1374                 vs->vs_dmap = (struct vs_map *)
1375                         kalloc(CLMAP_SIZE(vs->vs_size));
1376                 vs->vs_indirect = FALSE;
1377         }
1378         vs->vs_xfer_pending = FALSE;
1379         DP_DEBUG(DEBUG_VS_INTERNAL,
1380                  ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1381
1382         /*
1383          * Check to see that we got the space.
1384          */
1385         if (!vs->vs_dmap) {
1386                 kfree(vs, sizeof *vs);
1387                 return VSTRUCT_NULL;
1388         }
1389
1390         /*
1391          * Zero the indirect pointers, or clear the direct pointers.
1392          */
1393         if (vs->vs_indirect)
1394                 memset(vs->vs_imap, 0,
1395                        INDIRECT_CLMAP_SIZE(vs->vs_size));
1396         else
1397                 for (i = 0; i < vs->vs_size; i++)
1398                         VSM_CLR(vs->vs_dmap[i]);
1399
1400         VS_MAP_LOCK_INIT(vs);
1401
1402         bs_commit(vs->vs_size);
1403
1404         return vs;
1405 }
1406
1407 paging_segment_t ps_select_segment(unsigned int, int *);        /* forward */
1408
1409 paging_segment_t
1410 ps_select_segment(
1411         unsigned int    shift,
1412         int             *psindex)
1413 {
1414         paging_segment_t        ps;
1415         int                     i;
1416         int                     j;
1417
1418         /*
1419          * Optimize case where there's only one segment.
1420          * paging_segment_max will index the one and only segment.
1421          */
1422
1423         PSL_LOCK();
1424         if (paging_segment_count == 1) {
1425                 paging_segment_t lps = PAGING_SEGMENT_NULL;     /* used to avoid extra PS_UNLOCK */
1426                 ipc_port_t trigger = IP_NULL;
1427
1428                 ps = paging_segments[paging_segment_max];
1429                 *psindex = paging_segment_max;
1430                 PS_LOCK(ps);
1431                 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1432                         panic("Emergency paging segment missing\n");
1433                 }
1434                 ASSERT(ps->ps_clshift >= shift);
1435                 if(IS_PS_OK_TO_USE(ps)) {
1436                         if (ps->ps_clcount) {
1437                                 ps->ps_clcount--;
1438                                 dp_pages_free -=  1 << ps->ps_clshift;
1439                                 ps->ps_pgcount -=  1 << ps->ps_clshift;
1440                                 if(min_pages_trigger_port &&
1441                                   (dp_pages_free < minimum_pages_remaining)) {
1442                                         trigger = min_pages_trigger_port;
1443                                         min_pages_trigger_port = NULL;
1444                                         bs_low = TRUE;
1445                                         backing_store_abort_compaction = TRUE;
1446                                 }
1447                                 lps = ps;
1448                         }
1449                 }
1450                 PS_UNLOCK(ps);
1451
1452                 if( lps == PAGING_SEGMENT_NULL ) {
1453                         if(dp_pages_free) {
1454                                 dp_pages_free_drift_count++;
1455                                 if(dp_pages_free > dp_pages_free_drifted_max) {
1456                                         dp_pages_free_drifted_max = dp_pages_free;
1457                                 }
1458                                 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1459                         }
1460                         dp_pages_free = 0;
1461                 }
1462
1463                 PSL_UNLOCK();
1464
1465                 if (trigger != IP_NULL) {
1466                         dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1467
1468                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1469                         ipc_port_release_send(trigger);
1470                 }
1471                 return lps;
1472         }
1473
1474         if (paging_segment_count == 0) {
1475                 if(dp_pages_free) {
1476                         dp_pages_free_drift_count++;
1477                         if(dp_pages_free > dp_pages_free_drifted_max) {
1478                                 dp_pages_free_drifted_max = dp_pages_free;
1479                         }
1480                         dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1481                 }
1482                 dp_pages_free = 0;
1483                 PSL_UNLOCK();
1484                 return PAGING_SEGMENT_NULL;
1485         }
1486
1487         for (i = BS_MAXPRI;
1488              i >= BS_MINPRI; i--) {
1489                 int start_index;
1490
1491                 if ((ps_select_array[i] == BS_NOPRI) ||
1492                                 (ps_select_array[i] == BS_FULLPRI))
1493                         continue;
1494                 start_index = ps_select_array[i];
1495
1496                 if(!(paging_segments[start_index])) {
1497                         j = start_index+1;
1498                         physical_transfer_cluster_count = 0;
1499                 }
1500                 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1501                                 (((paging_segments[start_index])->ps_clshift)
1502                                 + vm_page_shift))) {
1503                         physical_transfer_cluster_count = 0;
1504                         j = start_index + 1;
1505                 } else {
1506                         physical_transfer_cluster_count+=1;
1507                         j = start_index;
1508                         if(start_index == 0)
1509                                 start_index = paging_segment_max;
1510                         else
1511                                 start_index = start_index - 1;
1512                 }
1513
1514                 while (1) {
1515                         if (j > paging_segment_max)
1516                                 j = 0;
1517                         if ((ps = paging_segments[j]) &&
1518                             (ps->ps_bs->bs_priority == i)) {
1519                                 /*
1520                                  * Force the ps cluster size to be
1521                                  * >= that of the vstruct.
1522                                  */
1523                                 PS_LOCK(ps);
1524                                 if (IS_PS_OK_TO_USE(ps)) {
1525                                         if ((ps->ps_clcount) &&
1526                                                    (ps->ps_clshift >= shift)) {
1527                                                 ipc_port_t trigger = IP_NULL;
1528
1529                                                 ps->ps_clcount--;
1530                                                 dp_pages_free -=  1 << ps->ps_clshift;
1531                                                 ps->ps_pgcount -=  1 << ps->ps_clshift;
1532                                                 if(min_pages_trigger_port &&
1533                                                         (dp_pages_free <
1534                                                         minimum_pages_remaining)) {
1535                                                         trigger = min_pages_trigger_port;
1536                                                         min_pages_trigger_port = NULL;
1537                                                         bs_low = TRUE;
1538                                                         backing_store_abort_compaction = TRUE;
1539                                                 }
1540                                                 PS_UNLOCK(ps);
1541                                                 /*
1542                                                  * found one, quit looking.
1543                                                  */
1544                                                 ps_select_array[i] = j;
1545                                                 PSL_UNLOCK();
1546
1547                                                 if (trigger != IP_NULL) {
1548                                                         dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1549
1550                                                         default_pager_space_alert(
1551                                                                 trigger,
1552                                                                 HI_WAT_ALERT);
1553                                                         ipc_port_release_send(trigger);
1554                                                 }
1555                                                 *psindex = j;
1556                                                 return ps;
1557                                         }
1558                                 }
1559                                 PS_UNLOCK(ps);
1560                         }
1561                         if (j == start_index) {
1562                                 /*
1563                                  * none at this priority -- mark it full
1564                                  */
1565                                 ps_select_array[i] = BS_FULLPRI;
1566                                 break;
1567                         }
1568                         j++;
1569                 }
1570         }
1571
1572         if(dp_pages_free) {
1573                 dp_pages_free_drift_count++;
1574                 if(dp_pages_free > dp_pages_free_drifted_max) {
1575                         dp_pages_free_drifted_max = dp_pages_free;
1576                 }
1577                 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1578         }
1579         dp_pages_free = 0;
1580         PSL_UNLOCK();
1581         return PAGING_SEGMENT_NULL;
1582 }
1583
1584 dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1585
1586 dp_offset_t
1587 ps_allocate_cluster(
1588         vstruct_t               vs,
1589         int                     *psindex,
1590         paging_segment_t        use_ps)
1591 {
1592         unsigned int            byte_num;
1593         int                     bit_num = 0;
1594         paging_segment_t        ps;
1595         dp_offset_t             cluster;
1596         ipc_port_t              trigger = IP_NULL;
1597
1598         /*
1599          * Find best paging segment.
1600          * ps_select_segment will decrement cluster count on ps.
1601          * Must pass cluster shift to find the most appropriate segment.
1602          */
1603         /* NOTE:  The addition of paging segment delete capability threatened
1604          * to seriously complicate the treatment of paging segments in this
1605          * module and the ones that call it (notably ps_clmap), because of the
1606          * difficulty in assuring that the paging segment would continue to
1607          * exist between being unlocked and locked.   This was
1608          * avoided because all calls to this module are based in either
1609          * dp_memory_object calls which rely on the vs lock, or by
1610          * the transfer function which is part of the segment delete path.
1611          * The transfer function which is part of paging segment delete is
1612          * protected from multiple callers by the backing store lock.
1613          * The paging segment delete function treats mappings to a paging
1614          * segment on a vstruct by vstruct basis, locking the vstruct targeted
1615          * while data is transferred to the remaining segments.  This is in
1616          * line with the view that incomplete or in-transition mappings between
1617          * data, a vstruct, and backing store are protected by the vs lock.
1618          * This and the ordering of the paging segment "going_away" bit setting
1619          * protects us.
1620          */
1621 retry:
1622         if (use_ps != PAGING_SEGMENT_NULL) {
1623                 ps = use_ps;
1624                 PSL_LOCK();
1625                 PS_LOCK(ps);
1626
1627                 ASSERT(ps->ps_clcount != 0);
1628
1629                 ps->ps_clcount--;
1630                 dp_pages_free -=  1 << ps->ps_clshift;
1631                 ps->ps_pgcount -=  1 << ps->ps_clshift;
1632                 if(min_pages_trigger_port &&
1633                                 (dp_pages_free < minimum_pages_remaining)) {
1634                         trigger = min_pages_trigger_port;
1635                         min_pages_trigger_port = NULL;
1636                         bs_low = TRUE;
1637                         backing_store_abort_compaction = TRUE;
1638                 }
1639                 PSL_UNLOCK();
1640                 PS_UNLOCK(ps);
1641                 if (trigger != IP_NULL) {
1642                         dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1643
1644                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1645                         ipc_port_release_send(trigger);
1646                 }
1647
1648         } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1649                    PAGING_SEGMENT_NULL) {
1650                 static clock_sec_t lastnotify = 0;
1651                 clock_sec_t now;
1652                 clock_nsec_t nanoseconds_dummy;
1653
1654                 /*
1655                  * Don't immediately jump to the emergency segment. Give the
1656                  * dynamic pager a chance to create it's first normal swap file.
1657                  * Unless, of course the very first normal swap file can't be
1658                  * created due to some problem and we didn't expect that problem
1659                  * i.e. use_emergency_swap_file_first was never set to true initially.
1660                  * It then gets set in the swap file creation error handling.
1661                  */
1662                 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1663
1664                         ps = paging_segments[EMERGENCY_PSEG_INDEX];
1665                         if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1666                                 PSL_LOCK();
1667                                 PS_LOCK(ps);
1668
1669                                 if(IS_PS_GOING_AWAY(ps)) {
1670                                         /* Someone de-activated the emergency paging segment*/
1671                                         PS_UNLOCK(ps);
1672                                         PSL_UNLOCK();
1673
1674                                 } else if(dp_pages_free) {
1675                                         /*
1676                                          * Someone has already activated the emergency paging segment
1677                                          * OR
1678                                          * Between us having rec'd a NULL segment from ps_select_segment
1679                                          * and reaching here a new normal segment could have been added.
1680                                          * E.g. we get NULL segment and another thread just added the
1681                                          * new swap file. Hence check to see if we have more dp_pages_free
1682                                          * before activating the emergency segment.
1683                                          */
1684                                         PS_UNLOCK(ps);
1685                                         PSL_UNLOCK();
1686                                         goto retry;
1687
1688                                 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1689                                         /*
1690                                          * PS_CAN_USE is only reset from the emergency segment when it's
1691                                          * been successfully recovered. So it's legal to have an emergency
1692                                          * segment that has PS_CAN_USE but no clusters because it's recovery
1693                                          * failed.
1694                                          */
1695                                         backing_store_t bs = ps->ps_bs;
1696                                         ps->ps_state |= PS_CAN_USE;
1697                                         if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1698                                                 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1699                                                 ps_select_array[bs->bs_priority] = 0;
1700                                         }
1701                                         dp_pages_free += ps->ps_pgcount;
1702                                         dp_pages_reserve -= ps->ps_pgcount;
1703                                         PS_UNLOCK(ps);
1704                                         PSL_UNLOCK();
1705                                         dprintf(("Switching ON Emergency paging segment\n"));
1706                                         goto retry;
1707                                 }
1708
1709                                 PS_UNLOCK(ps);
1710                                 PSL_UNLOCK();
1711                         }
1712                 }
1713
1714                 /*
1715                  * Emit a notification of the low-paging resource condition
1716                  * but don't issue it more than once every five seconds.  This
1717                  * prevents us from overflowing logs with thousands of
1718                  * repetitions of the message.
1719                  */
1720                 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1721                 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1722                         /* With an activated emergency paging segment we still
1723                          * didn't get any clusters. This could mean that the
1724                          * emergency paging segment is exhausted.
1725                          */
1726                         dprintf(("System is out of paging space.\n"));
1727                         lastnotify = now;
1728                 }
1729
1730                 PSL_LOCK();
1731
1732                 if(min_pages_trigger_port) {
1733                         trigger = min_pages_trigger_port;
1734                         min_pages_trigger_port = NULL;
1735                         bs_low = TRUE;
1736                         backing_store_abort_compaction = TRUE;
1737                 }
1738                 PSL_UNLOCK();
1739                 if (trigger != IP_NULL) {
1740                         dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1741
1742                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1743                         ipc_port_release_send(trigger);
1744                 }
1745                 return (dp_offset_t) -1;
1746         }
1747
1748         /*
1749          * Look for an available cluster.  At the end of the loop,
1750          * byte_num is the byte offset and bit_num is the bit offset of the
1751          * first zero bit in the paging segment bitmap.
1752          */
1753         PS_LOCK(ps);
1754         byte_num = ps->ps_hint;
1755         for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1756                 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1757                         for (bit_num = 0; bit_num < NBBY; bit_num++) {
1758                                 if (isclr((ps->ps_bmap + byte_num), bit_num))
1759                                         break;
1760                         }
1761                         ASSERT(bit_num != NBBY);
1762                         break;
1763                 }
1764         }
1765         ps->ps_hint = byte_num;
1766         cluster = (byte_num*NBBY) + bit_num;
1767
1768         /* Space was reserved, so this must be true */
1769         ASSERT(cluster < ps->ps_ncls);
1770
1771         setbit(ps->ps_bmap, cluster);
1772         PS_UNLOCK(ps);
1773
1774         return cluster;
1775 }
1776
1777 void ps_deallocate_cluster(paging_segment_t, dp_offset_t);      /* forward */
1778
1779 void
1780 ps_deallocate_cluster(
1781         paging_segment_t        ps,
1782         dp_offset_t             cluster)
1783 {
1784
1785         if (cluster >= ps->ps_ncls)
1786                 panic("ps_deallocate_cluster: Invalid cluster number");
1787
1788         /*
1789          * Lock the paging segment, clear the cluster's bitmap and increment the
1790          * number of free cluster.
1791          */
1792         PSL_LOCK();
1793         PS_LOCK(ps);
1794         clrbit(ps->ps_bmap, cluster);
1795         if( IS_PS_OK_TO_USE(ps)) {
1796                 ++ps->ps_clcount;
1797                 ps->ps_pgcount +=  1 << ps->ps_clshift;
1798                 dp_pages_free +=  1 << ps->ps_clshift;
1799         } else {
1800                 ps->ps_special_clusters += 1;
1801         }
1802
1803         /*
1804          * Move the hint down to the freed cluster if it is
1805          * less than the current hint.
1806          */
1807         if ((cluster/NBBY) < ps->ps_hint) {
1808                 ps->ps_hint = (cluster/NBBY);
1809         }
1810
1811
1812         /*
1813          * If we're freeing space on a full priority, reset the array.
1814          */
1815         if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1816                 ps_select_array[ps->ps_bs->bs_priority] = 0;
1817         PS_UNLOCK(ps);
1818         PSL_UNLOCK();
1819
1820         return;
1821 }
1822
1823 void ps_dealloc_vsmap(struct vs_map *, dp_size_t);      /* forward */
1824
1825 void
1826 ps_dealloc_vsmap(
1827         struct vs_map   *vsmap,
1828         dp_size_t       size)
1829 {
1830         unsigned int i;
1831         struct ps_vnode_trim_data trim_data;
1832
1833         ps_vnode_trim_init(&trim_data);
1834
1835         for (i = 0; i < size; i++) {
1836                 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
1837                         ps_vnode_trim_more(&trim_data,
1838                                               &vsmap[i],
1839                                               VSM_PS(vsmap[i])->ps_clshift,
1840                                               vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
1841                         ps_deallocate_cluster(VSM_PS(vsmap[i]),
1842                                               VSM_CLOFF(vsmap[i]));
1843                 } else {
1844                         ps_vnode_trim_now(&trim_data);
1845                 }
1846         }
1847         ps_vnode_trim_now(&trim_data);
1848 }
1849
1850 void
1851 ps_vstruct_dealloc(
1852         vstruct_t vs)
1853 {
1854         unsigned int    i;
1855 //      spl_t   s;
1856
1857         VS_MAP_LOCK(vs);
1858
1859         /*
1860          * If this is an indirect structure, then we walk through the valid
1861          * (non-zero) indirect pointers and deallocate the clusters
1862          * associated with each used map entry (via ps_dealloc_vsmap).
1863          * When all of the clusters in an indirect block have been
1864          * freed, we deallocate the block.  When all of the indirect
1865          * blocks have been deallocated we deallocate the memory
1866          * holding the indirect pointers.
1867          */
1868         if (vs->vs_indirect) {
1869                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1870                         if (vs->vs_imap[i] != NULL) {
1871                                 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1872                                 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1873                         }
1874                 }
1875                 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1876         } else {
1877                 /*
1878                  * Direct map.  Free used clusters, then memory.
1879                  */
1880                 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1881                 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1882         }
1883         VS_MAP_UNLOCK(vs);
1884
1885         bs_commit(- vs->vs_size);
1886
1887         VS_MAP_LOCK_DESTROY(vs);
1888
1889         zfree(vstruct_zone, vs);
1890 }
1891
1892 kern_return_t
1893 ps_vstruct_reclaim(
1894         vstruct_t vs,
1895         boolean_t return_to_vm,
1896         boolean_t reclaim_backing_store)
1897 {
1898         unsigned int    i, j;
1899         struct vs_map   *vsmap;
1900         boolean_t       vsmap_all_clear, vsimap_all_clear;
1901         struct vm_object_fault_info fault_info;
1902         int             clmap_off;
1903         unsigned int    vsmap_size;
1904         kern_return_t   kr = KERN_SUCCESS;
1905
1906         VS_MAP_LOCK(vs);
1907
1908         fault_info.cluster_size = VM_SUPER_CLUSTER;
1909         fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
1910         fault_info.user_tag = 0;
1911         fault_info.lo_offset = 0;
1912         fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
1913         fault_info.io_sync = reclaim_backing_store;
1914         fault_info.batch_pmap_op = FALSE;
1915
1916         /*
1917          * If this is an indirect structure, then we walk through the valid
1918          * (non-zero) indirect pointers and deallocate the clusters
1919          * associated with each used map entry (via ps_dealloc_vsmap).
1920          * When all of the clusters in an indirect block have been
1921          * freed, we deallocate the block.  When all of the indirect
1922          * blocks have been deallocated we deallocate the memory
1923          * holding the indirect pointers.
1924          */
1925         if (vs->vs_indirect) {
1926                 vsimap_all_clear = TRUE;
1927                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1928                         vsmap = vs->vs_imap[i];
1929                         if (vsmap == NULL)
1930                                 continue;
1931                         /* loop on clusters in this indirect map */
1932                         clmap_off = (vm_page_size * CLMAP_ENTRIES *
1933                                      VSCLSIZE(vs) * i);
1934                         if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
1935                                 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
1936                         else
1937                                 vsmap_size = CLMAP_ENTRIES;
1938                         vsmap_all_clear = TRUE;
1939                         if (return_to_vm) {
1940                                 for (j = 0; j < vsmap_size;) {
1941                                         if (VSM_ISCLR(vsmap[j]) ||
1942                                             VSM_ISERR(vsmap[j])) {
1943                                                 j++;
1944                                                 clmap_off += vm_page_size * VSCLSIZE(vs);
1945                                                 continue;
1946                                         }
1947                                         VS_MAP_UNLOCK(vs);
1948                                         kr = pvs_cluster_read(
1949                                                 vs,
1950                                                 clmap_off,
1951                                                 (dp_size_t) -1, /* read whole cluster */
1952                                                 &fault_info);
1953
1954                                         VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1955                                         if (kr != KERN_SUCCESS) {
1956                                                 vsmap_all_clear = FALSE;
1957                                                 vsimap_all_clear = FALSE;
1958
1959                                                 kr = KERN_MEMORY_ERROR;
1960                                                 goto out;
1961                                         }
1962                                 }
1963                         }
1964                         if (vsmap_all_clear) {
1965                                 ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
1966                                 kfree(vsmap, CLMAP_THRESHOLD);
1967                                 vs->vs_imap[i] = NULL;
1968                         }
1969                 }
1970                 if (vsimap_all_clear) {
1971 //                      kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1972                 }
1973         } else {
1974                 /*
1975                  * Direct map.  Free used clusters, then memory.
1976                  */
1977                 vsmap = vs->vs_dmap;
1978                 if (vsmap == NULL) {
1979                         goto out;
1980                 }
1981                 vsmap_all_clear = TRUE;
1982                 /* loop on clusters in the direct map */
1983                 if (return_to_vm) {
1984                         for (j = 0; j < vs->vs_size;) {
1985                                 if (VSM_ISCLR(vsmap[j]) ||
1986                                     VSM_ISERR(vsmap[j])) {
1987                                         j++;
1988                                         continue;
1989                                 }
1990                                 clmap_off = vm_page_size * (j << vs->vs_clshift);
1991                                 VS_MAP_UNLOCK(vs);
1992                                 kr = pvs_cluster_read(
1993                                         vs,
1994                                         clmap_off,
1995                                         (dp_size_t) -1, /* read whole cluster */
1996                                         &fault_info);
1997
1998                                 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1999                                 if (kr != KERN_SUCCESS) {
2000                                         vsmap_all_clear = FALSE;
2001
2002                                         kr = KERN_MEMORY_ERROR;
2003                                         goto out;
2004                                 } else {
2005 //                                      VSM_CLR(vsmap[j]);
2006                                 }
2007                         }
2008                 }
2009                 if (vsmap_all_clear) {
2010                         ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
2011 //                      kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
2012                 }
2013         }
2014 out:
2015         VS_MAP_UNLOCK(vs);
2016
2017         return kr;
2018 }
2019
2020 int ps_map_extend(vstruct_t, unsigned int);     /* forward */
2021
2022 int ps_map_extend(
2023         vstruct_t       vs,
2024         unsigned int    new_size)
2025 {
2026         struct vs_map   **new_imap;
2027         struct vs_map   *new_dmap = NULL;
2028         int             newdsize;
2029         int             i;
2030         void            *old_map = NULL;
2031         int             old_map_size = 0;
2032
2033         if (vs->vs_size >= new_size) {
2034                 /*
2035                  * Someone has already done the work.
2036                  */
2037                 return 0;
2038         }
2039
2040         /*
2041          * If the new size extends into the indirect range, then we have one
2042          * of two cases: we are going from indirect to indirect, or we are
2043          * going from direct to indirect.  If we are going from indirect to
2044          * indirect, then it is possible that the new size will fit in the old
2045          * indirect map.  If this is the case, then just reset the size of the
2046          * vstruct map and we are done.  If the new size will not
2047          * fit into the old indirect map, then we have to allocate a new
2048          * indirect map and copy the old map pointers into this new map.
2049          *
2050          * If we are going from direct to indirect, then we have to allocate a
2051          * new indirect map and copy the old direct pages into the first
2052          * indirect page of the new map.
2053          * NOTE: allocating memory here is dangerous, as we're in the
2054          * pageout path.
2055          */
2056         if (INDIRECT_CLMAP(new_size)) {
2057                 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
2058
2059                 /*
2060                  * Get a new indirect map and zero it.
2061                  */
2062                 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
2063                 if (vs->vs_indirect &&
2064                     (new_map_size == old_map_size)) {
2065                         bs_commit(new_size - vs->vs_size);
2066                         vs->vs_size = new_size;
2067                         return 0;
2068                 }
2069
2070                 new_imap = (struct vs_map **)kalloc(new_map_size);
2071                 if (new_imap == NULL) {
2072                         return -1;
2073                 }
2074                 memset(new_imap, 0, new_map_size);
2075
2076                 if (vs->vs_indirect) {
2077                         /* Copy old entries into new map */
2078                         memcpy(new_imap, vs->vs_imap, old_map_size);
2079                         /* Arrange to free the old map */
2080                         old_map = (void *) vs->vs_imap;
2081                         newdsize = 0;
2082                 } else {        /* Old map was a direct map */
2083                         /* Allocate an indirect page */
2084                         if ((new_imap[0] = (struct vs_map *)
2085                              kalloc(CLMAP_THRESHOLD)) == NULL) {
2086                                 kfree(new_imap, new_map_size);
2087                                 return -1;
2088                         }
2089                         new_dmap = new_imap[0];
2090                         newdsize = CLMAP_ENTRIES;
2091                 }
2092         } else {
2093                 new_imap = NULL;
2094                 newdsize = new_size;
2095                 /*
2096                  * If the new map is a direct map, then the old map must
2097                  * also have been a direct map.  All we have to do is
2098                  * to allocate a new direct map, copy the old entries
2099                  * into it and free the old map.
2100                  */
2101                 if ((new_dmap = (struct vs_map *)
2102                      kalloc(CLMAP_SIZE(new_size))) == NULL) {
2103                         return -1;
2104                 }
2105         }
2106         if (newdsize) {
2107
2108                 /* Free the old map */
2109                 old_map = (void *) vs->vs_dmap;
2110                 old_map_size = CLMAP_SIZE(vs->vs_size);
2111
2112                 /* Copy info from the old map into the new map */
2113                 memcpy(new_dmap, vs->vs_dmap, old_map_size);
2114
2115                 /* Initialize the rest of the new map */
2116                 for (i = vs->vs_size; i < newdsize; i++)
2117                         VSM_CLR(new_dmap[i]);
2118         }
2119         if (new_imap) {
2120                 vs->vs_imap = new_imap;
2121                 vs->vs_indirect = TRUE;
2122         } else
2123                 vs->vs_dmap = new_dmap;
2124         bs_commit(new_size - vs->vs_size);
2125         vs->vs_size = new_size;
2126         if (old_map)
2127                 kfree(old_map, old_map_size);
2128         return 0;
2129 }
2130
2131 dp_offset_t
2132 ps_clmap(
2133         vstruct_t       vs,
2134         dp_offset_t     offset,
2135         struct clmap    *clmap,
2136         int             flag,
2137         dp_size_t       size,
2138         int             error)
2139 {
2140         dp_offset_t     cluster;        /* The cluster of offset.       */
2141         dp_offset_t     newcl;          /* The new cluster allocated.   */
2142         dp_offset_t     newoff;
2143         unsigned int    i;
2144         struct vs_map   *vsmap;
2145
2146         VS_MAP_LOCK(vs);
2147
2148         ASSERT(vs->vs_dmap);
2149         cluster = atop_32(offset) >> vs->vs_clshift;
2150
2151         /*
2152          * Initialize cluster error value
2153          */
2154         clmap->cl_error = 0;
2155
2156         /*
2157          * If the object has grown, extend the page map.
2158          */
2159         if (cluster >= vs->vs_size) {
2160                 if (flag == CL_FIND) {
2161                         /* Do not allocate if just doing a lookup */
2162                         VS_MAP_UNLOCK(vs);
2163                         return (dp_offset_t) -1;
2164                 }
2165                 if (ps_map_extend(vs, cluster + 1)) {
2166                         VS_MAP_UNLOCK(vs);
2167                         return (dp_offset_t) -1;
2168                 }
2169         }
2170
2171         /*
2172          * Look for the desired cluster.  If the map is indirect, then we
2173          * have a two level lookup.  First find the indirect block, then
2174          * find the actual cluster.  If the indirect block has not yet
2175          * been allocated, then do so.  If the cluster has not yet been
2176          * allocated, then do so.
2177          *
2178          * If any of the allocations fail, then return an error.
2179          * Don't allocate if just doing a lookup.
2180          */
2181         if (vs->vs_indirect) {
2182                 long    ind_block = cluster/CLMAP_ENTRIES;
2183
2184                 /* Is the indirect block allocated? */
2185                 vsmap = vs->vs_imap[ind_block];
2186                 if (vsmap == NULL) {
2187                         if (flag == CL_FIND) {
2188                                 VS_MAP_UNLOCK(vs);
2189                                 return (dp_offset_t) -1;
2190                         }
2191
2192                         /* Allocate the indirect block */
2193                         vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
2194                         if (vsmap == NULL) {
2195                                 VS_MAP_UNLOCK(vs);
2196                                 return (dp_offset_t) -1;
2197                         }
2198                         /* Initialize the cluster offsets */
2199                         for (i = 0; i < CLMAP_ENTRIES; i++)
2200                                 VSM_CLR(vsmap[i]);
2201                         vs->vs_imap[ind_block] = vsmap;
2202                 }
2203         } else
2204                 vsmap = vs->vs_dmap;
2205
2206         ASSERT(vsmap);
2207         vsmap += cluster%CLMAP_ENTRIES;
2208
2209         /*
2210          * At this point, vsmap points to the struct vs_map desired.
2211          *
2212          * Look in the map for the cluster, if there was an error on a
2213          * previous write, flag it and return.  If it is not yet
2214          * allocated, then allocate it, if we're writing; if we're
2215          * doing a lookup and the cluster's not allocated, return error.
2216          */
2217         if (VSM_ISERR(*vsmap)) {
2218                 clmap->cl_error = VSM_GETERR(*vsmap);
2219                 VS_MAP_UNLOCK(vs);
2220                 return (dp_offset_t) -1;
2221         } else if (VSM_ISCLR(*vsmap)) {
2222                 int psindex;
2223
2224                 if (flag == CL_FIND) {
2225                         /*
2226                          * If there's an error and the entry is clear, then
2227                          * we've run out of swap space.  Record the error
2228                          * here and return.
2229                          */
2230                         if (error) {
2231                                 VSM_SETERR(*vsmap, error);
2232                         }
2233                         VS_MAP_UNLOCK(vs);
2234                         return (dp_offset_t) -1;
2235                 } else {
2236                         /*
2237                          * Attempt to allocate a cluster from the paging segment
2238                          */
2239                         newcl = ps_allocate_cluster(vs, &psindex,
2240                                                     PAGING_SEGMENT_NULL);
2241                         if (newcl == (dp_offset_t) -1) {
2242                                 VS_MAP_UNLOCK(vs);
2243                                 return (dp_offset_t) -1;
2244                         }
2245                         VSM_CLR(*vsmap);
2246                         VSM_SETCLOFF(*vsmap, newcl);
2247                         VSM_SETPS(*vsmap, psindex);
2248                 }
2249         } else
2250                 newcl = VSM_CLOFF(*vsmap);
2251
2252         /*
2253          * Fill in pertinent fields of the clmap
2254          */
2255         clmap->cl_ps = VSM_PS(*vsmap);
2256         clmap->cl_numpages = VSCLSIZE(vs);
2257         clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2258
2259         /*
2260          * Byte offset in paging segment is byte offset to cluster plus
2261          * byte offset within cluster.  It looks ugly, but should be
2262          * relatively quick.
2263          */
2264         ASSERT(trunc_page(offset) == offset);
2265         newcl = ptoa_32(newcl) << vs->vs_clshift;
2266         newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2267         if (flag == CL_ALLOC) {
2268                 /*
2269                  * set bits in the allocation bitmap according to which
2270                  * pages were requested.  size is in bytes.
2271                  */
2272                 i = atop_32(newoff);
2273                 while ((size > 0) && (i < VSCLSIZE(vs))) {
2274                         VSM_SETALLOC(*vsmap, i);
2275                         i++;
2276                         size -= vm_page_size;
2277                 }
2278         }
2279         clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2280         if (newoff) {
2281                 /*
2282                  * Offset is not cluster aligned, so number of pages
2283                  * and bitmaps must be adjusted
2284                  */
2285                 clmap->cl_numpages -= atop_32(newoff);
2286                 CLMAP_SHIFT(clmap, vs);
2287                 CLMAP_SHIFTALLOC(clmap, vs);
2288         }
2289
2290         /*
2291          *
2292          * The setting of valid bits and handling of write errors
2293          * must be done here, while we hold the lock on the map.
2294          * It logically should be done in ps_vs_write_complete().
2295          * The size and error information has been passed from
2296          * ps_vs_write_complete().  If the size parameter is non-zero,
2297          * then there is work to be done.  If error is also non-zero,
2298          * then the error number is recorded in the cluster and the
2299          * entire cluster is in error.
2300          */
2301         if (size && flag == CL_FIND) {
2302                 dp_offset_t off = (dp_offset_t) 0;
2303
2304                 if (!error) {
2305                         for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2306                              i++) {
2307                                 VSM_SETPG(*vsmap, i);
2308                                 size -= vm_page_size;
2309                         }
2310                         ASSERT(i <= VSCLSIZE(vs));
2311                 } else {
2312                         BS_STAT(clmap->cl_ps->ps_bs,
2313                                 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
2314                                         atop_32(size));
2315                         off = VSM_CLOFF(*vsmap);
2316                         VSM_SETERR(*vsmap, error);
2317                 }
2318                 /*
2319                  * Deallocate cluster if error, and no valid pages
2320                  * already present.
2321                  */
2322                 if (off != (dp_offset_t) 0)
2323                         ps_deallocate_cluster(clmap->cl_ps, off);
2324                 VS_MAP_UNLOCK(vs);
2325                 return (dp_offset_t) 0;
2326         } else
2327                 VS_MAP_UNLOCK(vs);
2328
2329         DP_DEBUG(DEBUG_VS_INTERNAL,
2330                  ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2331                   newcl+newoff, (int) vs, (int) vsmap, flag));
2332         DP_DEBUG(DEBUG_VS_INTERNAL,
2333                  ("     clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2334                   (int) clmap->cl_ps, clmap->cl_numpages,
2335                   (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
2336
2337         return (newcl + newoff);
2338 }
2339
2340 void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t);     /* forward */
2341
2342 void
2343 ps_clunmap(
2344         vstruct_t       vs,
2345         dp_offset_t     offset,
2346         dp_size_t       length)
2347 {
2348         dp_offset_t             cluster; /* The cluster number of offset */
2349         struct vs_map           *vsmap;
2350         struct ps_vnode_trim_data trim_data;
2351
2352         ps_vnode_trim_init(&trim_data);
2353
2354         VS_MAP_LOCK(vs);
2355
2356         /*
2357          * Loop through all clusters in this range, freeing paging segment
2358          * clusters and map entries as encountered.
2359          */
2360         while (length > 0) {
2361                 dp_offset_t     newoff;
2362                 unsigned int    i;
2363
2364                 cluster = atop_32(offset) >> vs->vs_clshift;
2365                 if (vs->vs_indirect)    /* indirect map */
2366                         vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2367                 else
2368                         vsmap = vs->vs_dmap;
2369                 if (vsmap == NULL) {
2370                         ps_vnode_trim_now(&trim_data);
2371                         VS_MAP_UNLOCK(vs);
2372                         return;
2373                 }
2374                 vsmap += cluster%CLMAP_ENTRIES;
2375                 if (VSM_ISCLR(*vsmap)) {
2376                         ps_vnode_trim_now(&trim_data);
2377                         length -= vm_page_size;
2378                         offset += vm_page_size;
2379                         continue;
2380                 }
2381                 /*
2382                  * We've got a valid mapping.  Clear it and deallocate
2383                  * paging segment cluster pages.
2384                  * Optimize for entire cluster cleraing.
2385                  */
2386                 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2387                         /*
2388                          * Not cluster aligned.
2389                          */
2390                         ASSERT(trunc_page(newoff) == newoff);
2391                         i = atop_32(newoff);
2392                 } else
2393                         i = 0;
2394                 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2395                         VSM_CLRPG(*vsmap, i);
2396                         VSM_CLRALLOC(*vsmap, i);
2397                         length -= vm_page_size;
2398                         offset += vm_page_size;
2399                         i++;
2400                 }
2401
2402                 /*
2403                  * If map entry is empty, clear and deallocate cluster.
2404                  */
2405                 if (!VSM_BMAP(*vsmap)) {
2406                         ps_vnode_trim_more(&trim_data,
2407                                               vsmap,
2408                                               vs->vs_clshift,
2409                                               VSCLSIZE(vs) * vm_page_size);
2410                         ps_deallocate_cluster(VSM_PS(*vsmap),
2411                                               VSM_CLOFF(*vsmap));
2412                         VSM_CLR(*vsmap);
2413                 } else {
2414                         ps_vnode_trim_now(&trim_data);
2415                 }
2416         }
2417         ps_vnode_trim_now(&trim_data);
2418
2419         VS_MAP_UNLOCK(vs);
2420 }
2421
2422 void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
2423
2424 void
2425 ps_vs_write_complete(
2426         vstruct_t       vs,
2427         dp_offset_t     offset,
2428         dp_size_t       size,
2429         int             error)
2430 {
2431         struct clmap    clmap;
2432
2433         /*
2434          * Get the struct vsmap for this cluster.
2435          * Use READ, even though it was written, because the
2436          * cluster MUST be present, unless there was an error
2437          * in the original ps_clmap (e.g. no space), in which
2438          * case, nothing happens.
2439          *
2440          * Must pass enough information to ps_clmap to allow it
2441          * to set the vs_map structure bitmap under lock.
2442          */
2443         (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2444 }
2445
2446 void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int);    /* forward */
2447
2448 void
2449 vs_cl_write_complete(
2450         vstruct_t                       vs,
2451         __unused paging_segment_t       ps,
2452         dp_offset_t                     offset,
2453         __unused vm_offset_t            addr,
2454         dp_size_t                       size,
2455         boolean_t                       async,
2456         int                             error)
2457 {
2458 //      kern_return_t   kr;
2459
2460         if (error) {
2461                 /*
2462                  * For internal objects, the error is recorded on a
2463                  * per-cluster basis by ps_clmap() which is called
2464                  * by ps_vs_write_complete() below.
2465                  */
2466                 dprintf(("write failed error = 0x%x\n", error));
2467                 /* add upl_abort code here */
2468         } else
2469                 GSTAT(global_stats.gs_pages_out += atop_32(size));
2470         /*
2471          * Notify the vstruct mapping code, so it can do its accounting.
2472          */
2473         ps_vs_write_complete(vs, offset, size, error);
2474
2475         if (async) {
2476                 VS_LOCK(vs);
2477                 ASSERT(vs->vs_async_pending > 0);
2478                 vs->vs_async_pending -= size;
2479                 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2480                         vs->vs_waiting_async = FALSE;
2481                         VS_UNLOCK(vs);
2482                         thread_wakeup(&vs->vs_async_pending);
2483                 } else {
2484                         VS_UNLOCK(vs);
2485                 }
2486         }
2487 }
2488
2489 #ifdef DEVICE_PAGING
2490 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2491
2492 kern_return_t
2493 device_write_reply(
2494         MACH_PORT_FACE  reply_port,
2495         kern_return_t   device_code,
2496         io_buf_len_t    bytes_written)
2497 {
2498         struct vs_async *vsa;
2499
2500         vsa = (struct vs_async *)
2501                 ((struct vstruct_alias *)(reply_port->ip_alias))->vs;
2502
2503         if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2504                 device_code = KERN_FAILURE;
2505         }
2506
2507         vsa->vsa_error = device_code;
2508
2509
2510         ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2511         if(vsa->vsa_flags & VSA_TRANSFER) {
2512                 /* revisit when async disk segments redone */
2513                 if(vsa->vsa_error) {
2514                    /* need to consider error condition.  re-write data or */
2515                    /* throw it away here. */
2516                    vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2517                 }
2518                 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2519                                                 vsa->vsa_size, vsa->vsa_error);
2520         } else {
2521                 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2522                              vsa->vsa_addr, vsa->vsa_size, TRUE,
2523                              vsa->vsa_error);
2524         }
2525         VS_FREE_ASYNC(vsa);
2526
2527         return KERN_SUCCESS;
2528 }
2529
2530 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2531 kern_return_t
2532 device_write_reply_inband(
2533         MACH_PORT_FACE          reply_port,
2534         kern_return_t           return_code,
2535         io_buf_len_t            bytes_written)
2536 {
2537         panic("device_write_reply_inband: illegal");
2538         return KERN_SUCCESS;
2539 }
2540
2541 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2542 kern_return_t
2543 device_read_reply(
2544         MACH_PORT_FACE          reply_port,
2545         kern_return_t           return_code,
2546         io_buf_ptr_t            data,
2547         mach_msg_type_number_t  dataCnt)
2548 {
2549         struct vs_async *vsa;
2550         vsa = (struct vs_async *)
2551                 ((struct vstruct_alias *)(reply_port->defpager_importance.alias))->vs;
2552         vsa->vsa_addr = (vm_offset_t)data;
2553         vsa->vsa_size = (vm_size_t)dataCnt;
2554         vsa->vsa_error = return_code;
2555         thread_wakeup(&vsa);
2556         return KERN_SUCCESS;
2557 }
2558
2559 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2560 kern_return_t
2561 device_read_reply_inband(
2562         MACH_PORT_FACE          reply_port,
2563         kern_return_t           return_code,
2564         io_buf_ptr_inband_t     data,
2565         mach_msg_type_number_t  dataCnt)
2566 {
2567         panic("device_read_reply_inband: illegal");
2568         return KERN_SUCCESS;
2569 }
2570
2571 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2572 kern_return_t
2573 device_read_reply_overwrite(
2574         MACH_PORT_FACE          reply_port,
2575         kern_return_t           return_code,
2576         io_buf_len_t            bytes_read)
2577 {
2578         panic("device_read_reply_overwrite: illegal\n");
2579         return KERN_SUCCESS;
2580 }
2581
2582 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2583 kern_return_t
2584 device_open_reply(
2585         MACH_PORT_FACE          reply_port,
2586         kern_return_t           return_code,
2587         MACH_PORT_FACE          device_port)
2588 {
2589         panic("device_open_reply: illegal\n");
2590         return KERN_SUCCESS;
2591 }
2592
2593 kern_return_t
2594 ps_read_device(
2595         paging_segment_t        ps,
2596         dp_offset_t             offset,
2597         vm_offset_t             *bufferp,
2598         unsigned int            size,
2599         unsigned int            *residualp,
2600         int                     flags)
2601 {
2602         kern_return_t   kr;
2603         recnum_t        dev_offset;
2604         unsigned int    bytes_wanted;
2605         unsigned int    bytes_read;
2606         unsigned int    total_read;
2607         vm_offset_t     dev_buffer;
2608         vm_offset_t     buf_ptr;
2609         unsigned int    records_read;
2610         struct vs_async *vsa;
2611
2612         device_t        device;
2613         vm_map_copy_t   device_data = NULL;
2614         default_pager_thread_t *dpt = NULL;
2615
2616         device = dev_port_lookup(ps->ps_device);
2617         clustered_reads[atop_32(size)]++;
2618
2619         dev_offset = (ps->ps_offset +
2620                       (offset >> (vm_page_shift - ps->ps_record_shift)));
2621         bytes_wanted = size;
2622         total_read = 0;
2623         *bufferp = (vm_offset_t)NULL;
2624
2625         do {
2626                 vsa = VS_ALLOC_ASYNC();
2627                 if (vsa) {
2628                         vsa->vsa_vs = NULL;
2629                         vsa->vsa_addr = 0;
2630                         vsa->vsa_offset = 0;
2631                         vsa->vsa_size = 0;
2632                         vsa->vsa_ps = NULL;
2633                 }
2634                 ip_lock(vsa->reply_port);
2635                 vsa->reply_port->ip_sorights++;
2636                 ip_reference(vsa->reply_port);
2637                 ip_unlock(vsa->reply_port);
2638                 kr = ds_device_read_common(device,
2639                                  vsa->reply_port,
2640                                  (mach_msg_type_name_t)
2641                                         MACH_MSG_TYPE_MOVE_SEND_ONCE,
2642                                  (dev_mode_t) 0,
2643                                  dev_offset,
2644                                  bytes_wanted,
2645                                  (IO_READ | IO_CALL),
2646                                  (io_buf_ptr_t *) &dev_buffer,
2647                                  (mach_msg_type_number_t *) &bytes_read);
2648                 if(kr == MIG_NO_REPLY) {
2649                         assert_wait(&vsa, THREAD_UNINT);
2650                         thread_block(THREAD_CONTINUE_NULL);
2651
2652                         dev_buffer = vsa->vsa_addr;
2653                         bytes_read = (unsigned int)vsa->vsa_size;
2654                         kr = vsa->vsa_error;
2655                 }
2656                 VS_FREE_ASYNC(vsa);
2657                 if (kr != KERN_SUCCESS || bytes_read == 0) {
2658                         break;
2659                 }
2660                 total_read += bytes_read;
2661
2662                 /*
2663                  * If we got the entire range, use the returned dev_buffer.
2664                  */
2665                 if (bytes_read == size) {
2666                         *bufferp = (vm_offset_t)dev_buffer;
2667                         break;
2668                 }
2669
2670 #if 1
2671                 dprintf(("read only %d bytes out of %d\n",
2672                          bytes_read, bytes_wanted));
2673 #endif
2674                 if(dpt == NULL) {
2675                         dpt = get_read_buffer();
2676                         buf_ptr = dpt->dpt_buffer;
2677                         *bufferp = (vm_offset_t)buf_ptr;
2678                 }
2679                 /*
2680                  * Otherwise, copy the data into the provided buffer (*bufferp)
2681                  * and append the rest of the range as it comes in.
2682                  */
2683                 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2684                 buf_ptr += bytes_read;
2685                 bytes_wanted -= bytes_read;
2686                 records_read = (bytes_read >>
2687                                 (vm_page_shift - ps->ps_record_shift));
2688                 dev_offset += records_read;
2689                 DP_DEBUG(DEBUG_VS_INTERNAL,
2690                          ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2691                           dev_buffer, bytes_read));
2692                 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2693                     != KERN_SUCCESS)
2694                         Panic("dealloc buf");
2695         } while (bytes_wanted);
2696
2697         *residualp = size - total_read;
2698         if((dev_buffer != *bufferp) && (total_read != 0)) {
2699                 vm_offset_t temp_buffer;
2700                 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2701                 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2702                 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2703                         VM_MAP_COPYIN_OPT_SRC_DESTROY |
2704                         VM_MAP_COPYIN_OPT_STEAL_PAGES |
2705                         VM_MAP_COPYIN_OPT_PMAP_ENTER,
2706                         (vm_map_copy_t *)&device_data, FALSE))
2707                                 panic("ps_read_device: cannot copyin locally provided buffer\n");
2708         }
2709         else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2710                 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2711                         VM_MAP_COPYIN_OPT_SRC_DESTROY |
2712                         VM_MAP_COPYIN_OPT_STEAL_PAGES |
2713                         VM_MAP_COPYIN_OPT_PMAP_ENTER,
2714                         (vm_map_copy_t *)&device_data, FALSE))
2715                                 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2716         }
2717         else {
2718                 device_data = NULL;
2719         }
2720         *bufferp = (vm_offset_t)device_data;
2721
2722         if(dpt != NULL) {
2723                 /* Free the receive buffer */
2724                 dpt->checked_out = 0;
2725                 thread_wakeup(&dpt_array);
2726         }
2727         return KERN_SUCCESS;
2728 }
2729
2730 kern_return_t
2731 ps_write_device(
2732         paging_segment_t        ps,
2733         dp_offset_t             offset,
2734         vm_offset_t             addr,
2735         unsigned int            size,
2736         struct vs_async         *vsa)
2737 {
2738         recnum_t        dev_offset;
2739         io_buf_len_t    bytes_to_write, bytes_written;
2740         recnum_t        records_written;
2741         kern_return_t   kr;
2742         MACH_PORT_FACE  reply_port;
2743
2744
2745
2746         clustered_writes[atop_32(size)]++;
2747
2748         dev_offset = (ps->ps_offset +
2749                       (offset >> (vm_page_shift - ps->ps_record_shift)));
2750         bytes_to_write = size;
2751
2752         if (vsa) {
2753                 /*
2754                  * Asynchronous write.
2755                  */
2756                 reply_port = vsa->reply_port;
2757                 ip_lock(reply_port);
2758                 reply_port->ip_sorights++;
2759                 ip_reference(reply_port);
2760                 ip_unlock(reply_port);
2761                 {
2762                 device_t        device;
2763                 device = dev_port_lookup(ps->ps_device);
2764
2765                 vsa->vsa_addr = addr;
2766                 kr=ds_device_write_common(device,
2767                         reply_port,
2768                         (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2769                         (dev_mode_t) 0,
2770                         dev_offset,
2771                         (io_buf_ptr_t)  addr,
2772                         size,
2773                         (IO_WRITE | IO_CALL),
2774                         &bytes_written);
2775                 }
2776                 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2777                         if (verbose)
2778                                 dprintf(("%s0x%x, addr=0x%x,"
2779                                          "size=0x%x,offset=0x%x\n",
2780                                          "device_write_request returned ",
2781                                          kr, addr, size, offset));
2782                         BS_STAT(ps->ps_bs,
2783                                 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2784                         /* do the completion notification to free resources */
2785                         device_write_reply(reply_port, kr, 0);
2786                         return PAGER_ERROR;
2787                 }
2788         } else do {
2789                 /*
2790                  * Synchronous write.
2791                  */
2792                 {
2793                 device_t        device;
2794                 device = dev_port_lookup(ps->ps_device);
2795                 kr=ds_device_write_common(device,
2796                         IP_NULL, 0,
2797                         (dev_mode_t) 0,
2798                         dev_offset,
2799                         (io_buf_ptr_t)  addr,
2800                         size,
2801                         (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2802                         &bytes_written);
2803                 }
2804                 if (kr != KERN_SUCCESS) {
2805                         dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2806                                  "device_write returned ",
2807                                  kr, addr, size, offset));
2808                         BS_STAT(ps->ps_bs,
2809                                 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2810                         return PAGER_ERROR;
2811                 }
2812                 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2813                         Panic("fragmented write");
2814                 records_written = (bytes_written >>
2815                                    (vm_page_shift - ps->ps_record_shift));
2816                 dev_offset += records_written;
2817 #if 1
2818                 if (bytes_written != bytes_to_write) {
2819                         dprintf(("wrote only %d bytes out of %d\n",
2820                                  bytes_written, bytes_to_write));
2821                 }
2822 #endif
2823                 bytes_to_write -= bytes_written;
2824                 addr += bytes_written;
2825         } while (bytes_to_write > 0);
2826
2827         return PAGER_SUCCESS;
2828 }
2829
2830
2831 #else /* !DEVICE_PAGING */
2832
2833 kern_return_t
2834 ps_read_device(
2835         __unused paging_segment_t       ps,
2836         __unused dp_offset_t            offset,
2837         __unused vm_offset_t            *bufferp,
2838         __unused unsigned int           size,
2839         __unused unsigned int           *residualp,
2840         __unused int                            flags)
2841 {
2842   panic("ps_read_device not supported");
2843   return KERN_FAILURE;
2844 }
2845
2846 kern_return_t
2847 ps_write_device(
2848         __unused paging_segment_t       ps,
2849         __unused dp_offset_t            offset,
2850         __unused vm_offset_t            addr,
2851         __unused unsigned int           size,
2852         __unused struct vs_async        *vsa)
2853 {
2854   panic("ps_write_device not supported");
2855   return KERN_FAILURE;
2856 }
2857
2858 #endif /* DEVICE_PAGING */
2859 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t);      /* forward */
2860
2861 void
2862 pvs_object_data_provided(
2863         __unused vstruct_t              vs,
2864         __unused upl_t                  upl,
2865         __unused upl_offset_t   offset,
2866         upl_size_t                              size)
2867 {
2868 #if     RECLAIM_SWAP
2869         boolean_t       empty;
2870 #endif
2871
2872         DP_DEBUG(DEBUG_VS_INTERNAL,
2873                  ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2874                   upl, offset, size));
2875
2876         ASSERT(size > 0);
2877         GSTAT(global_stats.gs_pages_in += atop_32(size));
2878
2879 /* check upl iosync flag instead of using RECLAIM_SWAP*/
2880 #if     RECLAIM_SWAP
2881         if (size != upl->size) {
2882                 if (size) {
2883                         ps_clunmap(vs, offset, size);
2884                         upl_commit_range(upl, 0, size, 0, NULL, 0, &empty);
2885                 }
2886                 upl_abort(upl, UPL_ABORT_ERROR);
2887                 upl_deallocate(upl);
2888         } else {
2889                 ps_clunmap(vs, offset, size);
2890                 upl_commit(upl, NULL, 0);
2891                 upl_deallocate(upl);
2892         }
2893 #endif  /* RECLAIM_SWAP */
2894
2895 }
2896
2897 static memory_object_offset_t   last_start;
2898 static vm_size_t                last_length;
2899
2900 /*
2901  * A "cnt" of 0 means that the caller just wants to check if the page at
2902  * offset "vs_offset" exists in the backing store.  That page hasn't been
2903  * prepared, so no need to release it.
2904  *
2905  * A "cnt" of -1 means that the caller wants to bring back from the backing
2906  * store all existing pages in the cluster containing "vs_offset".
2907  */
2908 kern_return_t
2909 pvs_cluster_read(
2910         vstruct_t       vs,
2911         dp_offset_t     vs_offset,
2912         dp_size_t       cnt,
2913         void            *fault_info)
2914 {
2915         kern_return_t           error = KERN_SUCCESS;
2916         unsigned int            size;
2917         unsigned int            residual;
2918         unsigned int            request_flags;
2919         int                     io_flags = 0;
2920         int                     seg_index;
2921         int                     pages_in_cl;
2922         int                     cl_size;
2923         int                     cl_mask;
2924         int                     cl_index;
2925         unsigned int            xfer_size;
2926         dp_offset_t             orig_vs_offset;
2927         dp_offset_t       ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2928         paging_segment_t        psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2929         struct clmap            clmap;
2930         upl_t                   upl;
2931         unsigned int            page_list_count;
2932         memory_object_offset_t  cluster_start;
2933         vm_size_t               cluster_length;
2934         uint32_t                io_streaming;
2935         int                     i;
2936         boolean_t               io_sync = FALSE;
2937         boolean_t               reclaim_all = FALSE;
2938
2939         pages_in_cl = 1 << vs->vs_clshift;
2940         cl_size = pages_in_cl * vm_page_size;
2941         cl_mask = cl_size - 1;
2942
2943         request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2944
2945         if (cnt == (dp_size_t) -1)
2946                 reclaim_all = TRUE;
2947
2948         if (reclaim_all == TRUE) {
2949                 /*
2950                  * We've been called from ps_vstruct_reclaim() to move all
2951                  * the object's swapped pages back to VM pages.
2952                  * This can put memory pressure on the system, so we do want
2953                  * to wait for free pages, to avoid getting in the way of the
2954                  * vm_pageout_scan() thread.
2955                  * Let's not use UPL_NOBLOCK in this case.
2956                  */
2957                 vs_offset &= ~cl_mask;
2958                 i = pages_in_cl;
2959         } else {
2960                 i = 1;
2961
2962                 /*
2963                  * if the I/O cluster size == PAGE_SIZE, we don't want to set
2964                  * the UPL_NOBLOCK since we may be trying to recover from a
2965                  * previous partial pagein I/O that occurred because we were low
2966                  * on memory and bailed early in order to honor the UPL_NOBLOCK...
2967                  * since we're only asking for a single page, we can block w/o fear
2968                  * of tying up pages while waiting for more to become available
2969                  */
2970                 if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE)
2971                         request_flags |= UPL_NOBLOCK;
2972         }
2973
2974 again:
2975         cl_index = (vs_offset & cl_mask) / vm_page_size;
2976
2977         if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2978             !CLMAP_ISSET(clmap, cl_index)) {
2979                 /*
2980                  * the needed page doesn't exist in the backing store...
2981                  * we don't want to try to do any I/O, just abort the
2982                  * page and let the fault handler provide a zero-fill
2983                  */
2984                 if (cnt == 0) {
2985                         /*
2986                          * The caller was just poking at us to see if
2987                          * the page has been paged out.  No need to
2988                          * mess with the page at all.
2989                          * Just let the caller know we don't have that page.
2990                          */
2991                         return KERN_FAILURE;
2992                 }
2993                 if (reclaim_all == TRUE) {
2994                         i--;
2995                         if (i == 0) {
2996                                 /* no more pages in this cluster */
2997                                 return KERN_FAILURE;
2998                         }
2999                         /* try the next page in this cluster */
3000                         vs_offset += vm_page_size;
3001                         goto again;
3002                 }
3003
3004                 page_list_count = 0;
3005
3006                 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3007                                                 PAGE_SIZE, PAGE_SIZE,
3008                                                 &upl, NULL, &page_list_count,
3009                                                 request_flags  | UPL_SET_INTERNAL);
3010                 upl_range_needed(upl, 0, 1);
3011
3012                 if (clmap.cl_error)
3013                         upl_abort(upl, UPL_ABORT_ERROR);
3014                 else
3015                         upl_abort(upl, UPL_ABORT_UNAVAILABLE);
3016                 upl_deallocate(upl);
3017
3018                 return KERN_SUCCESS;
3019         }
3020
3021         if (cnt == 0) {
3022                 /*
3023                  * The caller was just poking at us to see if
3024                  * the page has been paged out.  No need to
3025                  * mess with the page at all.
3026                  * Just let the caller know we do have that page.
3027                  */
3028                 return KERN_SUCCESS;
3029         }
3030
3031         if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) {
3032                 io_sync = TRUE;
3033         } else {
3034 #if RECLAIM_SWAP
3035                 io_sync = TRUE;
3036 #endif  /* RECLAIM_SWAP */
3037         }
3038
3039         if( io_sync == TRUE ) {
3040
3041                 io_flags |= UPL_IOSYNC | UPL_NOCOMMIT;
3042 #if USE_PRECIOUS
3043                 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
3044 #else   /* USE_PRECIOUS */
3045                 request_flags |= UPL_REQUEST_SET_DIRTY;
3046 #endif  /* USE_PRECIOUS */
3047         }
3048
3049         assert(dp_encryption_inited);
3050         if (dp_encryption) {
3051                 /*
3052                  * ENCRYPTED SWAP:
3053                  * request that the UPL be prepared for
3054                  * decryption.
3055                  */
3056                 request_flags |= UPL_ENCRYPT;
3057                 io_flags |= UPL_PAGING_ENCRYPTED;
3058         }
3059         orig_vs_offset = vs_offset;
3060
3061         assert(cnt != 0);
3062         cnt = VM_SUPER_CLUSTER;
3063         cluster_start = (memory_object_offset_t) vs_offset;
3064         cluster_length = (vm_size_t) cnt;
3065         io_streaming = 0;
3066
3067         /*
3068          * determine how big a speculative I/O we should try for...
3069          */
3070         if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
3071                 assert(vs_offset >= (dp_offset_t) cluster_start &&
3072                        vs_offset < (dp_offset_t) (cluster_start + cluster_length));
3073                 vs_offset = (dp_offset_t) cluster_start;
3074                 cnt = (dp_size_t) cluster_length;
3075         } else {
3076                 cluster_length = PAGE_SIZE;
3077                 cnt = PAGE_SIZE;
3078         }
3079
3080         if (io_streaming)
3081                 io_flags |= UPL_IOSTREAMING;
3082
3083         last_start = cluster_start;
3084         last_length = cluster_length;
3085
3086         /*
3087          * This loop will be executed multiple times until the entire
3088          * range has been looked at or we issue an I/O... if the request spans cluster
3089          * boundaries, the clusters will be checked for logical continuity,
3090          * if contiguous the I/O request will span multiple clusters...
3091          * at most only 1 I/O will be issued... it will encompass the original offset
3092          */
3093         while (cnt && error == KERN_SUCCESS) {
3094                 int     ps_info_valid;
3095
3096                 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
3097                         size = VM_SUPER_CLUSTER;
3098                         size -= vs_offset & cl_mask;
3099                 } else if (cnt > VM_SUPER_CLUSTER)
3100                         size = VM_SUPER_CLUSTER;
3101                 else
3102                         size = cnt;
3103
3104                 cnt -= size;
3105
3106                 ps_info_valid = 0;
3107                 seg_index     = 0;
3108
3109                 while (size > 0 && error == KERN_SUCCESS) {
3110                         unsigned int  abort_size;
3111                         unsigned int  lsize;
3112                         int           failed_size;
3113                         int           beg_pseg;
3114                         int           beg_indx;
3115                         dp_offset_t   cur_offset;
3116
3117                         if ( !ps_info_valid) {
3118                                 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3119                                 psp[seg_index]       = CLMAP_PS(clmap);
3120                                 ps_info_valid = 1;
3121                         }
3122                         /*
3123                          * skip over unallocated physical segments
3124                          */
3125                         if (ps_offset[seg_index] == (dp_offset_t) -1) {
3126                                 abort_size = cl_size - (vs_offset & cl_mask);
3127                                 abort_size = MIN(abort_size, size);
3128
3129                                 size      -= abort_size;
3130                                 vs_offset += abort_size;
3131
3132                                 seg_index++;
3133                                 ps_info_valid = 0;
3134
3135                                 continue;
3136                         }
3137                         cl_index = (vs_offset & cl_mask) / vm_page_size;
3138
3139                         for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
3140                                 /*
3141                                  * skip over unallocated pages
3142                                  */
3143                                 if (CLMAP_ISSET(clmap, cl_index))
3144                                         break;
3145                                 abort_size += vm_page_size;
3146                         }
3147                         if (abort_size) {
3148                                 size      -= abort_size;
3149                                 vs_offset += abort_size;
3150
3151                                 if (cl_index == pages_in_cl) {
3152                                         /*
3153                                          * if we're at the end of this physical cluster
3154                                          * then bump to the next one and continue looking
3155                                          */
3156                                         seg_index++;
3157                                         ps_info_valid = 0;
3158
3159                                         continue;
3160                                 }
3161                                 if (size == 0)
3162                                         break;
3163                         }
3164                         /*
3165                          * remember the starting point of the first allocated page
3166                          * for the I/O we're about to issue
3167                          */
3168                         beg_pseg   = seg_index;
3169                         beg_indx   = cl_index;
3170                         cur_offset = vs_offset;
3171
3172                         /*
3173                          * calculate the size of the I/O that we can do...
3174                          * this may span multiple physical segments if
3175                          * they are contiguous
3176                          */
3177                         for (xfer_size = 0; xfer_size < size; ) {
3178
3179                                 while (cl_index < pages_in_cl && xfer_size < size) {
3180                                         /*
3181                                          * accumulate allocated pages within
3182                                          * a physical segment
3183                                          */
3184                                         if (CLMAP_ISSET(clmap, cl_index)) {
3185                                                 xfer_size  += vm_page_size;
3186                                                 cur_offset += vm_page_size;
3187                                                 cl_index++;
3188
3189                                                 BS_STAT(psp[seg_index]->ps_bs,
3190                                                         psp[seg_index]->ps_bs->bs_pages_in++);
3191                                         } else
3192                                                 break;
3193                                 }
3194                                 if (cl_index < pages_in_cl || xfer_size >= size) {
3195                                         /*
3196                                          * we've hit an unallocated page or
3197                                          * the end of this request... see if
3198                                          * it's time to fire the I/O
3199                                          */
3200                                         break;
3201                                 }
3202                                 /*
3203                                  * we've hit the end of the current physical
3204                                  * segment and there's more to do, so try
3205                                  * moving to the next one
3206                                  */
3207                                 seg_index++;
3208
3209                                 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3210                                 psp[seg_index] = CLMAP_PS(clmap);
3211                                 ps_info_valid = 1;
3212
3213                                 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
3214                                         /*
3215                                          * if the physical segment we're about
3216                                          * to step into is not contiguous to
3217                                          * the one we're currently in, or it's
3218                                          * in a different paging file, or
3219                                          * it hasn't been allocated....
3220                                          * we stop this run and go check
3221                                          * to see if it's time to fire the I/O
3222                                          */
3223                                         break;
3224                                 }
3225                                 /*
3226                                  * start with first page of the next physical
3227                                  * segment
3228                                  */
3229                                 cl_index = 0;
3230                         }
3231                         if (xfer_size == 0) {
3232                                 /*
3233                                  * no I/O to generate for this segment
3234                                  */
3235                                 continue;
3236                         }
3237                         if (cur_offset <= orig_vs_offset) {
3238                                 /*
3239                                  * we've hit a hole in our speculative cluster
3240                                  * before the offset that we're really after...
3241                                  * don't issue the I/O since it doesn't encompass
3242                                  * the original offset and we're looking to only
3243                                  * pull in the speculative pages if they can be
3244                                  * made part of a single I/O
3245                                  */
3246                                 size      -= xfer_size;
3247                                 vs_offset += xfer_size;
3248
3249                                 continue;
3250                         }
3251                         /*
3252                          * we have a contiguous range of allocated pages
3253                          * to read from that encompasses the original offset
3254                          */
3255                         page_list_count = 0;
3256                         memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3257                                                         xfer_size, xfer_size,
3258                                                         &upl, NULL, &page_list_count,
3259                                                         request_flags | UPL_SET_INTERNAL);
3260
3261                         error = ps_read_file(psp[beg_pseg],
3262                                              upl, (upl_offset_t) 0,
3263                                              ps_offset[beg_pseg] + (beg_indx * vm_page_size),
3264                                              xfer_size, &residual, io_flags);
3265
3266
3267                         /*
3268                          * Adjust counts and send response to VM.  Optimize
3269                          * for the common case, i.e. no error and/or partial
3270                          * data. If there was an error, then we need to error
3271                          * the entire range, even if some data was successfully
3272                          * read. If there was a partial read we may supply some
3273                          * data and may error some as well.  In all cases the
3274                          * VM must receive some notification for every page
3275                          * in the range.
3276                          */
3277                         if ((error == KERN_SUCCESS) && (residual == 0)) {
3278                                 /*
3279                                  * Got everything we asked for, supply the data
3280                                  * to the VM.  Note that as a side effect of
3281                                  * supplying the data, the buffer holding the
3282                                  * supplied data is deallocated from the pager's
3283                                  *  address space.
3284                                  */
3285                                 lsize = xfer_size;
3286                                 failed_size = 0;
3287                         } else {
3288                                 lsize = 0;
3289                                 failed_size = xfer_size;
3290
3291                                 if (error == KERN_SUCCESS) {
3292                                         if (residual == xfer_size) {
3293                                                 /*
3294                                                  * If a read operation returns no error
3295                                                  * and no data moved, we turn it into
3296                                                  * an error, assuming we're reading at
3297                                                  * or beyond EOF.
3298                                                  * Fall through and error the entire range.
3299                                                  */
3300                                                 error = KERN_FAILURE;
3301                                         } else {
3302                                                 /*
3303                                                  * Otherwise, we have partial read. If
3304                                                  * the part read is an integral number
3305                                                  * of pages supply it. Otherwise round
3306                                                  * it up to a page boundary, zero fill
3307                                                  * the unread part, and supply it.
3308                                                  * Fall through and error the remainder
3309                                                  * of the range, if any.
3310                                                  */
3311                                                 int fill;
3312
3313                                                 fill = residual & (vm_page_size - 1);
3314                                                 lsize = (xfer_size - residual) + fill;
3315
3316                                                 if (lsize < xfer_size)
3317                                                         failed_size = xfer_size - lsize;
3318
3319                                                 if (reclaim_all == FALSE)
3320                                                         error = KERN_FAILURE;
3321                                         }
3322                                 }
3323                         }
3324                         pvs_object_data_provided(vs, upl, vs_offset, lsize);
3325
3326                         if (failed_size) {
3327                                 /*
3328                                  * There was an error in some part of the range, tell
3329                                  * the VM. Note that error is explicitly checked again
3330                                  * since it can be modified above.
3331                                  */
3332                                 BS_STAT(psp[beg_pseg]->ps_bs,
3333                                         psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
3334                         }
3335                         /*
3336                          * we've issued a single I/O that encompassed the original offset
3337                          * at this point we either met our speculative request length or
3338                          * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3339                          * not present or not physically contiguous to the previous one), so
3340                          * we're done issuing I/O at this point
3341                          */
3342                         return (error);
3343                 }
3344         }
3345         return error;
3346 }
3347
3348 int vs_do_async_write = 1;      /* global int flag, defaults to 1; NOTE(review): no reader is visible in this part of the file -- the name suggests it selects asynchronous writes, confirm against its users */
3349
3350 kern_return_t
3351 vs_cluster_write(
3352         vstruct_t       vs,
3353         upl_t           internal_upl,
3354         upl_offset_t    offset,
3355         upl_size_t      cnt,
3356         boolean_t       dp_internal,
3357         int             flags)
3358 {
3359         upl_size_t      transfer_size;
3360         int             error = 0;
3361         struct clmap    clmap;
3362
3363         dp_offset_t     actual_offset;  /* Offset within paging segment */
3364         paging_segment_t ps;
3365         dp_offset_t     mobj_base_addr;
3366         dp_offset_t     mobj_target_addr;
3367
3368         upl_t           upl;
3369         upl_page_info_t *pl;
3370         int             page_index;
3371         unsigned int    page_max_index;
3372         int             list_size;
3373         int             pages_in_cl;
3374         unsigned int    cl_size;
3375         int             base_index;
3376         unsigned int    seg_size;
3377         unsigned int    upl_offset_in_object;
3378         boolean_t       minimal_clustering = FALSE;
3379         boolean_t       found_dirty;
3380
3381         if (!dp_encryption_inited) {
3382                 /*
3383                  * ENCRYPTED SWAP:
3384                  * Once we've started using swap, we
3385                  * can't change our mind on whether
3386                  * it needs to be encrypted or
3387                  * not.
3388                  */
3389                 dp_encryption_inited = TRUE;
3390         }
3391         if (dp_encryption) {
3392                 /*
3393                  * ENCRYPTED SWAP:
3394                  * the UPL will need to be encrypted...
3395                  */
3396                 flags |= UPL_PAGING_ENCRYPTED;
3397         }
3398
3399         pages_in_cl = 1 << vs->vs_clshift;
3400         cl_size = pages_in_cl * vm_page_size;
3401
3402 #if CONFIG_FREEZE
3403         minimal_clustering = TRUE;
3404 #else
3405         if (dp_isssd == TRUE)
3406                 minimal_clustering = TRUE;
3407 #endif
3408         if (!dp_internal) {
3409                 unsigned int page_list_count;
3410                 int          request_flags;
3411                 unsigned int super_size;
3412                 int          first_dirty;
3413                 int          num_dirty;
3414                 int          num_of_pages;
3415                 int          seg_index;
3416                 upl_offset_t  upl_offset;
3417                 upl_offset_t  upl_offset_aligned;
3418                 dp_offset_t  seg_offset;
3419                 dp_offset_t  ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3420                 paging_segment_t   psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3421
3422
3423                 if (bs_low)
3424                         super_size = cl_size;
3425                 else
3426                         super_size = VM_SUPER_CLUSTER;
3427
3428                 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3429                                 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3430                                 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3431
3432                 if (dp_encryption) {
3433                         /*
3434                          * ENCRYPTED SWAP:
3435                          * request that the UPL be prepared for
3436                          * encryption.
3437                          */
3438                         request_flags |= UPL_ENCRYPT;
3439                         flags |= UPL_PAGING_ENCRYPTED;
3440                 }
3441
3442                 page_list_count = 0;
3443                 memory_object_super_upl_request(vs->vs_control,
3444                                 (memory_object_offset_t)offset,
3445                                 cnt, super_size,
3446                                 &upl, NULL, &page_list_count,
3447                                 request_flags | UPL_FOR_PAGEOUT);
3448
3449                 /*
3450                  * The default pager does not handle objects larger than
3451                  * 4GB, so it does not deal with offsets that don't fit in
3452                  * 32-bit.  Cast down upl->offset now and make sure we
3453                  * did not lose any valuable bits.
3454                  */
3455                 upl_offset_in_object = (unsigned int) upl->offset;
3456                 assert(upl->offset == upl_offset_in_object);
3457
3458                 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3459
3460                 seg_size = cl_size - (upl_offset_in_object % cl_size);
3461                 upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
3462                 page_index = 0;
3463                 page_max_index = upl->size / PAGE_SIZE;
3464                 found_dirty = TRUE;
3465
3466                 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
3467
3468                         unsigned int    seg_pgcnt;
3469
3470                         seg_pgcnt = seg_size / PAGE_SIZE;
3471
3472                         if (minimal_clustering == TRUE) {
3473                                 unsigned int    non_dirty;
3474
3475                                 non_dirty = 0;
3476                                 found_dirty = FALSE;
3477
3478                                 for (; non_dirty < seg_pgcnt; non_dirty++) {
3479                                         if ((page_index + non_dirty) >= page_max_index)
3480                                                 break;
3481
3482                                         if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
3483                                             UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
3484                                                 found_dirty = TRUE;
3485                                                 break;
3486                                         }
3487                                 }
3488                         }
3489                         if (found_dirty == TRUE) {
3490                                 ps_offset[seg_index] =
3491                                         ps_clmap(vs,
3492                                                  upl_offset_aligned,
3493                                                  &clmap, CL_ALLOC,
3494                                                  cl_size, 0);
3495
3496                                 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3497                                         upl_abort(upl, 0);
3498                                         upl_deallocate(upl);
3499
3500                                         return KERN_FAILURE;
3501                                 }
3502                                 psp[seg_index] = CLMAP_PS(clmap);
3503                         }
3504                         if (transfer_size > seg_size) {
3505                                 page_index += seg_pgcnt;
3506                                 transfer_size -= seg_size;
3507                                 upl_offset_aligned += cl_size;
3508                                 seg_size = cl_size;
3509                                 seg_index++;
3510                         } else
3511                                 transfer_size = 0;
3512                 }
3513                 /*
3514                  * Ignore any non-present pages at the end of the
3515                  * UPL.
3516                  */
3517                 for (page_index = upl->size / vm_page_size; page_index > 0;)  {
3518                         if (UPL_PAGE_PRESENT(pl, --page_index)) {
3519                                 page_index++;
3520                                 break;
3521                         }
3522                 }
3523                 if (page_index == 0) {
3524                         /*
3525                          * no pages in the UPL
3526                          * abort and return
3527                          */
3528                         upl_abort(upl, 0);
3529                         upl_deallocate(upl);
3530
3531                         return KERN_SUCCESS;
3532                 }
3533                 num_of_pages = page_index;
3534
3535                 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
3536
3537                 for (page_index = 0; page_index < num_of_pages; ) {
3538                         /*
3539                          * skip over non-dirty pages
3540                          */
3541                         for ( ; page_index < num_of_pages; page_index++) {
3542                                 if (UPL_DIRTY_PAGE(pl, page_index)
3543                                         || UPL_PRECIOUS_PAGE(pl, page_index))
3544                                         /*
3545                                          * this is a page we need to write
3546                                          * go see if we can buddy it up with
3547                                          * others that are contiguous to it
3548                                          */
3549                                         break;
3550                                 /*
3551                                  * if the page is not-dirty, but present we
3552                                  * need to commit it...  This is an unusual
3553                                  * case since we only asked for dirty pages
3554                                  */
3555                                 if (UPL_PAGE_PRESENT(pl, page_index)) {
3556                                         boolean_t empty = FALSE;
3557                                         upl_commit_range(upl,
3558                                                  page_index * vm_page_size,
3559                                                  vm_page_size,
3560                                                  UPL_COMMIT_NOTIFY_EMPTY,
3561                                                  pl,
3562                                                  page_list_count,
3563                                                  &empty);
3564                                         if (empty) {
3565                                                 assert(page_index ==
3566                                                        num_of_pages - 1);
3567                                                 upl_deallocate(upl);
3568                                         }
3569                                 }
3570                         }
3571                         if (page_index == num_of_pages)
3572                                 /*
3573                                  * no more pages to look at, we're out of here
3574                                  */
3575                                 break;
3576
3577                         /*
3578                          * gather up contiguous dirty pages... we have at
3579                          * least 1; otherwise we would have bailed above
3580                          * make sure that each physical segment that we step
3581                          * into is contiguous to the one we're currently in
3582                          * if it's not, we have to stop and write what we have
3583                          */
3584                         for (first_dirty = page_index;
3585                                         page_index < num_of_pages; ) {
3586                                 if ( !UPL_DIRTY_PAGE(pl, page_index)
3587                                         && !UPL_PRECIOUS_PAGE(pl, page_index))
3588                                         break;
3589                                 page_index++;
3590                                 /*
3591                                  * if we just looked at the last page in the UPL
3592                                  * we don't need to check for physical segment
3593                                  * continuity
3594                                  */
3595                                 if (page_index < num_of_pages) {
3596                                         int cur_seg;
3597                                         int nxt_seg;
3598
3599                                         cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3600                                         nxt_seg = (base_index + page_index)/pages_in_cl;
3601
3602                                         if (cur_seg != nxt_seg) {
3603                                                 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3604                                                 /*
3605                                                  * if the segment we're about
3606                                                  * to step into is not
3607                                                  * contiguous to the one we're
3608                                                  * currently in, or it's in a
3609                                                  * different paging file....
3610                                                  * we stop here and generate
3611                                                  * the I/O
3612                                                  */
3613                                                         break;
3614                                         }
3615                                 }
3616                         }
3617                         num_dirty = page_index - first_dirty;
3618
3619                         if (num_dirty) {
3620                                 upl_offset = first_dirty * vm_page_size;
3621                                 transfer_size = num_dirty * vm_page_size;
3622
3623                                 while (transfer_size) {
3624
3625                                         if ((seg_size = cl_size -
3626                                                 ((upl_offset_in_object +
3627                                                   upl_offset) % cl_size))
3628                                                         > transfer_size)
3629                                                 seg_size = transfer_size;
3630
3631                                         ps_vs_write_complete(
3632                                                 vs,
3633                                                 (upl_offset_in_object +
3634                                                  upl_offset),
3635                                                 seg_size, error);
3636
3637                                         transfer_size -= seg_size;
3638                                         upl_offset += seg_size;
3639                                 }
3640                                 upl_offset = first_dirty * vm_page_size;
3641                                 transfer_size = num_dirty * vm_page_size;
3642
3643                                 seg_index  = (base_index + first_dirty) / pages_in_cl;
3644                                 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
3645
3646                                 error = ps_write_file(psp[seg_index],
3647                                                 upl, upl_offset,
3648                                                 ps_offset[seg_index]
3649                                                                 + seg_offset,
3650                                                 transfer_size, flags);
3651                         }
3652                 }
3653
3654         } else {
3655                 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
3656                 list_size = cnt;
3657
3658                 page_index = 0;
3659                 /* The caller provides a mapped_data which is derived  */
3660                 /* from a temporary object.  The targeted pages are    */
3661                 /* guaranteed to be set at offset 0 in the mapped_data */
3662                 /* The actual offset however must still be derived     */
3663                 /* from the offset in the vs in question               */
3664                 mobj_base_addr = offset;
3665                 mobj_target_addr = mobj_base_addr;
3666
3667                 for (transfer_size = list_size; transfer_size != 0;) {
3668                         actual_offset = ps_clmap(vs, mobj_target_addr,
3669                                 &clmap, CL_ALLOC,
3670                                 transfer_size < cl_size ?
3671                                         transfer_size : cl_size, 0);
3672                         if(actual_offset == (dp_offset_t) -1) {
3673                                 error = 1;
3674                                 break;
3675                         }
3676                         cnt = MIN(transfer_size,
3677                                   (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
3678                         ps = CLMAP_PS(clmap);
3679                         /* Assume that the caller has given us contiguous */
3680                         /* pages */
3681                         if(cnt) {
3682                                 ps_vs_write_complete(vs, mobj_target_addr,
3683                                                                 cnt, error);
3684                                 error = ps_write_file(ps, internal_upl,
3685                                                 0, actual_offset,
3686                                                 cnt, flags);
3687                                 if (error)
3688                                         break;
3689                            }
3690                         if (error)
3691                                 break;
3692                         actual_offset += cnt;
3693                         mobj_target_addr += cnt;
3694                         transfer_size -= cnt;
3695                         cnt = 0;
3696
3697                         if (error)
3698                                 break;
3699                 }
3700         }
3701         if(error)
3702                 return KERN_FAILURE;
3703         else
3704                 return KERN_SUCCESS;
3705 }
3706
3707 vm_size_t
3708 ps_vstruct_allocated_size(
3709         vstruct_t       vs)
3710 {
3711         int             num_pages;
3712         struct vs_map   *vsmap;
3713         unsigned int    i, j, k;
3714
3715         num_pages = 0;
3716         if (vs->vs_indirect) {
3717                 /* loop on indirect maps */
3718                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3719                         vsmap = vs->vs_imap[i];
3720                         if (vsmap == NULL)
3721                                 continue;
3722                         /* loop on clusters in this indirect map */
3723                         for (j = 0; j < CLMAP_ENTRIES; j++) {
3724                                 if (VSM_ISCLR(vsmap[j]) ||
3725                                     VSM_ISERR(vsmap[j]))
3726                                         continue;
3727                                 /* loop on pages in this cluster */
3728                                 for (k = 0; k < VSCLSIZE(vs); k++) {
3729                                         if ((VSM_BMAP(vsmap[j])) & (1 << k))
3730                                                 num_pages++;
3731                                 }
3732                         }
3733                 }
3734         } else {
3735                 vsmap = vs->vs_dmap;
3736                 if (vsmap == NULL)
3737                         return 0;
3738                 /* loop on clusters in the direct map */
3739                 for (j = 0; j < CLMAP_ENTRIES; j++) {
3740                         if (VSM_ISCLR(vsmap[j]) ||
3741                             VSM_ISERR(vsmap[j]))
3742                                 continue;
3743                         /* loop on pages in this cluster */
3744                         for (k = 0; k < VSCLSIZE(vs); k++) {
3745                                 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3746                                         num_pages++;
3747                         }
3748                 }
3749         }
3750
3751         return ptoa_32(num_pages);
3752 }
3753
3754 unsigned int
3755 ps_vstruct_allocated_pages(
3756         vstruct_t               vs,
3757         default_pager_page_t    *pages,
3758         unsigned int            pages_size)
3759 {
3760         unsigned int    num_pages;
3761         struct vs_map   *vsmap;
3762         dp_offset_t     offset;
3763         unsigned int    i, j, k;
3764
3765         num_pages = 0;
3766         offset = 0;
3767         if (vs->vs_indirect) {
3768                 /* loop on indirect maps */
3769                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3770                         vsmap = vs->vs_imap[i];
3771                         if (vsmap == NULL) {
3772                                 offset += (vm_page_size * CLMAP_ENTRIES *
3773                                            VSCLSIZE(vs));
3774                                 continue;
3775                         }
3776                         /* loop on clusters in this indirect map */
3777                         for (j = 0; j < CLMAP_ENTRIES; j++) {
3778                                 if (VSM_ISCLR(vsmap[j]) ||
3779                                     VSM_ISERR(vsmap[j])) {
3780                                         offset += vm_page_size * VSCLSIZE(vs);
3781                                         continue;
3782                                 }
3783                                 /* loop on pages in this cluster */
3784                                 for (k = 0; k < VSCLSIZE(vs); k++) {
3785                                         if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3786                                                 num_pages++;
3787                                                 if (num_pages < pages_size)
3788                                                         pages++->dpp_offset =
3789                                                                 offset;
3790                                         }
3791                                         offset += vm_page_size;
3792                                 }
3793                         }
3794                 }
3795         } else {
3796                 vsmap = vs->vs_dmap;
3797                 if (vsmap == NULL)
3798                         return 0;
3799                 /* loop on clusters in the direct map */
3800                 for (j = 0; j < CLMAP_ENTRIES; j++) {
3801                         if (VSM_ISCLR(vsmap[j]) ||
3802                             VSM_ISERR(vsmap[j])) {
3803                                 offset += vm_page_size * VSCLSIZE(vs);
3804                                 continue;
3805                         }
3806                         /* loop on pages in this cluster */
3807                         for (k = 0; k < VSCLSIZE(vs); k++) {
3808                                 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3809                                         num_pages++;
3810                                         if (num_pages < pages_size)
3811                                                 pages++->dpp_offset = offset;
3812                                 }
3813                                 offset += vm_page_size;
3814                         }
3815                 }
3816         }
3817
3818         return num_pages;
3819 }
3820
3821
3822 kern_return_t
3823 ps_vstruct_transfer_from_segment(
3824         vstruct_t        vs,
3825         paging_segment_t segment,
3826         upl_t            upl)
3827 {
3828         struct vs_map   *vsmap;
3829 //      struct vs_map   old_vsmap;
3830 //      struct vs_map   new_vsmap;
3831         unsigned int    i, j;
3832
3833         VS_LOCK(vs);    /* block all work on this vstruct */
3834                         /* can't allow the normal multiple write */
3835                         /* semantic because writes may conflict */
3836         vs->vs_xfer_pending = TRUE;
3837         vs_wait_for_sync_writers(vs);
3838         vs_start_write(vs);
3839         vs_wait_for_readers(vs);
3840         /* we will unlock the vs to allow other writes while transferring */
3841         /* and will be guaranteed of the persistance of the vs struct     */
3842         /* because the caller of  ps_vstruct_transfer_from_segment bumped */
3843         /* vs_async_pending */
3844         /* OK we now have guaranteed no other parties are accessing this */
3845         /* vs.  Now that we are also supporting simple lock versions of  */
3846         /* vs_lock we cannot hold onto VS_LOCK as we may block below.    */
3847         /* our purpose in holding it before was the multiple write case */
3848         /* we now use the boolean xfer_pending to do that.  We can use  */
3849         /* a boolean instead of a count because we have guaranteed single */
3850         /* file access to this code in its caller */
3851         VS_UNLOCK(vs);
3852 vs_changed:
3853         if (vs->vs_indirect) {
3854                 unsigned int    vsmap_size;
3855                 int             clmap_off;
3856                 /* loop on indirect maps */
3857                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3858                         vsmap = vs->vs_imap[i];
3859                         if (vsmap == NULL)
3860                                 continue;
3861                         /* loop on clusters in this indirect map */
3862                         clmap_off = (vm_page_size * CLMAP_ENTRIES *
3863                                            VSCLSIZE(vs) * i);
3864                         if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3865                                 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3866                         else
3867                                 vsmap_size = CLMAP_ENTRIES;
3868                         for (j = 0; j < vsmap_size; j++) {
3869                                 if (VSM_ISCLR(vsmap[j]) ||
3870                                     VSM_ISERR(vsmap[j]) ||
3871                                     (VSM_PS(vsmap[j]) != segment))
3872                                         continue;
3873                                 if(vs_cluster_transfer(vs,
3874                                         (vm_page_size * (j << vs->vs_clshift))
3875                                         + clmap_off,
3876                                         vm_page_size << vs->vs_clshift,
3877                                         upl)
3878                                                 != KERN_SUCCESS) {
3879                                    VS_LOCK(vs);
3880                                    vs->vs_xfer_pending = FALSE;
3881                                    VS_UNLOCK(vs);
3882                                    vs_finish_write(vs);
3883                                    return KERN_FAILURE;
3884                                 }
3885                                 /* allow other readers/writers during transfer*/
3886                                 VS_LOCK(vs);
3887                                 vs->vs_xfer_pending = FALSE;
3888                                 VS_UNLOCK(vs);
3889                                 vs_finish_write(vs);
3890
3891                                 if (backing_store_abort_compaction || backing_store_stop_compaction) {
3892                                         backing_store_abort_compaction = FALSE;
3893                                         dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n"));
3894                                         return KERN_FAILURE;
3895                                 }
3896                                 vnode_pager_throttle();
3897
3898                                 VS_LOCK(vs);
3899                                 vs->vs_xfer_pending = TRUE;
3900                                 vs_wait_for_sync_writers(vs);
3901                                 vs_start_write(vs);
3902                                 vs_wait_for_readers(vs);
3903                                 VS_UNLOCK(vs);
3904                                 if (!(vs->vs_indirect)) {
3905                                         goto vs_changed;
3906                                 }
3907                         }
3908                 }
3909         } else {
3910                 vsmap = vs->vs_dmap;
3911                 if (vsmap == NULL) {
3912                         VS_LOCK(vs);
3913                         vs->vs_xfer_pending = FALSE;
3914                         VS_UNLOCK(vs);
3915                         vs_finish_write(vs);
3916                         return KERN_SUCCESS;
3917                 }
3918                 /* loop on clusters in the direct map */
3919                 for (j = 0; j < vs->vs_size; j++) {
3920                         if (VSM_ISCLR(vsmap[j]) ||
3921                             VSM_ISERR(vsmap[j]) ||
3922                             (VSM_PS(vsmap[j]) != segment))
3923                                 continue;
3924                         if(vs_cluster_transfer(vs,
3925                                 vm_page_size * (j << vs->vs_clshift),
3926                                 vm_page_size << vs->vs_clshift,
3927                                 upl) != KERN_SUCCESS) {
3928                            VS_LOCK(vs);
3929                            vs->vs_xfer_pending = FALSE;
3930                            VS_UNLOCK(vs);
3931                            vs_finish_write(vs);
3932                            return KERN_FAILURE;
3933                         }
3934                         /* allow other readers/writers during transfer*/
3935                         VS_LOCK(vs);
3936                         vs->vs_xfer_pending = FALSE;
3937                         VS_UNLOCK(vs);
3938                         vs_finish_write(vs);
3939                         VS_LOCK(vs);
3940                         vs->vs_xfer_pending = TRUE;
3941                         vs_wait_for_sync_writers(vs);
3942                         vs_start_write(vs);
3943                         vs_wait_for_readers(vs);
3944                         VS_UNLOCK(vs);
3945                         if (vs->vs_indirect) {
3946                                 goto vs_changed;
3947                         }
3948                 }
3949         }
3950
3951         VS_LOCK(vs);
3952         vs->vs_xfer_pending = FALSE;
3953         VS_UNLOCK(vs);
3954         vs_finish_write(vs);
3955         return KERN_SUCCESS;
3956 }
3957
3958
3959
3960 vs_map_t
3961 vs_get_map_entry(
3962         vstruct_t       vs,
3963         dp_offset_t     offset)
3964 {
3965         struct vs_map   *vsmap;
3966         dp_offset_t     cluster;
3967
3968         cluster = atop_32(offset) >> vs->vs_clshift;
3969         if (vs->vs_indirect) {
3970                 long    ind_block = cluster/CLMAP_ENTRIES;
3971
3972                 /* Is the indirect block allocated? */
3973                 vsmap = vs->vs_imap[ind_block];
3974                 if(vsmap == (vs_map_t) NULL)
3975                         return vsmap;
3976         } else
3977                 vsmap = vs->vs_dmap;
3978         vsmap += cluster%CLMAP_ENTRIES;
3979         return vsmap;
3980 }
3981
3982 kern_return_t
3983 vs_cluster_transfer(
3984         vstruct_t       vs,
3985         dp_offset_t     offset,
3986         dp_size_t       cnt,
3987         upl_t           upl)
3988 {
3989         dp_offset_t             actual_offset;
3990         paging_segment_t        ps;
3991         struct clmap            clmap;
3992         kern_return_t           error = KERN_SUCCESS;
3993         unsigned int            size, size_wanted;
3994         int                     i;
3995         unsigned int            residual = 0;
3996         unsigned int            unavail_size;
3997 //      default_pager_thread_t  *dpt;
3998 //      boolean_t               dealloc;
3999         struct  vs_map          *vsmap_ptr = NULL;
4000         struct  vs_map          read_vsmap;
4001         struct  vs_map          original_read_vsmap;
4002         struct  vs_map          write_vsmap;
4003 //      upl_t                           sync_upl;
4004 //      vm_offset_t                     ioaddr;
4005
4006         /* vs_cluster_transfer reads in the pages of a cluster and
4007          * then writes these pages back to new backing store.  The
4008          * segment the pages are being read from is assumed to have
4009          * been taken off-line and is no longer considered for new
4010          * space requests.
4011          */
4012
4013         /*
4014          * This loop will be executed once per cluster referenced.
4015          * Typically this means once, since it's unlikely that the
4016          * VM system will ask for anything spanning cluster boundaries.
4017          *
4018          * If there are holes in a cluster (in a paging segment), we stop
4019          * reading at the hole, then loop again, hoping to
4020          * find valid pages later in the cluster.  This continues until
4021          * the entire range has been examined, and read, if present.  The
4022          * pages are written as they are read.  If a failure occurs after
4023          * some pages are written the unmap call at the bottom of the loop
4024          * recovers the backing store and the old backing store remains
4025          * in effect.
4026          */
4027
4028         VSM_CLR(write_vsmap);
4029         VSM_CLR(original_read_vsmap);
4030         /* grab the actual object's pages to sync with I/O */
4031         while (cnt && (error == KERN_SUCCESS)) {
4032                 vsmap_ptr = vs_get_map_entry(vs, offset);
4033                 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
4034
4035                 if (actual_offset == (dp_offset_t) -1) {
4036
4037                         /*
4038                          * Nothing left to write in this cluster at least
4039                          * set write cluster information for any previous
4040                          * write, clear for next cluster, if there is one
4041                          */
4042                         unsigned int local_size, clmask, clsize;
4043
4044                         clsize = vm_page_size << vs->vs_clshift;
4045                         clmask = clsize - 1;
4046                         local_size = clsize - (offset & clmask);
4047                         ASSERT(local_size);
4048                         local_size = MIN(local_size, cnt);
4049
4050                         /* This cluster has no data in it beyond what may */
4051                         /* have been found on a previous iteration through */
4052                         /* the loop "write_vsmap" */
4053                         *vsmap_ptr = write_vsmap;
4054                         VSM_CLR(write_vsmap);
4055                         VSM_CLR(original_read_vsmap);
4056
4057                         cnt -= local_size;
4058                         offset += local_size;
4059                         continue;
4060                 }
4061
4062                 /*
4063                  * Count up contiguous available or unavailable
4064                  * pages.
4065                  */
4066                 ps = CLMAP_PS(clmap);
4067                 ASSERT(ps);
4068                 size = 0;
4069                 unavail_size = 0;
4070                 for (i = 0;
4071                      (size < cnt) && (unavail_size < cnt) &&
4072                      (i < CLMAP_NPGS(clmap)); i++) {
4073                         if (CLMAP_ISSET(clmap, i)) {
4074                                 if (unavail_size != 0)
4075                                         break;
4076                                 size += vm_page_size;
4077                                 BS_STAT(ps->ps_bs,
4078                                         ps->ps_bs->bs_pages_in++);
4079                         } else {
4080                                 if (size != 0)
4081                                         break;
4082                                 unavail_size += vm_page_size;
4083                         }
4084                 }
4085
4086                 if (size == 0) {
4087                         ASSERT(unavail_size);
4088                         ps_clunmap(vs, offset, unavail_size);
4089                         cnt -= unavail_size;
4090                         offset += unavail_size;
4091                         if((offset & ((vm_page_size << vs->vs_clshift) - 1))
4092                                 == 0) {
4093                                 /* There is no more to transfer in this
4094                                    cluster
4095                                 */
4096                                 *vsmap_ptr = write_vsmap;
4097                                 VSM_CLR(write_vsmap);
4098                                 VSM_CLR(original_read_vsmap);
4099                         }
4100                         continue;
4101                 }
4102
4103                 if(VSM_ISCLR(original_read_vsmap))
4104                         original_read_vsmap = *vsmap_ptr;
4105
4106                 if(ps->ps_segtype == PS_PARTITION) {
4107                         panic("swap partition not supported\n");
4108                         /*NOTREACHED*/
4109                         error = KERN_FAILURE;
4110                         residual = size;
4111 /*
4112                         NEED TO ISSUE WITH SYNC & NO COMMIT
4113                         error = ps_read_device(ps, actual_offset, &buffer,
4114                                        size, &residual, flags);
4115 */
4116                 } else {
4117                         /* NEED TO ISSUE WITH SYNC & NO COMMIT */
4118                         error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
4119                                         size, &residual,
4120                                         (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0)));
4121                 }
4122
4123                 read_vsmap = *vsmap_ptr;
4124
4125
4126                 /*
4127                  * Adjust counts and put data in new BS.  Optimize for the
4128                  * common case, i.e. no error and/or partial data.
4129                  * If there was an error, then we need to error the entire
4130                  * range, even if some data was successfully read.
4131                  *
4132                  */
4133                 if ((error == KERN_SUCCESS) && (residual == 0)) {
4134
4135                         /*
4136                          * Got everything we asked for, supply the data to
4137                          * the new BS.  Note that as a side effect of supplying
4138                          * the data, the buffer holding the supplied data is
4139                          * deallocated from the pager's address space unless
4140                          * the write is unsuccessful.
4141                          */
4142
4143                         /* note buffer will be cleaned up in all cases by */
4144                         /* internal_cluster_write or if an error on write */
4145                         /* the vm_map_copy_page_discard call              */
4146                         *vsmap_ptr = write_vsmap;
4147
4148                         if(vs_cluster_write(vs, upl, offset,
4149                                         size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
4150                                 error = KERN_FAILURE;
4151                                 if(!(VSM_ISCLR(*vsmap_ptr))) {
4152                                         /* unmap the new backing store object */
4153                                         ps_clunmap(vs, offset, size);
4154                                 }
4155                                 /* original vsmap */
4156                                 *vsmap_ptr = original_read_vsmap;
4157                                 VSM_CLR(write_vsmap);
4158                         } else {
4159                                if((offset + size) &
4160                                         ((vm_page_size << vs->vs_clshift)
4161                                         - 1)) {
4162                                         /* There is more to transfer in this
4163                                            cluster
4164                                         */
4165                                         write_vsmap = *vsmap_ptr;
4166                                         *vsmap_ptr = read_vsmap;
4167                                         ps_clunmap(vs, offset, size);
4168                                 } else {
4169                                         /* discard the old backing object */
4170                                         write_vsmap = *vsmap_ptr;
4171                                         *vsmap_ptr = read_vsmap;
4172                                         ps_clunmap(vs, offset, size);
4173                                         *vsmap_ptr = write_vsmap;
4174                                         VSM_CLR(write_vsmap);
4175                                         VSM_CLR(original_read_vsmap);
4176                                 }
4177                         }
4178                 } else {
4179                         size_wanted = size;
4180                         if (error == KERN_SUCCESS) {
4181                                 if (residual == size) {
4182                                         /*
4183                                          * If a read operation returns no error
4184                                          * and no data moved, we turn it into
4185                                          * an error, assuming we're reading at
4186                                          * or beyond EOF.
4187                                          * Fall through and error the entire
4188                                          * range.
4189                                          */
4190                                         error = KERN_FAILURE;
4191                                         *vsmap_ptr = write_vsmap;
4192                                         if(!(VSM_ISCLR(*vsmap_ptr))) {
4193                                         /* unmap the new backing store object */
4194                                         ps_clunmap(vs, offset, size);
4195                                         }
4196                                         *vsmap_ptr = original_read_vsmap;
4197                                         VSM_CLR(write_vsmap);
4198                                         continue;
4199                                 } else {
4200                                         /*
4201                                          * Otherwise, we have partial read.
4202                                          * This is also considered an error
4203                                          * for the purposes of cluster transfer
4204                                          */
4205                                         error = KERN_FAILURE;
4206                                         *vsmap_ptr = write_vsmap;
4207                                         if(!(VSM_ISCLR(*vsmap_ptr))) {
4208                                         /* unmap the new backing store object */
4209                                         ps_clunmap(vs, offset, size);
4210                                         }
4211                                         *vsmap_ptr = original_read_vsmap;
4212                                         VSM_CLR(write_vsmap);
4213                                         continue;
4214                                 }
4215                         }
4216
4217                 }
4218                 cnt -= size;
4219                 offset += size;
4220
4221         } /* END while (cnt && (error == 0)) */
4222         if(!VSM_ISCLR(write_vsmap))
4223                 *vsmap_ptr = write_vsmap;
4224
4225         return error;
4226 }
4227
4228 kern_return_t
4229 default_pager_add_file(
4230         MACH_PORT_FACE  backing_store,
4231         vnode_ptr_t     vp,
4232         int             record_size,
4233         vm_size_t       size)
4234 {
4235         backing_store_t         bs;
4236         paging_segment_t        ps;
4237         int                     i;
4238         unsigned int            j;
4239         int                     error;
4240
4241         if ((bs = backing_store_lookup(backing_store))
4242             == BACKING_STORE_NULL)
4243                 return KERN_INVALID_ARGUMENT;
4244
4245         PSL_LOCK();
4246         for (i = 0; i <= paging_segment_max; i++) {
4247                 ps = paging_segments[i];
4248                 if (ps == PAGING_SEGMENT_NULL)
4249                         continue;
4250                 if (ps->ps_segtype != PS_FILE)
4251                         continue;
4252
4253                 /*
4254                  * Check for overlap on same device.
4255                  */
4256                 if (ps->ps_vnode == (struct vnode *)vp) {
4257                         PSL_UNLOCK();
4258                         BS_UNLOCK(bs);
4259                         return KERN_INVALID_ARGUMENT;
4260                 }
4261         }
4262         PSL_UNLOCK();
4263
4264         /*
4265          * Set up the paging segment
4266          */
4267         ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
4268         if (ps == PAGING_SEGMENT_NULL) {
4269                 BS_UNLOCK(bs);
4270                 return KERN_RESOURCE_SHORTAGE;
4271         }
4272
4273         ps->ps_segtype = PS_FILE;
4274         ps->ps_vnode = (struct vnode *)vp;
4275         ps->ps_offset = 0;
4276         ps->ps_record_shift = local_log2(vm_page_size / record_size);
4277         assert((dp_size_t) size == size);
4278         ps->ps_recnum = (dp_size_t) size;
4279         ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
4280
4281         ps->ps_pgcount = ps->ps_pgnum;
4282         ps->ps_clshift = local_log2(bs->bs_clsize);
4283         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
4284         ps->ps_special_clusters = 0;
4285         ps->ps_hint = 0;
4286
4287         PS_LOCK_INIT(ps);
4288         ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
4289         if (!ps->ps_bmap) {
4290                 PS_LOCK_DESTROY(ps);
4291                 kfree(ps, sizeof *ps);
4292                 BS_UNLOCK(bs);
4293                 return KERN_RESOURCE_SHORTAGE;
4294         }
4295         for (j = 0; j < ps->ps_ncls; j++) {
4296                 clrbit(ps->ps_bmap, j);
4297         }
4298
4299         if(paging_segment_count == 0) {
4300                 ps->ps_state = PS_EMERGENCY_SEGMENT;
4301                 if(use_emergency_swap_file_first) {
4302                         ps->ps_state |= PS_CAN_USE;
4303                 }
4304                 emergency_segment_backing_store = backing_store;
4305         } else {
4306                 ps->ps_state = PS_CAN_USE;
4307         }
4308
4309         ps->ps_bs = bs;
4310
4311         if ((error = ps_enter(ps)) != 0) {
4312                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
4313                 PS_LOCK_DESTROY(ps);
4314                 kfree(ps, sizeof *ps);
4315                 BS_UNLOCK(bs);
4316                 return KERN_RESOURCE_SHORTAGE;
4317         }
4318
4319         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
4320         bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
4321         PSL_LOCK();
4322         if(IS_PS_OK_TO_USE(ps)) {
4323                 dp_pages_free += ps->ps_pgcount;
4324         } else {
4325                 dp_pages_reserve += ps->ps_pgcount;
4326         }
4327         PSL_UNLOCK();
4328
4329         BS_UNLOCK(bs);
4330
4331         bs_more_space(ps->ps_clcount);
4332
4333         /*
4334          * If the paging segment being activated is not the emergency
4335          * segment and we notice that the emergency segment is being
4336          * used then we help recover it. If all goes well, the
4337          * emergency segment will be back to its original state of
4338          * online but not activated (till it's needed the next time).
4339          */
4340 #if CONFIG_FREEZE
4341         if (!memorystatus_freeze_enabled)
4342 #endif
4343         {
4344                 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4345                 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4346                         if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4347                                 dprintf(("Failed to recover emergency paging segment\n"));
4348                         } else {
4349                                 dprintf(("Recovered emergency paging segment\n"));
4350                         }
4351                 }
4352         }
4353
4354         DP_DEBUG(DEBUG_BS_INTERNAL,
4355                  ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
4356                   device, offset, (dp_size_t) size, record_size,
4357                   ps->ps_record_shift, ps->ps_pgnum));
4358
4359         return KERN_SUCCESS;
4360 }
4361
4362
4363
4364 kern_return_t
4365 ps_read_file(
4366         paging_segment_t        ps,
4367         upl_t                   upl,
4368         upl_offset_t            upl_offset,
4369         dp_offset_t             offset,
4370         upl_size_t              size,
4371         unsigned int            *residualp,
4372         int                     flags)
4373 {
4374         vm_object_offset_t      f_offset;
4375         int                     error = 0;
4376         int                     result;
4377
4378         assert(dp_encryption_inited);
4379
4380         clustered_reads[atop_32(size)]++;
4381
4382         f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4383
4384         /*
4385          * for transfer case we need to pass uploffset and flags
4386          */
4387         assert((upl_size_t) size == size);
4388         error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
4389
4390         /* The vnode_pagein semantic is somewhat at odds with the existing   */
4391         /* device_read semantic.  Partial reads are not experienced at this  */
4392         /* level.  It is up to the bit map code and cluster read code to     */
4393         /* check that requested data locations are actually backed, and the  */
4394         /* pagein code to either read all of the requested data or return an */
4395         /* error. */
4396
4397         if (error)
4398                 result = KERN_FAILURE;
4399         else {
4400                 *residualp = 0;
4401                 result = KERN_SUCCESS;
4402         }
4403         return result;
4404 }
4405
4406 kern_return_t
4407 ps_write_file(
4408         paging_segment_t        ps,
4409         upl_t                   upl,
4410         upl_offset_t            upl_offset,
4411         dp_offset_t             offset,
4412         unsigned int            size,
4413         int                     flags)
4414 {
4415         vm_object_offset_t      f_offset;
4416         kern_return_t           result;
4417
4418         assert(dp_encryption_inited);
4419
4420         clustered_writes[atop_32(size)]++;
4421         f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4422
4423         if (flags & UPL_PAGING_ENCRYPTED) {
4424                 /*
4425                  * ENCRYPTED SWAP:
4426                  * encrypt all the pages that we're going
4427                  * to pageout.
4428                  */
4429                 upl_encrypt(upl, upl_offset, size);
4430         }
4431         assert((upl_size_t) size == size);
4432         if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
4433                 result = KERN_FAILURE;
4434         else
4435                 result = KERN_SUCCESS;
4436
4437         return result;
4438 }
4439
/* Stub: trimming is not implemented here, so there is no state to set up. */
static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data)
{
        (void)data;
}
4444
/* Stub: nothing is ever queued for trimming, so there is nothing to flush. */
static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data)
{
        (void)data;
}
4449
4450 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
4451 {
4452 #pragma unused(data, map, shift, length)
4453 }
4454
4455 kern_return_t
4456 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
4457         int             hi_wat,
4458         int             lo_wat,
4459         int             flags,
4460         MACH_PORT_FACE  trigger_port)
4461 {
4462         MACH_PORT_FACE release = IPC_PORT_NULL;
4463         kern_return_t kr;
4464         clock_sec_t now;
4465         clock_nsec_t nanoseconds_dummy;
4466         static clock_sec_t error_notify = 0;
4467
4468         PSL_LOCK();
4469         if (flags == SWAP_ENCRYPT_ON) {
4470                 /* ENCRYPTED SWAP: turn encryption on */
4471                 release = trigger_port;
4472                 if (!dp_encryption_inited) {
4473                         dp_encryption_inited = TRUE;
4474                         dp_encryption = TRUE;
4475                         kr = KERN_SUCCESS;
4476                 } else {
4477                         kr = KERN_FAILURE;
4478                 }
4479         } else if (flags == SWAP_ENCRYPT_OFF) {
4480                 /* ENCRYPTED SWAP: turn encryption off */
4481                 release = trigger_port;
4482                 if (!dp_encryption_inited) {
4483                         dp_encryption_inited = TRUE;
4484                         dp_encryption = FALSE;
4485                         kr = KERN_SUCCESS;
4486                 } else {
4487                         kr = KERN_FAILURE;
4488                 }
4489         } else if (flags == HI_WAT_ALERT) {
4490                 release = min_pages_trigger_port;
4491 #if CONFIG_FREEZE
4492                 /* High and low water signals aren't applicable when freeze is */
4493                 /* enabled, so release the trigger ports here and return       */
4494                 /* KERN_FAILURE.                                               */
4495                 if (memorystatus_freeze_enabled) {
4496                         if (IP_VALID( trigger_port )){
4497                                 ipc_port_release_send( trigger_port );
4498                         }
4499                         min_pages_trigger_port = IPC_PORT_NULL;
4500                         kr = KERN_FAILURE;
4501                 }
4502                 else
4503 #endif
4504                 {
4505                         min_pages_trigger_port = trigger_port;
4506                         minimum_pages_remaining = hi_wat/vm_page_size;
4507                         bs_low = FALSE;
4508                         kr = KERN_SUCCESS;
4509                 }
4510         } else if (flags ==  LO_WAT_ALERT) {
4511                 release = max_pages_trigger_port;
4512 #if CONFIG_FREEZE
4513                 if (memorystatus_freeze_enabled) {
4514                         if (IP_VALID( trigger_port )){
4515                                 ipc_port_release_send( trigger_port );
4516                         }
4517                         max_pages_trigger_port = IPC_PORT_NULL;
4518                         kr = KERN_FAILURE;
4519                 }
4520                 else
4521 #endif
4522                 {
4523                         max_pages_trigger_port = trigger_port;
4524                         maximum_pages_free = lo_wat/vm_page_size;
4525                         kr = KERN_SUCCESS;
4526                 }
4527         } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4528                 use_emergency_swap_file_first = TRUE;
4529                 release = trigger_port;
4530                 kr = KERN_SUCCESS;
4531         } else if (flags == SWAP_FILE_CREATION_ERROR) {
4532                 release = trigger_port;
4533                 kr = KERN_SUCCESS;
4534                 if( paging_segment_count == 1) {
4535                         use_emergency_swap_file_first = TRUE;
4536                 }
4537                 no_paging_space_action();
4538                 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4539                 if (now > error_notify + 5) {
4540                         dprintf(("Swap File Error.\n"));
4541                         error_notify = now;
4542                 }
4543         } else {
4544                 release = trigger_port;
4545                 kr =  KERN_INVALID_ARGUMENT;
4546         }
4547         PSL_UNLOCK();
4548
4549         if (IP_VALID(release))
4550                 ipc_port_release_send(release);
4551
4552         return kr;
4553 }
4554
4555 /*
4556  * Monitor the amount of available backing store vs. the amount of
4557  * required backing store, notify a listener (if present) when
4558  * backing store may safely be removed.
4559  *
4560  * We attempt to avoid the situation where backing store is
4561  * discarded en masse, as this can lead to thrashing as the
4562  * backing store is compacted.
4563  */
4564
/* Seconds between free-level checks; the monitor re-arms itself with this interval. */
#define PF_INTERVAL     3       /* time between free level checks */
/* A release notification is sent once the qualifying-check count exceeds this. */
#define PF_LATENCY      10      /* number of intervals before release */

/* Consecutive monitor passes on which dp_pages_free exceeded maximum_pages_free. */
static int dp_pages_free_low_count = 0;
/* Thread call that default_pager_backing_store_monitor() reschedules on every pass. */
thread_call_t default_pager_backing_store_monitor_callout;
4570
4571 void
4572 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4573                                                                         __unused thread_call_param_t p2)
4574 {
4575 //      unsigned long long      average;
4576         ipc_port_t              trigger;
4577         uint64_t                deadline;
4578
4579         /*
4580          * We determine whether it will be safe to release some
4581          * backing store by watching the free page level.  If
4582          * it remains below the maximum_pages_free threshold for
4583          * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
4584          * then we deem it safe.
4585          *
4586          * Note that this establishes a maximum rate at which backing
4587          * store will be released, as each notification (currently)
4588          * only results in a single backing store object being
4589          * released.
4590          */
4591         if (dp_pages_free > maximum_pages_free) {
4592                 dp_pages_free_low_count++;
4593         } else {
4594                 dp_pages_free_low_count = 0;
4595         }
4596
4597         /* decide whether to send notification */
4598         trigger = IP_NULL;
4599         if (max_pages_trigger_port &&
4600             (backing_store_release_trigger_disable == 0) &&
4601             (dp_pages_free_low_count > PF_LATENCY)) {
4602                 trigger = max_pages_trigger_port;
4603                 max_pages_trigger_port = NULL;
4604         }
4605
4606         /* send notification */
4607         if (trigger != IP_NULL) {
4608                 VSL_LOCK();
4609                 if(backing_store_release_trigger_disable != 0) {
4610                         assert_wait((event_t)
4611                                     &backing_store_release_trigger_disable,
4612                                     THREAD_UNINT);
4613                         VSL_UNLOCK();
4614                         thread_block(THREAD_CONTINUE_NULL);
4615                 } else {
4616                         VSL_UNLOCK();
4617                 }
4618                 dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));
4619
4620                 default_pager_space_alert(trigger, LO_WAT_ALERT);
4621                 ipc_port_release_send(trigger);
4622                 dp_pages_free_low_count = 0;
4623         }
4624
4625         clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4626         thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4627 }
4628
4629 #if CONFIG_FREEZE
4630 unsigned int default_pager_swap_pages_free() {
4631         return dp_pages_free;
4632 }
4633 #endif