osfmk/default_pager/dp_backing_store.c

   1 /*
   2  * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56
  57 /*
  58  *      Default Pager.
  59  *              Paging File Management.
  60  */
  61
  62 #include <mach/host_priv.h>
  63 #include <mach/memory_object_control.h>
  64 #include <mach/memory_object_server.h>
  65 #include <mach/upl.h>
  66 #include <default_pager/default_pager_internal.h>
  67 #include <default_pager/default_pager_alerts.h>
  68 #include <default_pager/default_pager_object_server.h>
  69
  70 #include <ipc/ipc_types.h>
  71 #include <ipc/ipc_port.h>
  72 #include <ipc/ipc_space.h>
  73
  74 #include <kern/kern_types.h>
  75 #include <kern/host.h>
  76 #include <kern/queue.h>
  77 #include <kern/counters.h>
  78 #include <kern/sched_prim.h>
  79
  80 #include <vm/vm_kern.h>
  81 #include <vm/vm_pageout.h>
  82 #include <vm/vm_map.h>
  83 #include <vm/vm_object.h>
  84 #include <vm/vm_protos.h>
  85
  86
  87 /* todo - need large internal object support */
  88
  89 /*
  90  * ALLOC_STRIDE... the maximum number of bytes allocated from
  91  * a swap file before moving on to the next swap file... if
  92  * all swap files reside on a single disk, this value should
  93  * be very large (this is the default assumption)... if the
  94  * swap files are spread across multiple disks, than this value
  95  * should be small (128 * 1024)...
  96  *
  97  * This should be determined dynamically in the future
  98  */
  99
 100 #define ALLOC_STRIDE  (1024 * 1024 * 1024)
 101 int physical_transfer_cluster_count = 0;
 102
 103 #define VM_SUPER_CLUSTER        0x40000
 104 #define VM_SUPER_PAGES          (VM_SUPER_CLUSTER / PAGE_SIZE)
 105
 106 /*
 107  * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 108  * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 109  */
 110 #define VSTRUCT_MIN_CLSHIFT     0
 111
 112 #define VSTRUCT_DEF_CLSHIFT     2
 113 int default_pager_clsize = 0;
 114
 115 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
 116
 117 /* statistics */
 118 unsigned int clustered_writes[VM_SUPER_PAGES+1];
 119 unsigned int clustered_reads[VM_SUPER_PAGES+1];
 120
 121 /*
 122  * Globals used for asynchronous paging operations:
 123  *      vs_async_list:  head of list of to-be-completed I/O ops
 124  *      async_num_queued: number of pages completed, but not yet
 125  *              processed by async thread.
 126  *      async_requests_out: number of pages of requests not completed.
 127  */
 128
 129 #if 0
 130 struct vs_async *vs_async_list;
 131 int     async_num_queued;
 132 int     async_requests_out;
 133 #endif
 134
 135
 136 #define VS_ASYNC_REUSE 1
 137 struct vs_async *vs_async_free_list;
 138
 139 lck_mtx_t       default_pager_async_lock;       /* Protects globals above */
 140
 141
 142 int vs_alloc_async_failed = 0;                  /* statistics */
 143 int vs_alloc_async_count = 0;                   /* statistics */
 144 struct vs_async *vs_alloc_async(void);          /* forward */
 145 void vs_free_async(struct vs_async *vsa);       /* forward */
 146
 147
 148 #define VS_ALLOC_ASYNC()        vs_alloc_async()
 149 #define VS_FREE_ASYNC(vsa)      vs_free_async(vsa)
 150
 151 #define VS_ASYNC_LOCK()         lck_mtx_lock(&default_pager_async_lock)
 152 #define VS_ASYNC_UNLOCK()       lck_mtx_unlock(&default_pager_async_lock)
 153 #define VS_ASYNC_LOCK_INIT()    lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
 154 #define VS_ASYNC_LOCK_ADDR()    (&default_pager_async_lock)
 155 /*
 156  *  Paging Space Hysteresis triggers and the target notification port
 157  *
 158  */
 159 unsigned int    dp_pages_free_drift_count = 0;
 160 unsigned int    dp_pages_free_drifted_max = 0;
 161 unsigned int    minimum_pages_remaining = 0;
 162 unsigned int    maximum_pages_free = 0;
 163 ipc_port_t      min_pages_trigger_port = NULL;
 164 ipc_port_t      max_pages_trigger_port = NULL;
 165
 166 #if CONFIG_FREEZE
 167 boolean_t       use_emergency_swap_file_first = TRUE;
 168 #else
 169 boolean_t       use_emergency_swap_file_first = FALSE;
 170 #endif
 171 boolean_t       bs_low = FALSE;
 172 int             backing_store_release_trigger_disable = 0;
 173 boolean_t       backing_store_stop_compaction = FALSE;
 174 boolean_t       backing_store_abort_compaction = FALSE;
 175
 176 /* Have we decided if swap needs to be encrypted yet ? */
 177 boolean_t       dp_encryption_inited = FALSE;
 178 /* Should we encrypt swap ? */
 179 boolean_t       dp_encryption = FALSE;
 180
 181 boolean_t       dp_isssd = FALSE;
 182
 183 /*
 184  * Object sizes are rounded up to the next power of 2,
 185  * unless they are bigger than a given maximum size.
 186  */
 187 vm_size_t       max_doubled_size = 4 * 1024 * 1024;     /* 4 meg */
 188
 189 /*
 190  * List of all backing store and segments.
 191  */
 192 MACH_PORT_FACE          emergency_segment_backing_store;
 193 struct backing_store_list_head backing_store_list;
 194 paging_segment_t        paging_segments[MAX_NUM_PAGING_SEGMENTS];
 195 lck_mtx_t                       paging_segments_lock;
 196 int                     paging_segment_max = 0;
 197 int                     paging_segment_count = 0;
 198 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
 199
 200
 201 /*
 202  * Total pages free in system
 203  * This differs from clusters committed/avail which is a measure of the
 204  * over commitment of paging segments to backing store.  An idea which is
 205  * likely to be deprecated.
 206  */
 207 unsigned  int   dp_pages_free = 0;
 208 unsigned  int   dp_pages_reserve = 0;
 209 unsigned  int   cluster_transfer_minimum = 100;
 210
 211 /*
 212  * Trim state
 213  */
 214 struct ps_vnode_trim_data {
 215         struct vnode *vp;
 216         dp_offset_t   offset;
 217         dp_size_t     length;
 218 };
 219
 220 /* forward declarations */
 221 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int);     /* forward */
 222 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int);     /* forward */
 223 default_pager_thread_t *get_read_buffer( void );
 224 kern_return_t ps_vstruct_transfer_from_segment(
 225         vstruct_t        vs,
 226         paging_segment_t segment,
 227         upl_t            upl);
 228 kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);  /* forward */
 229 kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *);     /* forward */
 230 kern_return_t vs_cluster_transfer(
 231         vstruct_t       vs,
 232         dp_offset_t     offset,
 233         dp_size_t       cnt,
 234         upl_t           upl);
 235 vs_map_t vs_get_map_entry(
 236         vstruct_t       vs,
 237         dp_offset_t     offset);
 238
 239 kern_return_t
 240 default_pager_backing_store_delete_internal( MACH_PORT_FACE );
 241
 242 static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
 243 static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
 244 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);
 245
 246 default_pager_thread_t *
 247 get_read_buffer( void )
 248 {
 249         int     i;
 250
 251         DPT_LOCK(dpt_lock);
 252         while(TRUE) {
 253                 for (i=0; i<default_pager_internal_count; i++) {
 254                         if(dpt_array[i]->checked_out == FALSE) {
 255                           dpt_array[i]->checked_out = TRUE;
 256                           DPT_UNLOCK(dpt_lock);
 257                           return  dpt_array[i];
 258                         }
 259                 }
 260                 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
 261         }
 262 }
 263
 264 void
 265 bs_initialize(void)
 266 {
 267         int i;
 268
 269         /*
 270          * List of all backing store.
 271          */
 272         BSL_LOCK_INIT();
 273         queue_init(&backing_store_list.bsl_queue);
 274         PSL_LOCK_INIT();
 275
 276         VS_ASYNC_LOCK_INIT();
 277 #if     VS_ASYNC_REUSE
 278         vs_async_free_list = NULL;
 279 #endif  /* VS_ASYNC_REUSE */
 280
 281         for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
 282                 clustered_writes[i] = 0;
 283                 clustered_reads[i] = 0;
 284         }
 285
 286 }
 287
 288 /*
 289  * When things do not quite workout...
 290  */
 291 void bs_no_paging_space(boolean_t);     /* forward */
 292
 293 void
 294 bs_no_paging_space(
 295         boolean_t out_of_memory)
 296 {
 297
 298         if (out_of_memory)
 299                 dprintf(("*** OUT OF MEMORY ***\n"));
 300         panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
 301 }
 302
 303 void bs_more_space(int);        /* forward */
 304 void bs_commit(int);            /* forward */
 305
 306 boolean_t       user_warned = FALSE;
 307 unsigned int    clusters_committed = 0;
 308 unsigned int    clusters_available = 0;
 309 unsigned int    clusters_committed_peak = 0;
 310
 311 void
 312 bs_more_space(
 313         int     nclusters)
 314 {
 315         BSL_LOCK();
 316         /*
 317          * Account for new paging space.
 318          */
 319         clusters_available += nclusters;
 320
 321         if (clusters_available >= clusters_committed) {
 322                 if (verbose && user_warned) {
 323                         printf("%s%s - %d excess clusters now.\n",
 324                                my_name,
 325                                "paging space is OK now",
 326                                clusters_available - clusters_committed);
 327                         user_warned = FALSE;
 328                         clusters_committed_peak = 0;
 329                 }
 330         } else {
 331                 if (verbose && user_warned) {
 332                         printf("%s%s - still short of %d clusters.\n",
 333                                my_name,
 334                                "WARNING: paging space over-committed",
 335                                clusters_committed - clusters_available);
 336                         clusters_committed_peak -= nclusters;
 337                 }
 338         }
 339         BSL_UNLOCK();
 340
 341         return;
 342 }
 343
 344 void
 345 bs_commit(
 346         int     nclusters)
 347 {
 348         BSL_LOCK();
 349         clusters_committed += nclusters;
 350         if (clusters_committed > clusters_available) {
 351                 if (verbose && !user_warned) {
 352                         user_warned = TRUE;
 353                         printf("%s%s - short of %d clusters.\n",
 354                                my_name,
 355                                "WARNING: paging space over-committed",
 356                                clusters_committed - clusters_available);
 357                 }
 358                 if (clusters_committed > clusters_committed_peak) {
 359                         clusters_committed_peak = clusters_committed;
 360                 }
 361         } else {
 362                 if (verbose && user_warned) {
 363                         printf("%s%s - was short of up to %d clusters.\n",
 364                                my_name,
 365                                "paging space is OK now",
 366                                clusters_committed_peak - clusters_available);
 367                         user_warned = FALSE;
 368                         clusters_committed_peak = 0;
 369                 }
 370         }
 371         BSL_UNLOCK();
 372
 373         return;
 374 }
 375
 376 int default_pager_info_verbose = 1;
 377
 378 void
 379 bs_global_info(
 380         uint64_t        *totalp,
 381         uint64_t        *freep)
 382 {
 383         uint64_t                pages_total, pages_free;
 384         paging_segment_t        ps;
 385         int                     i;
 386
 387         PSL_LOCK();
 388         pages_total = pages_free = 0;
 389         for (i = 0; i <= paging_segment_max; i++) {
 390                 ps = paging_segments[i];
 391                 if (ps == PAGING_SEGMENT_NULL)
 392                         continue;
 393
 394                 /*
 395                  * no need to lock: by the time this data
 396                  * gets back to any remote requestor it
 397                  * will be obsolete anyways
 398                  */
 399                 pages_total += ps->ps_pgnum;
 400                 pages_free += ps->ps_clcount << ps->ps_clshift;
 401                 DP_DEBUG(DEBUG_BS_INTERNAL,
 402                          ("segment #%d: %d total, %d free\n",
 403                           i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
 404         }
 405         *totalp = pages_total;
 406         *freep = pages_free;
 407         if (verbose && user_warned && default_pager_info_verbose) {
 408                 if (clusters_available < clusters_committed) {
 409                         printf("%s %d clusters committed, %d available.\n",
 410                                my_name,
 411                                clusters_committed,
 412                                clusters_available);
 413                 }
 414         }
 415         PSL_UNLOCK();
 416 }
 417
 418 backing_store_t backing_store_alloc(void);      /* forward */
 419
 420 backing_store_t
 421 backing_store_alloc(void)
 422 {
 423         backing_store_t bs;
 424
 425         bs = (backing_store_t) kalloc(sizeof (struct backing_store));
 426         if (bs == BACKING_STORE_NULL)
 427                 panic("backing_store_alloc: no memory");
 428
 429         BS_LOCK_INIT(bs);
 430         bs->bs_port = MACH_PORT_NULL;
 431         bs->bs_priority = 0;
 432         bs->bs_clsize = 0;
 433         bs->bs_pages_total = 0;
 434         bs->bs_pages_in = 0;
 435         bs->bs_pages_in_fail = 0;
 436         bs->bs_pages_out = 0;
 437         bs->bs_pages_out_fail = 0;
 438
 439         return bs;
 440 }
 441
 442 backing_store_t backing_store_lookup(MACH_PORT_FACE);   /* forward */
 443
 444 /* Even in both the component space and external versions of this pager, */
 445 /* backing_store_lookup will be called from tasks in the application space */
 446 backing_store_t
 447 backing_store_lookup(
 448         MACH_PORT_FACE port)
 449 {
 450         backing_store_t bs;
 451
 452 /*
 453         port is currently backed with a vs structure in the alias field
 454         we could create an ISBS alias and a port_is_bs call but frankly
 455         I see no reason for the test, the bs->port == port check below
 456         will work properly on junk entries.
 457
 458         if ((port == MACH_PORT_NULL) || port_is_vs(port))
 459 */
 460         if (port == MACH_PORT_NULL)
 461                 return BACKING_STORE_NULL;
 462
 463         BSL_LOCK();
 464         queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
 465                       bs_links) {
 466                 BS_LOCK(bs);
 467                 if (bs->bs_port == port) {
 468                         BSL_UNLOCK();
 469                         /* Success, return it locked. */
 470                         return bs;
 471                 }
 472                 BS_UNLOCK(bs);
 473         }
 474         BSL_UNLOCK();
 475         return BACKING_STORE_NULL;
 476 }
 477
 478 void backing_store_add(backing_store_t);        /* forward */
 479
 480 void
 481 backing_store_add(
 482         __unused backing_store_t bs)
 483 {
 484 //      MACH_PORT_FACE          port = bs->bs_port;
 485 //      MACH_PORT_FACE          pset = default_pager_default_set;
 486         kern_return_t           kr = KERN_SUCCESS;
 487
 488         if (kr != KERN_SUCCESS)
 489                 panic("backing_store_add: add to set");
 490
 491 }
 492
 493 /*
 494  * Set up default page shift, but only if not already
 495  * set and argument is within range.
 496  */
 497 boolean_t
 498 bs_set_default_clsize(unsigned int npages)
 499 {
 500         switch(npages){
 501             case 1:
 502             case 2:
 503             case 4:
 504             case 8:
 505                 if (default_pager_clsize == 0)  /* if not yet set */
 506                         vstruct_def_clshift = local_log2(npages);
 507                 return(TRUE);
 508         }
 509         return(FALSE);
 510 }
 511
 512 int bs_get_global_clsize(int clsize);   /* forward */
 513
 514 int
 515 bs_get_global_clsize(
 516         int     clsize)
 517 {
 518         int                     i;
 519         memory_object_default_t dmm;
 520         kern_return_t           kr;
 521
 522         /*
 523          * Only allow setting of cluster size once. If called
 524          * with no cluster size (default), we use the compiled-in default
 525          * for the duration. The same cluster size is used for all
 526          * paging segments.
 527          */
 528         if (default_pager_clsize == 0) {
 529                 /*
 530                  * Keep cluster size in bit shift because it's quicker
 531                  * arithmetic, and easier to keep at a power of 2.
 532                  */
 533                 if (clsize != NO_CLSIZE) {
 534                         for (i = 0; (1 << i) < clsize; i++);
 535                         if (i > MAX_CLUSTER_SHIFT)
 536                                 i = MAX_CLUSTER_SHIFT;
 537                         vstruct_def_clshift = i;
 538                 }
 539                 default_pager_clsize = (1 << vstruct_def_clshift);
 540
 541                 /*
 542                  * Let the user know the new (and definitive) cluster size.
 543                  */
 544                 if (verbose)
 545                         printf("%scluster size = %d page%s\n",
 546                                 my_name, default_pager_clsize,
 547                                 (default_pager_clsize == 1) ? "" : "s");
 548
 549                 /*
 550                  * Let the kernel know too, in case it hasn't used the
 551                  * default value provided in main() yet.
 552                  */
 553                 dmm = default_pager_object;
 554                 clsize = default_pager_clsize * vm_page_size;   /* in bytes */
 555                 kr = host_default_memory_manager(host_priv_self(),
 556                                                  &dmm,
 557                                                  clsize);
 558                 memory_object_default_deallocate(dmm);
 559
 560                 if (kr != KERN_SUCCESS) {
 561                    panic("bs_get_global_cl_size:host_default_memory_manager");
 562                 }
 563                 if (dmm != default_pager_object) {
 564                   panic("bs_get_global_cl_size:there is another default pager");
 565                 }
 566         }
 567         ASSERT(default_pager_clsize > 0 &&
 568                (default_pager_clsize & (default_pager_clsize - 1)) == 0);
 569
 570         return default_pager_clsize;
 571 }
 572
 573 kern_return_t
 574 default_pager_backing_store_create(
 575         memory_object_default_t pager,
 576         int                     priority,
 577         int                     clsize,         /* in bytes */
 578         MACH_PORT_FACE          *backing_store)
 579 {
 580         backing_store_t bs;
 581         MACH_PORT_FACE  port;
 582 //      kern_return_t   kr;
 583         struct vstruct_alias *alias_struct;
 584
 585         if (pager != default_pager_object)
 586                 return KERN_INVALID_ARGUMENT;
 587
 588         bs = backing_store_alloc();
 589         port = ipc_port_alloc_kernel();
 590         ipc_port_make_send(port);
 591         assert (port != IP_NULL);
 592
 593         DP_DEBUG(DEBUG_BS_EXTERNAL,
 594                  ("priority=%d clsize=%d bs_port=0x%x\n",
 595                   priority, clsize, (int) backing_store));
 596
 597         alias_struct = (struct vstruct_alias *)
 598                                 kalloc(sizeof (struct vstruct_alias));
 599         if(alias_struct != NULL) {
 600                 alias_struct->vs = (struct vstruct *)bs;
 601                 alias_struct->name = &default_pager_ops;
 602                 port->alias = (uintptr_t) alias_struct;
 603         }
 604         else {
 605                 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
 606                 kfree(bs, sizeof (struct backing_store));
 607                 return KERN_RESOURCE_SHORTAGE;
 608         }
 609
 610         bs->bs_port = port;
 611         if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
 612                 priority = BS_MAXPRI;
 613         else if (priority == BS_NOPRI)
 614                 priority = BS_MAXPRI;
 615         else
 616                 priority = BS_MINPRI;
 617         bs->bs_priority = priority;
 618
 619         bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
 620
 621         BSL_LOCK();
 622         queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
 623                     bs_links);
 624         BSL_UNLOCK();
 625
 626         backing_store_add(bs);
 627
 628         *backing_store = port;
 629         return KERN_SUCCESS;
 630 }
 631
 632 kern_return_t
 633 default_pager_backing_store_info(
 634         MACH_PORT_FACE          backing_store,
 635         backing_store_flavor_t  flavour,
 636         backing_store_info_t    info,
 637         mach_msg_type_number_t  *size)
 638 {
 639         backing_store_t                 bs;
 640         backing_store_basic_info_t      basic;
 641         int                             i;
 642         paging_segment_t                ps;
 643
 644         if (flavour != BACKING_STORE_BASIC_INFO ||
 645             *size < BACKING_STORE_BASIC_INFO_COUNT)
 646                 return KERN_INVALID_ARGUMENT;
 647
 648         basic = (backing_store_basic_info_t)info;
 649         *size = BACKING_STORE_BASIC_INFO_COUNT;
 650
 651         VSTATS_LOCK(&global_stats.gs_lock);
 652         basic->pageout_calls    = global_stats.gs_pageout_calls;
 653         basic->pagein_calls     = global_stats.gs_pagein_calls;
 654         basic->pages_in         = global_stats.gs_pages_in;
 655         basic->pages_out        = global_stats.gs_pages_out;
 656         basic->pages_unavail    = global_stats.gs_pages_unavail;
 657         basic->pages_init       = global_stats.gs_pages_init;
 658         basic->pages_init_writes= global_stats.gs_pages_init_writes;
 659         VSTATS_UNLOCK(&global_stats.gs_lock);
 660
 661         if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
 662                 return KERN_INVALID_ARGUMENT;
 663
 664         basic->bs_pages_total   = bs->bs_pages_total;
 665         PSL_LOCK();
 666         bs->bs_pages_free = 0;
 667         for (i = 0; i <= paging_segment_max; i++) {
 668                 ps = paging_segments[i];
 669                 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
 670                         PS_LOCK(ps);
 671                         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
 672                         PS_UNLOCK(ps);
 673                 }
 674         }
 675         PSL_UNLOCK();
 676         basic->bs_pages_free    = bs->bs_pages_free;
 677         basic->bs_pages_in      = bs->bs_pages_in;
 678         basic->bs_pages_in_fail = bs->bs_pages_in_fail;
 679         basic->bs_pages_out     = bs->bs_pages_out;
 680         basic->bs_pages_out_fail= bs->bs_pages_out_fail;
 681
 682         basic->bs_priority      = bs->bs_priority;
 683         basic->bs_clsize        = ptoa_32(bs->bs_clsize);       /* in bytes */
 684
 685         BS_UNLOCK(bs);
 686
 687         return KERN_SUCCESS;
 688 }
 689
 690 int ps_delete(paging_segment_t);        /* forward */
 691 boolean_t current_thread_aborted(void);
 692
 693 int
 694 ps_delete(
 695         paging_segment_t ps)
 696 {
 697         vstruct_t       vs;
 698         kern_return_t   error = KERN_SUCCESS;
 699         int             vs_count;
 700
 701         VSL_LOCK();             /* get the lock on the list of vs's      */
 702
 703         /* The lock relationship and sequence is farily complicated      */
 704         /* this code looks at a live list, locking and unlocking the list */
 705         /* as it traverses it.  It depends on the locking behavior of    */
 706         /* default_pager_no_senders.  no_senders always locks the vstruct */
 707         /* targeted for removal before locking the vstruct list.  However */
 708         /* it will remove that member of the list without locking its    */
 709         /* neighbors.  We can be sure when we hold a lock on a vstruct   */
 710         /* it cannot be removed from the list but we must hold the list  */
 711         /* lock to be sure that its pointers to its neighbors are valid. */
 712         /* Also, we can hold off destruction of a vstruct when the list  */
 713         /* lock and the vs locks are not being held by bumping the       */
 714         /* vs_async_pending count.      */
 715
 716
 717         while(backing_store_release_trigger_disable != 0) {
 718                 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
 719         }
 720
 721         /* we will choose instead to hold a send right */
 722         vs_count = vstruct_list.vsl_count;
 723         vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
 724         if(vs == (vstruct_t)&vstruct_list)  {
 725                 VSL_UNLOCK();
 726                 return KERN_SUCCESS;
 727         }
 728         VS_LOCK(vs);
 729         vs_async_wait(vs);  /* wait for any pending async writes */
 730         if ((vs_count != 0) && (vs != NULL))
 731                 vs->vs_async_pending += 1;  /* hold parties calling  */
 732                                             /* vs_async_wait */
 733
 734         if (bs_low == FALSE)
 735                 backing_store_abort_compaction = FALSE;
 736
 737         VS_UNLOCK(vs);
 738         VSL_UNLOCK();
 739         while((vs_count != 0) && (vs != NULL)) {
 740                 /* We take the count of AMO's before beginning the         */
 741                 /* transfer of of the target segment.                      */
 742                 /* We are guaranteed that the target segment cannot get    */
 743                 /* more users.  We also know that queue entries are        */
 744                 /* made at the back of the list.  If some of the entries   */
 745                 /* we would check disappear while we are traversing the    */
 746                 /* list then we will either check new entries which        */
 747                 /* do not have any backing store in the target segment     */
 748                 /* or re-check old entries.  This might not be optimal     */
 749                 /* but it will always be correct. The alternative is to    */
 750                 /* take a snapshot of the list.                            */
 751                 vstruct_t       next_vs;
 752
 753                 if(dp_pages_free < cluster_transfer_minimum)
 754                         error = KERN_FAILURE;
 755                 else {
 756                         vm_object_t     transfer_object;
 757                         unsigned int    count;
 758                         upl_t           upl;
 759                         int             upl_flags;
 760
 761                         transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
 762                         count = 0;
 763                         upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
 764                                      UPL_SET_LITE | UPL_SET_INTERNAL);
 765                         if (dp_encryption) {
 766                                 /* mark the pages as "encrypted" when they come in */
 767                                 upl_flags |= UPL_ENCRYPT;
 768                         }
 769                         error = vm_object_upl_request(transfer_object,
 770                                 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
 771                                 &upl, NULL, &count, upl_flags);
 772
 773                         if(error == KERN_SUCCESS) {
 774                                 error = ps_vstruct_transfer_from_segment(
 775                                                         vs, ps, upl);
 776                                 upl_commit(upl, NULL, 0);
 777                                 upl_deallocate(upl);
 778                         } else {
 779                                 error = KERN_FAILURE;
 780                         }
 781                         vm_object_deallocate(transfer_object);
 782                 }
 783                 if(error || current_thread_aborted()) {
 784                         VS_LOCK(vs);
 785                         vs->vs_async_pending -= 1;  /* release vs_async_wait */
 786                         if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
 787                                 vs->vs_waiting_async = FALSE;
 788                                 VS_UNLOCK(vs);
 789                                 thread_wakeup(&vs->vs_async_pending);
 790                         } else {
 791                                 VS_UNLOCK(vs);
 792                         }
 793                         return KERN_FAILURE;
 794                 }
 795
 796                 VSL_LOCK();
 797
 798                 while(backing_store_release_trigger_disable != 0) {
 799                         VSL_SLEEP(&backing_store_release_trigger_disable,
 800                                   THREAD_UNINT);
 801                 }
 802
 803                 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
 804                 if((next_vs != (vstruct_t)&vstruct_list) &&
 805                                 (vs != next_vs) && (vs_count != 1)) {
 806                         VS_LOCK(next_vs);
 807                         vs_async_wait(next_vs);  /* wait for any  */
 808                                                  /* pending async writes */
 809                         next_vs->vs_async_pending += 1; /* hold parties  */
 810                                                 /* calling vs_async_wait */
 811                         VS_UNLOCK(next_vs);
 812                 }
 813                 VSL_UNLOCK();
 814                 VS_LOCK(vs);
 815                 vs->vs_async_pending -= 1;
 816                 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
 817                         vs->vs_waiting_async = FALSE;
 818                         VS_UNLOCK(vs);
 819                         thread_wakeup(&vs->vs_async_pending);
 820                 } else {
 821                         VS_UNLOCK(vs);
 822                 }
 823                 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
 824                         vs = NULL;
 825                 else
 826                         vs = next_vs;
 827                 vs_count--;
 828         }
 829         return KERN_SUCCESS;
 830 }
 831
 832
 833 kern_return_t
 834 default_pager_backing_store_delete_internal(
 835         MACH_PORT_FACE backing_store)
 836 {
 837         backing_store_t         bs;
 838         int                     i;
 839         paging_segment_t        ps;
 840         int                     error;
 841         int                     interim_pages_removed = 0;
 842         boolean_t               dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
 843
 844         if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
 845                 return KERN_INVALID_ARGUMENT;
 846
 847 restart:
 848         PSL_LOCK();
 849         error = KERN_SUCCESS;
 850         for (i = 0; i <= paging_segment_max; i++) {
 851                 ps = paging_segments[i];
 852                 if (ps != PAGING_SEGMENT_NULL &&
 853                     ps->ps_bs == bs &&
 854                     ! IS_PS_GOING_AWAY(ps)) {
 855                         PS_LOCK(ps);
 856
 857                         if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
 858                         /*
 859                          * Someone is already busy reclamining this paging segment.
 860                          * If it's the emergency segment we are looking at then check
 861                          * that someone has not already recovered it and set the right
 862                          * state i.e. online but not activated.
 863                          */
 864                                 PS_UNLOCK(ps);
 865                                 continue;
 866                         }
 867
 868                         /* disable access to this segment */
 869                         ps->ps_state &= ~PS_CAN_USE;
 870                         ps->ps_state |= PS_GOING_AWAY;
 871                         PS_UNLOCK(ps);
 872                         /*
 873                          * The "ps" segment is "off-line" now,
 874                          * we can try and delete it...
 875                          */
 876                         if(dp_pages_free < (cluster_transfer_minimum
 877                                                         + ps->ps_pgcount)) {
 878                                 error = KERN_FAILURE;
 879                                 PSL_UNLOCK();
 880                         }
 881                         else {
 882                                 /* remove all pages associated with the  */
 883                                 /* segment from the list of free pages   */
 884                                 /* when transfer is through, all target  */
 885                                 /* segment pages will appear to be free  */
 886
 887                                 dp_pages_free -=  ps->ps_pgcount;
 888                                 interim_pages_removed += ps->ps_pgcount;
 889                                 PSL_UNLOCK();
 890                                 error = ps_delete(ps);
 891                         }
 892                         if (error != KERN_SUCCESS) {
 893                                 /*
 894                                  * We couldn't delete the segment,
 895                                  * probably because there's not enough
 896                                  * virtual memory left.
 897                                  * Re-enable all the segments.
 898                                  */
 899                                 PSL_LOCK();
 900                                 break;
 901                         }
 902                         goto restart;
 903                 }
 904         }
 905
 906         if (error != KERN_SUCCESS) {
 907                 for (i = 0; i <= paging_segment_max; i++) {
 908                         ps = paging_segments[i];
 909                         if (ps != PAGING_SEGMENT_NULL &&
 910                             ps->ps_bs == bs &&
 911                             IS_PS_GOING_AWAY(ps)) {
 912                                 PS_LOCK(ps);
 913
 914                                 if( !IS_PS_GOING_AWAY(ps)) {
 915                                         PS_UNLOCK(ps);
 916                                         continue;
 917                                 }
 918                                 /* Handle the special clusters that came in while we let go the lock*/
 919                                 if( ps->ps_special_clusters) {
 920                                         dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
 921                                         ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
 922                                         ps->ps_clcount += ps->ps_special_clusters;
 923                                         if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
 924                                                 ps_select_array[ps->ps_bs->bs_priority] = 0;
 925                                         }
 926                                         ps->ps_special_clusters = 0;
 927                                 }
 928                                 /* re-enable access to this segment */
 929                                 ps->ps_state &= ~PS_GOING_AWAY;
 930                                 ps->ps_state |= PS_CAN_USE;
 931                                 PS_UNLOCK(ps);
 932                         }
 933                 }
 934                 dp_pages_free += interim_pages_removed;
 935                 PSL_UNLOCK();
 936                 BS_UNLOCK(bs);
 937                 return error;
 938         }
 939
 940         for (i = 0; i <= paging_segment_max; i++) {
 941                 ps = paging_segments[i];
 942                 if (ps != PAGING_SEGMENT_NULL &&
 943                     ps->ps_bs == bs) {
 944                         if(IS_PS_GOING_AWAY(ps)) {
 945                                 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
 946                                         PS_LOCK(ps);
 947                                         ps->ps_state &= ~PS_GOING_AWAY;
 948                                         ps->ps_special_clusters = 0;
 949                                         ps->ps_pgcount = ps->ps_pgnum;
 950                                         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
 951                                         dp_pages_reserve += ps->ps_pgcount;
 952                                         PS_UNLOCK(ps);
 953                                 } else {
 954                                         paging_segments[i] = PAGING_SEGMENT_NULL;
 955                                         paging_segment_count--;
 956                                         PS_LOCK(ps);
 957                                         kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
 958                                         kfree(ps, sizeof *ps);
 959                                 }
 960                         }
 961                 }
 962         }
 963
 964         /* Scan the entire ps array separately to make certain we find the */
 965         /* proper paging_segment_max                                       */
 966         for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
 967                 if(paging_segments[i] != PAGING_SEGMENT_NULL)
 968                    paging_segment_max = i;
 969         }
 970
 971         PSL_UNLOCK();
 972
 973         if( dealing_with_emergency_segment ) {
 974                 BS_UNLOCK(bs);
 975                 return KERN_SUCCESS;
 976         }
 977
 978         /*
 979          * All the segments have been deleted.
 980          * We can remove the backing store.
 981          */
 982
 983         /*
 984          * Disable lookups of this backing store.
 985          */
 986         if((void *)bs->bs_port->alias != NULL)
 987                 kfree((void *) bs->bs_port->alias,
 988                       sizeof (struct vstruct_alias));
 989         ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
 990         bs->bs_port = MACH_PORT_NULL;
 991         BS_UNLOCK(bs);
 992
 993         /*
 994          * Remove backing store from backing_store list.
 995          */
 996         BSL_LOCK();
 997         queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
 998                      bs_links);
 999         BSL_UNLOCK();
1000
1001         /*
1002          * Free the backing store structure.
1003          */
1004         kfree(bs, sizeof *bs);
1005
1006         return KERN_SUCCESS;
1007 }
1008
1009 kern_return_t
1010 default_pager_backing_store_delete(
1011         MACH_PORT_FACE backing_store)
1012 {
1013         if( backing_store != emergency_segment_backing_store ) {
1014                 default_pager_backing_store_delete_internal(emergency_segment_backing_store);
1015         }
1016         return(default_pager_backing_store_delete_internal(backing_store));
1017 }
1018
1019 int     ps_enter(paging_segment_t);     /* forward */
1020
1021 int
1022 ps_enter(
1023         paging_segment_t ps)
1024 {
1025         int i;
1026
1027         PSL_LOCK();
1028
1029         for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
1030                 if (paging_segments[i] == PAGING_SEGMENT_NULL)
1031                         break;
1032         }
1033
1034         if (i < MAX_NUM_PAGING_SEGMENTS) {
1035                 paging_segments[i] = ps;
1036                 if (i > paging_segment_max)
1037                         paging_segment_max = i;
1038                 paging_segment_count++;
1039                 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1040                         (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1041                         ps_select_array[ps->ps_bs->bs_priority] = 0;
1042                 i = 0;
1043         } else {
1044                 PSL_UNLOCK();
1045                 return KERN_RESOURCE_SHORTAGE;
1046         }
1047
1048         PSL_UNLOCK();
1049         return i;
1050 }
1051
1052 #ifdef DEVICE_PAGING
1053 kern_return_t
1054 default_pager_add_segment(
1055         MACH_PORT_FACE  backing_store,
1056         MACH_PORT_FACE  device,
1057         recnum_t        offset,
1058         recnum_t        count,
1059         int             record_size)
1060 {
1061         backing_store_t         bs;
1062         paging_segment_t        ps;
1063         int                     i;
1064         int                     error;
1065
1066         if ((bs = backing_store_lookup(backing_store))
1067             == BACKING_STORE_NULL)
1068                 return KERN_INVALID_ARGUMENT;
1069
1070         PSL_LOCK();
1071         for (i = 0; i <= paging_segment_max; i++) {
1072                 ps = paging_segments[i];
1073                 if (ps == PAGING_SEGMENT_NULL)
1074                         continue;
1075
1076                 /*
1077                  * Check for overlap on same device.
1078                  */
1079                 if (!(ps->ps_device != device
1080                       || offset >= ps->ps_offset + ps->ps_recnum
1081                       || offset + count <= ps->ps_offset)) {
1082                         PSL_UNLOCK();
1083                         BS_UNLOCK(bs);
1084                         return KERN_INVALID_ARGUMENT;
1085                 }
1086         }
1087         PSL_UNLOCK();
1088
1089         /*
1090          * Set up the paging segment
1091          */
1092         ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1093         if (ps == PAGING_SEGMENT_NULL) {
1094                 BS_UNLOCK(bs);
1095                 return KERN_RESOURCE_SHORTAGE;
1096         }
1097
1098         ps->ps_segtype = PS_PARTITION;
1099         ps->ps_device = device;
1100         ps->ps_offset = offset;
1101         ps->ps_record_shift = local_log2(vm_page_size / record_size);
1102         ps->ps_recnum = count;
1103         ps->ps_pgnum = count >> ps->ps_record_shift;
1104
1105         ps->ps_pgcount = ps->ps_pgnum;
1106         ps->ps_clshift = local_log2(bs->bs_clsize);
1107         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1108         ps->ps_hint = 0;
1109
1110         PS_LOCK_INIT(ps);
1111         ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1112         if (!ps->ps_bmap) {
1113                 kfree(ps, sizeof *ps);
1114                 BS_UNLOCK(bs);
1115                 return KERN_RESOURCE_SHORTAGE;
1116         }
1117         for (i = 0; i < ps->ps_ncls; i++) {
1118                 clrbit(ps->ps_bmap, i);
1119         }
1120
1121         if(paging_segment_count == 0) {
1122                 ps->ps_state = PS_EMERGENCY_SEGMENT;
1123                 if(use_emergency_swap_file_first) {
1124                         ps->ps_state |= PS_CAN_USE;
1125                 }
1126         } else {
1127                 ps->ps_state = PS_CAN_USE;
1128         }
1129
1130         ps->ps_bs = bs;
1131
1132         if ((error = ps_enter(ps)) != 0) {
1133                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1134                 kfree(ps, sizeof *ps);
1135                 BS_UNLOCK(bs);
1136                 return KERN_RESOURCE_SHORTAGE;
1137         }
1138
1139         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1140         bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1141         BS_UNLOCK(bs);
1142
1143         PSL_LOCK();
1144         if(IS_PS_OK_TO_USE(ps)) {
1145                 dp_pages_free += ps->ps_pgcount;
1146         } else {
1147                 dp_pages_reserve += ps->ps_pgcount;
1148         }
1149         PSL_UNLOCK();
1150
1151         bs_more_space(ps->ps_clcount);
1152
1153         DP_DEBUG(DEBUG_BS_INTERNAL,
1154                  ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1155                   device, offset, count, record_size,
1156                   ps->ps_record_shift, ps->ps_pgnum));
1157
1158         return KERN_SUCCESS;
1159 }
1160
1161 boolean_t
1162 bs_add_device(
1163         char            *dev_name,
1164         MACH_PORT_FACE  master)
1165 {
1166         security_token_t        null_security_token = {
1167                 { 0, 0 }
1168         };
1169         MACH_PORT_FACE  device;
1170         int             info[DEV_GET_SIZE_COUNT];
1171         mach_msg_type_number_t info_count;
1172         MACH_PORT_FACE  bs = MACH_PORT_NULL;
1173         unsigned int    rec_size;
1174         recnum_t        count;
1175         int             clsize;
1176         MACH_PORT_FACE  reply_port;
1177
1178         if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1179                         null_security_token, dev_name, &device))
1180                 return FALSE;
1181
1182         info_count = DEV_GET_SIZE_COUNT;
1183         if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1184                 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1185                 count = info[DEV_GET_SIZE_DEVICE_SIZE] /  rec_size;
1186                 clsize = bs_get_global_clsize(0);
1187                 if (!default_pager_backing_store_create(
1188                                         default_pager_object,
1189                                         DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1190                                         (clsize * vm_page_size),
1191                                         &bs)) {
1192                         if (!default_pager_add_segment(bs, device,
1193                                                        0, count, rec_size)) {
1194                                 return TRUE;
1195                         }
1196                         ipc_port_release_receive(bs);
1197                 }
1198         }
1199
1200         ipc_port_release_send(device);
1201         return FALSE;
1202 }
1203 #endif /* DEVICE_PAGING */
1204
1205 #if     VS_ASYNC_REUSE
1206
1207 struct vs_async *
1208 vs_alloc_async(void)
1209 {
1210         struct vs_async *vsa;
1211         MACH_PORT_FACE  reply_port;
1212 //      kern_return_t   kr;
1213
1214         VS_ASYNC_LOCK();
1215         if (vs_async_free_list == NULL) {
1216                 VS_ASYNC_UNLOCK();
1217                 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1218                 if (vsa != NULL) {
1219                         /*
1220                          * Try allocating a reply port named after the
1221                          * address of the vs_async structure.
1222                          */
1223                         struct vstruct_alias    *alias_struct;
1224
1225                         reply_port = ipc_port_alloc_kernel();
1226                         alias_struct = (struct vstruct_alias *)
1227                                 kalloc(sizeof (struct vstruct_alias));
1228                         if(alias_struct != NULL) {
1229                                 alias_struct->vs = (struct vstruct *)vsa;
1230                                 alias_struct->name = &default_pager_ops;
1231                                 reply_port->alias = (uintptr_t) alias_struct;
1232                                 vsa->reply_port = reply_port;
1233                                 vs_alloc_async_count++;
1234                         }
1235                         else {
1236                                 vs_alloc_async_failed++;
1237                                 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1238                                                                 (reply_port));
1239                                 kfree(vsa, sizeof (struct vs_async));
1240                                 vsa = NULL;
1241                         }
1242                 }
1243         } else {
1244                 vsa = vs_async_free_list;
1245                 vs_async_free_list = vs_async_free_list->vsa_next;
1246                 VS_ASYNC_UNLOCK();
1247         }
1248
1249         return vsa;
1250 }
1251
1252 void
1253 vs_free_async(
1254         struct vs_async *vsa)
1255 {
1256         VS_ASYNC_LOCK();
1257         vsa->vsa_next = vs_async_free_list;
1258         vs_async_free_list = vsa;
1259         VS_ASYNC_UNLOCK();
1260 }
1261
1262 #else   /* VS_ASYNC_REUSE */
1263
1264 struct vs_async *
1265 vs_alloc_async(void)
1266 {
1267         struct vs_async *vsa;
1268         MACH_PORT_FACE  reply_port;
1269         kern_return_t   kr;
1270
1271         vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1272         if (vsa != NULL) {
1273                 /*
1274                  * Try allocating a reply port named after the
1275                  * address of the vs_async structure.
1276                  */
1277                         reply_port = ipc_port_alloc_kernel();
1278                         alias_struct = (vstruct_alias *)
1279                                 kalloc(sizeof (struct vstruct_alias));
1280                         if(alias_struct != NULL) {
1281                                 alias_struct->vs = reply_port;
1282                                 alias_struct->name = &default_pager_ops;
1283                                 reply_port->alias = (int) vsa;
1284                                 vsa->reply_port = reply_port;
1285                                 vs_alloc_async_count++;
1286                         }
1287                         else {
1288                                 vs_alloc_async_failed++;
1289                                 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1290                                                                 (reply_port));
1291                                 kfree(vsa, sizeof (struct vs_async));
1292                                 vsa = NULL;
1293                         }
1294         }
1295
1296         return vsa;
1297 }
1298
1299 void
1300 vs_free_async(
1301         struct vs_async *vsa)
1302 {
1303         MACH_PORT_FACE  reply_port;
1304         kern_return_t   kr;
1305
1306         reply_port = vsa->reply_port;
1307         kfree(reply_port->alias, sizeof (struct vstuct_alias));
1308         kfree(vsa, sizeof (struct vs_async));
1309         ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1310 #if 0
1311         VS_ASYNC_LOCK();
1312         vs_alloc_async_count--;
1313         VS_ASYNC_UNLOCK();
1314 #endif
1315 }
1316
1317 #endif  /* VS_ASYNC_REUSE */
1318
1319 zone_t  vstruct_zone;
1320
1321 vstruct_t
1322 ps_vstruct_create(
1323         dp_size_t size)
1324 {
1325         vstruct_t       vs;
1326         unsigned int    i;
1327
1328         vs = (vstruct_t) zalloc(vstruct_zone);
1329         if (vs == VSTRUCT_NULL) {
1330                 return VSTRUCT_NULL;
1331         }
1332
1333         VS_LOCK_INIT(vs);
1334
1335         /*
1336          * The following fields will be provided later.
1337          */
1338         vs->vs_pager_ops = NULL;
1339         vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1340         vs->vs_references = 1;
1341         vs->vs_seqno = 0;
1342
1343         vs->vs_waiting_seqno = FALSE;
1344         vs->vs_waiting_read = FALSE;
1345         vs->vs_waiting_write = FALSE;
1346         vs->vs_waiting_async = FALSE;
1347
1348         vs->vs_readers = 0;
1349         vs->vs_writers = 0;
1350
1351         vs->vs_errors = 0;
1352
1353         vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1354         vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1355         vs->vs_async_pending = 0;
1356
1357         /*
1358          * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1359          * depending on the size of the memory object.
1360          */
1361         if (INDIRECT_CLMAP(vs->vs_size)) {
1362                 vs->vs_imap = (struct vs_map **)
1363                         kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1364                 vs->vs_indirect = TRUE;
1365         } else {
1366                 vs->vs_dmap = (struct vs_map *)
1367                         kalloc(CLMAP_SIZE(vs->vs_size));
1368                 vs->vs_indirect = FALSE;
1369         }
1370         vs->vs_xfer_pending = FALSE;
1371         DP_DEBUG(DEBUG_VS_INTERNAL,
1372                  ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1373
1374         /*
1375          * Check to see that we got the space.
1376          */
1377         if (!vs->vs_dmap) {
1378                 kfree(vs, sizeof *vs);
1379                 return VSTRUCT_NULL;
1380         }
1381
1382         /*
1383          * Zero the indirect pointers, or clear the direct pointers.
1384          */
1385         if (vs->vs_indirect)
1386                 memset(vs->vs_imap, 0,
1387                        INDIRECT_CLMAP_SIZE(vs->vs_size));
1388         else
1389                 for (i = 0; i < vs->vs_size; i++)
1390                         VSM_CLR(vs->vs_dmap[i]);
1391
1392         VS_MAP_LOCK_INIT(vs);
1393
1394         bs_commit(vs->vs_size);
1395
1396         return vs;
1397 }
1398
1399 paging_segment_t ps_select_segment(unsigned int, int *);        /* forward */
1400
1401 paging_segment_t
1402 ps_select_segment(
1403         unsigned int    shift,
1404         int             *psindex)
1405 {
1406         paging_segment_t        ps;
1407         int                     i;
1408         int                     j;
1409
1410         /*
1411          * Optimize case where there's only one segment.
1412          * paging_segment_max will index the one and only segment.
1413          */
1414
1415         PSL_LOCK();
1416         if (paging_segment_count == 1) {
1417                 paging_segment_t lps = PAGING_SEGMENT_NULL;     /* used to avoid extra PS_UNLOCK */
1418                 ipc_port_t trigger = IP_NULL;
1419
1420                 ps = paging_segments[paging_segment_max];
1421                 *psindex = paging_segment_max;
1422                 PS_LOCK(ps);
1423                 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1424                         panic("Emergency paging segment missing\n");
1425                 }
1426                 ASSERT(ps->ps_clshift >= shift);
1427                 if(IS_PS_OK_TO_USE(ps)) {
1428                         if (ps->ps_clcount) {
1429                                 ps->ps_clcount--;
1430                                 dp_pages_free -=  1 << ps->ps_clshift;
1431                                 ps->ps_pgcount -=  1 << ps->ps_clshift;
1432                                 if(min_pages_trigger_port &&
1433                                   (dp_pages_free < minimum_pages_remaining)) {
1434                                         trigger = min_pages_trigger_port;
1435                                         min_pages_trigger_port = NULL;
1436                                         bs_low = TRUE;
1437                                         backing_store_abort_compaction = TRUE;
1438                                 }
1439                                 lps = ps;
1440                         }
1441                 }
1442                 PS_UNLOCK(ps);
1443
1444                 if( lps == PAGING_SEGMENT_NULL ) {
1445                         if(dp_pages_free) {
1446                                 dp_pages_free_drift_count++;
1447                                 if(dp_pages_free > dp_pages_free_drifted_max) {
1448                                         dp_pages_free_drifted_max = dp_pages_free;
1449                                 }
1450                                 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1451                         }
1452                         dp_pages_free = 0;
1453                 }
1454
1455                 PSL_UNLOCK();
1456
1457                 if (trigger != IP_NULL) {
1458                         dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1459
1460                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1461                         ipc_port_release_send(trigger);
1462                 }
1463                 return lps;
1464         }
1465
1466         if (paging_segment_count == 0) {
1467                 if(dp_pages_free) {
1468                         dp_pages_free_drift_count++;
1469                         if(dp_pages_free > dp_pages_free_drifted_max) {
1470                                 dp_pages_free_drifted_max = dp_pages_free;
1471                         }
1472                         dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1473                 }
1474                 dp_pages_free = 0;
1475                 PSL_UNLOCK();
1476                 return PAGING_SEGMENT_NULL;
1477         }
1478
1479         for (i = BS_MAXPRI;
1480              i >= BS_MINPRI; i--) {
1481                 int start_index;
1482
1483                 if ((ps_select_array[i] == BS_NOPRI) ||
1484                                 (ps_select_array[i] == BS_FULLPRI))
1485                         continue;
1486                 start_index = ps_select_array[i];
1487
1488                 if(!(paging_segments[start_index])) {
1489                         j = start_index+1;
1490                         physical_transfer_cluster_count = 0;
1491                 }
1492                 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1493                                 (((paging_segments[start_index])->ps_clshift)
1494                                 + vm_page_shift))) {
1495                         physical_transfer_cluster_count = 0;
1496                         j = start_index + 1;
1497                 } else {
1498                         physical_transfer_cluster_count+=1;
1499                         j = start_index;
1500                         if(start_index == 0)
1501                                 start_index = paging_segment_max;
1502                         else
1503                                 start_index = start_index - 1;
1504                 }
1505
1506                 while (1) {
1507                         if (j > paging_segment_max)
1508                                 j = 0;
1509                         if ((ps = paging_segments[j]) &&
1510                             (ps->ps_bs->bs_priority == i)) {
1511                                 /*
1512                                  * Force the ps cluster size to be
1513                                  * >= that of the vstruct.
1514                                  */
1515                                 PS_LOCK(ps);
1516                                 if (IS_PS_OK_TO_USE(ps)) {
1517                                         if ((ps->ps_clcount) &&
1518                                                    (ps->ps_clshift >= shift)) {
1519                                                 ipc_port_t trigger = IP_NULL;
1520
1521                                                 ps->ps_clcount--;
1522                                                 dp_pages_free -=  1 << ps->ps_clshift;
1523                                                 ps->ps_pgcount -=  1 << ps->ps_clshift;
1524                                                 if(min_pages_trigger_port &&
1525                                                         (dp_pages_free <
1526                                                         minimum_pages_remaining)) {
1527                                                         trigger = min_pages_trigger_port;
1528                                                         min_pages_trigger_port = NULL;
1529                                                         bs_low = TRUE;
1530                                                         backing_store_abort_compaction = TRUE;
1531                                                 }
1532                                                 PS_UNLOCK(ps);
1533                                                 /*
1534                                                  * found one, quit looking.
1535                                                  */
1536                                                 ps_select_array[i] = j;
1537                                                 PSL_UNLOCK();
1538
1539                                                 if (trigger != IP_NULL) {
1540                                                         dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1541
1542                                                         default_pager_space_alert(
1543                                                                 trigger,
1544                                                                 HI_WAT_ALERT);
1545                                                         ipc_port_release_send(trigger);
1546                                                 }
1547                                                 *psindex = j;
1548                                                 return ps;
1549                                         }
1550                                 }
1551                                 PS_UNLOCK(ps);
1552                         }
1553                         if (j == start_index) {
1554                                 /*
1555                                  * none at this priority -- mark it full
1556                                  */
1557                                 ps_select_array[i] = BS_FULLPRI;
1558                                 break;
1559                         }
1560                         j++;
1561                 }
1562         }
1563
1564         if(dp_pages_free) {
1565                 dp_pages_free_drift_count++;
1566                 if(dp_pages_free > dp_pages_free_drifted_max) {
1567                         dp_pages_free_drifted_max = dp_pages_free;
1568                 }
1569                 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1570         }
1571         dp_pages_free = 0;
1572         PSL_UNLOCK();
1573         return PAGING_SEGMENT_NULL;
1574 }
1575
1576 dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1577
1578 dp_offset_t
1579 ps_allocate_cluster(
1580         vstruct_t               vs,
1581         int                     *psindex,
1582         paging_segment_t        use_ps)
1583 {
1584         unsigned int            byte_num;
1585         int                     bit_num = 0;
1586         paging_segment_t        ps;
1587         dp_offset_t             cluster;
1588         ipc_port_t              trigger = IP_NULL;
1589
1590         /*
1591          * Find best paging segment.
1592          * ps_select_segment will decrement cluster count on ps.
1593          * Must pass cluster shift to find the most appropriate segment.
1594          */
1595         /* NOTE:  The addition of paging segment delete capability threatened
1596          * to seriously complicate the treatment of paging segments in this
1597          * module and the ones that call it (notably ps_clmap), because of the
1598          * difficulty in assuring that the paging segment would continue to
1599          * exist between being unlocked and locked.   This was
1600          * avoided because all calls to this module are based in either
1601          * dp_memory_object calls which rely on the vs lock, or by
1602          * the transfer function which is part of the segment delete path.
1603          * The transfer function which is part of paging segment delete is
1604          * protected from multiple callers by the backing store lock.
1605          * The paging segment delete function treats mappings to a paging
1606          * segment on a vstruct by vstruct basis, locking the vstruct targeted
1607          * while data is transferred to the remaining segments.  This is in
1608          * line with the view that incomplete or in-transition mappings between
1609          * data, a vstruct, and backing store are protected by the vs lock.
1610          * This and the ordering of the paging segment "going_away" bit setting
1611          * protects us.
1612          */
1613 retry:
1614         if (use_ps != PAGING_SEGMENT_NULL) {
1615                 ps = use_ps;
1616                 PSL_LOCK();
1617                 PS_LOCK(ps);
1618
1619                 ASSERT(ps->ps_clcount != 0);
1620
1621                 ps->ps_clcount--;
1622                 dp_pages_free -=  1 << ps->ps_clshift;
1623                 ps->ps_pgcount -=  1 << ps->ps_clshift;
1624                 if(min_pages_trigger_port &&
1625                                 (dp_pages_free < minimum_pages_remaining)) {
1626                         trigger = min_pages_trigger_port;
1627                         min_pages_trigger_port = NULL;
1628                         bs_low = TRUE;
1629                         backing_store_abort_compaction = TRUE;
1630                 }
1631                 PSL_UNLOCK();
1632                 PS_UNLOCK(ps);
1633                 if (trigger != IP_NULL) {
1634                         dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1635
1636                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1637                         ipc_port_release_send(trigger);
1638                 }
1639
1640         } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1641                    PAGING_SEGMENT_NULL) {
1642                 static clock_sec_t lastnotify = 0;
1643                 clock_sec_t now;
1644                 clock_nsec_t nanoseconds_dummy;
1645
1646                 /*
1647                  * Don't immediately jump to the emergency segment. Give the
1648                  * dynamic pager a chance to create it's first normal swap file.
1649                  * Unless, of course the very first normal swap file can't be
1650                  * created due to some problem and we didn't expect that problem
1651                  * i.e. use_emergency_swap_file_first was never set to true initially.
1652                  * It then gets set in the swap file creation error handling.
1653                  */
1654                 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1655
1656                         ps = paging_segments[EMERGENCY_PSEG_INDEX];
1657                         if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1658                                 PSL_LOCK();
1659                                 PS_LOCK(ps);
1660
1661                                 if(IS_PS_GOING_AWAY(ps)) {
1662                                         /* Someone de-activated the emergency paging segment*/
1663                                         PS_UNLOCK(ps);
1664                                         PSL_UNLOCK();
1665
1666                                 } else if(dp_pages_free) {
1667                                         /*
1668                                          * Someone has already activated the emergency paging segment
1669                                          * OR
1670                                          * Between us having rec'd a NULL segment from ps_select_segment
1671                                          * and reaching here a new normal segment could have been added.
1672                                          * E.g. we get NULL segment and another thread just added the
1673                                          * new swap file. Hence check to see if we have more dp_pages_free
1674                                          * before activating the emergency segment.
1675                                          */
1676                                         PS_UNLOCK(ps);
1677                                         PSL_UNLOCK();
1678                                         goto retry;
1679
1680                                 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1681                                         /*
1682                                          * PS_CAN_USE is only reset from the emergency segment when it's
1683                                          * been successfully recovered. So it's legal to have an emergency
1684                                          * segment that has PS_CAN_USE but no clusters because it's recovery
1685                                          * failed.
1686                                          */
1687                                         backing_store_t bs = ps->ps_bs;
1688                                         ps->ps_state |= PS_CAN_USE;
1689                                         if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1690                                                 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1691                                                 ps_select_array[bs->bs_priority] = 0;
1692                                         }
1693                                         dp_pages_free += ps->ps_pgcount;
1694                                         dp_pages_reserve -= ps->ps_pgcount;
1695                                         PS_UNLOCK(ps);
1696                                         PSL_UNLOCK();
1697                                         dprintf(("Switching ON Emergency paging segment\n"));
1698                                         goto retry;
1699                                 }
1700
1701                                 PS_UNLOCK(ps);
1702                                 PSL_UNLOCK();
1703                         }
1704                 }
1705
1706                 /*
1707                  * Emit a notification of the low-paging resource condition
1708                  * but don't issue it more than once every five seconds.  This
1709                  * prevents us from overflowing logs with thousands of
1710                  * repetitions of the message.
1711                  */
1712                 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1713                 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1714                         /* With an activated emergency paging segment we still
1715                          * didn't get any clusters. This could mean that the
1716                          * emergency paging segment is exhausted.
1717                          */
1718                         dprintf(("System is out of paging space.\n"));
1719                         lastnotify = now;
1720                 }
1721
1722                 PSL_LOCK();
1723
1724                 if(min_pages_trigger_port) {
1725                         trigger = min_pages_trigger_port;
1726                         min_pages_trigger_port = NULL;
1727                         bs_low = TRUE;
1728                         backing_store_abort_compaction = TRUE;
1729                 }
1730                 PSL_UNLOCK();
1731                 if (trigger != IP_NULL) {
1732                         dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1733
1734                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1735                         ipc_port_release_send(trigger);
1736                 }
1737                 return (dp_offset_t) -1;
1738         }
1739
1740         /*
1741          * Look for an available cluster.  At the end of the loop,
1742          * byte_num is the byte offset and bit_num is the bit offset of the
1743          * first zero bit in the paging segment bitmap.
1744          */
1745         PS_LOCK(ps);
1746         byte_num = ps->ps_hint;
1747         for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1748                 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1749                         for (bit_num = 0; bit_num < NBBY; bit_num++) {
1750                                 if (isclr((ps->ps_bmap + byte_num), bit_num))
1751                                         break;
1752                         }
1753                         ASSERT(bit_num != NBBY);
1754                         break;
1755                 }
1756         }
1757         ps->ps_hint = byte_num;
1758         cluster = (byte_num*NBBY) + bit_num;
1759
1760         /* Space was reserved, so this must be true */
1761         ASSERT(cluster < ps->ps_ncls);
1762
1763         setbit(ps->ps_bmap, cluster);
1764         PS_UNLOCK(ps);
1765
1766         return cluster;
1767 }
1768
1769 void ps_deallocate_cluster(paging_segment_t, dp_offset_t);      /* forward */
1770
1771 void
1772 ps_deallocate_cluster(
1773         paging_segment_t        ps,
1774         dp_offset_t             cluster)
1775 {
1776
1777         if (cluster >= ps->ps_ncls)
1778                 panic("ps_deallocate_cluster: Invalid cluster number");
1779
1780         /*
1781          * Lock the paging segment, clear the cluster's bitmap and increment the
1782          * number of free cluster.
1783          */
1784         PSL_LOCK();
1785         PS_LOCK(ps);
1786         clrbit(ps->ps_bmap, cluster);
1787         if( IS_PS_OK_TO_USE(ps)) {
1788                 ++ps->ps_clcount;
1789                 ps->ps_pgcount +=  1 << ps->ps_clshift;
1790                 dp_pages_free +=  1 << ps->ps_clshift;
1791         } else {
1792                 ps->ps_special_clusters += 1;
1793         }
1794
1795         /*
1796          * Move the hint down to the freed cluster if it is
1797          * less than the current hint.
1798          */
1799         if ((cluster/NBBY) < ps->ps_hint) {
1800                 ps->ps_hint = (cluster/NBBY);
1801         }
1802
1803
1804         /*
1805          * If we're freeing space on a full priority, reset the array.
1806          */
1807         if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1808                 ps_select_array[ps->ps_bs->bs_priority] = 0;
1809         PS_UNLOCK(ps);
1810         PSL_UNLOCK();
1811
1812         return;
1813 }
1814
1815 void ps_dealloc_vsmap(struct vs_map *, dp_size_t);      /* forward */
1816
1817 void
1818 ps_dealloc_vsmap(
1819         struct vs_map   *vsmap,
1820         dp_size_t       size)
1821 {
1822         unsigned int i;
1823         struct ps_vnode_trim_data trim_data;
1824
1825         ps_vnode_trim_init(&trim_data);
1826
1827         for (i = 0; i < size; i++) {
1828                 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
1829                         ps_vnode_trim_more(&trim_data,
1830                                               &vsmap[i],
1831                                               VSM_PS(vsmap[i])->ps_clshift,
1832                                               vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
1833                         ps_deallocate_cluster(VSM_PS(vsmap[i]),
1834                                               VSM_CLOFF(vsmap[i]));
1835                 } else {
1836                         ps_vnode_trim_now(&trim_data);
1837                 }
1838         }
1839         ps_vnode_trim_now(&trim_data);
1840 }
1841
1842 void
1843 ps_vstruct_dealloc(
1844         vstruct_t vs)
1845 {
1846         unsigned int    i;
1847 //      spl_t   s;
1848
1849         VS_MAP_LOCK(vs);
1850
1851         /*
1852          * If this is an indirect structure, then we walk through the valid
1853          * (non-zero) indirect pointers and deallocate the clusters
1854          * associated with each used map entry (via ps_dealloc_vsmap).
1855          * When all of the clusters in an indirect block have been
1856          * freed, we deallocate the block.  When all of the indirect
1857          * blocks have been deallocated we deallocate the memory
1858          * holding the indirect pointers.
1859          */
1860         if (vs->vs_indirect) {
1861                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1862                         if (vs->vs_imap[i] != NULL) {
1863                                 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1864                                 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1865                         }
1866                 }
1867                 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1868         } else {
1869                 /*
1870                  * Direct map.  Free used clusters, then memory.
1871                  */
1872                 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1873                 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1874         }
1875         VS_MAP_UNLOCK(vs);
1876
1877         bs_commit(- vs->vs_size);
1878
1879         zfree(vstruct_zone, vs);
1880 }
1881
1882 void
1883 ps_vstruct_reclaim(
1884         vstruct_t vs,
1885         boolean_t return_to_vm,
1886         boolean_t reclaim_backing_store)
1887 {
1888         unsigned int    i, j;
1889 //      spl_t   s;
1890         unsigned int    request_flags;
1891         struct vs_map   *vsmap;
1892         boolean_t       vsmap_all_clear, vsimap_all_clear;
1893         struct vm_object_fault_info fault_info;
1894         int             clmap_off;
1895         unsigned int    vsmap_size;
1896         kern_return_t   kr;
1897
1898         request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
1899         if (reclaim_backing_store) {
1900 #if USE_PRECIOUS
1901                 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
1902 #else   /* USE_PRECIOUS */
1903                 request_flags |= UPL_REQUEST_SET_DIRTY;
1904 #endif  /* USE_PRECIOUS */
1905         }
1906
1907         VS_MAP_LOCK(vs);
1908
1909         fault_info.cluster_size = VM_SUPER_CLUSTER;
1910         fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
1911         fault_info.user_tag = 0;
1912         fault_info.lo_offset = 0;
1913         fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
1914         fault_info.io_sync = reclaim_backing_store;
1915
1916         /*
1917          * If this is an indirect structure, then we walk through the valid
1918          * (non-zero) indirect pointers and deallocate the clusters
1919          * associated with each used map entry (via ps_dealloc_vsmap).
1920          * When all of the clusters in an indirect block have been
1921          * freed, we deallocate the block.  When all of the indirect
1922          * blocks have been deallocated we deallocate the memory
1923          * holding the indirect pointers.
1924          */
1925         if (vs->vs_indirect) {
1926                 vsimap_all_clear = TRUE;
1927                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1928                         vsmap = vs->vs_imap[i];
1929                         if (vsmap == NULL)
1930                                 continue;
1931                         /* loop on clusters in this indirect map */
1932                         clmap_off = (vm_page_size * CLMAP_ENTRIES *
1933                                      VSCLSIZE(vs) * i);
1934                         if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
1935                                 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
1936                         else
1937                                 vsmap_size = CLMAP_ENTRIES;
1938                         vsmap_all_clear = TRUE;
1939                         if (return_to_vm) {
1940                                 for (j = 0; j < vsmap_size;) {
1941                                         if (VSM_ISCLR(vsmap[j]) ||
1942                                             VSM_ISERR(vsmap[j])) {
1943                                                 j++;
1944                                                 clmap_off += vm_page_size * VSCLSIZE(vs);
1945                                                 continue;
1946                                         }
1947                                         VS_MAP_UNLOCK(vs);
1948                                         kr = pvs_cluster_read(
1949                                                 vs,
1950                                                 clmap_off,
1951                                                 (dp_size_t) -1, /* read whole cluster */
1952                                                 &fault_info);
1953                                         VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1954                                         if (kr != KERN_SUCCESS) {
1955                                                 vsmap_all_clear = FALSE;
1956                                                 vsimap_all_clear = FALSE;
1957                                         }
1958                                 }
1959                         }
1960                         if (vsmap_all_clear) {
1961                                 ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
1962                                 kfree(vsmap, CLMAP_THRESHOLD);
1963                                 vs->vs_imap[i] = NULL;
1964                         }
1965                 }
1966                 if (vsimap_all_clear) {
1967 //                      kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1968                 }
1969         } else {
1970                 /*
1971                  * Direct map.  Free used clusters, then memory.
1972                  */
1973                 vsmap = vs->vs_dmap;
1974                 if (vsmap == NULL) {
1975                         goto out;
1976                 }
1977                 vsmap_all_clear = TRUE;
1978                 /* loop on clusters in the direct map */
1979                 if (return_to_vm) {
1980                         for (j = 0; j < vs->vs_size;) {
1981                                 if (VSM_ISCLR(vsmap[j]) ||
1982                                     VSM_ISERR(vsmap[j])) {
1983                                         j++;
1984                                         continue;
1985                                 }
1986                                 clmap_off = vm_page_size * (j << vs->vs_clshift);
1987                                 VS_MAP_UNLOCK(vs);
1988                                 kr = pvs_cluster_read(
1989                                         vs,
1990                                         clmap_off,
1991                                         (dp_size_t) -1, /* read whole cluster */
1992                                         &fault_info);
1993                                 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1994                                 if (kr != KERN_SUCCESS) {
1995                                         vsmap_all_clear = FALSE;
1996                                 } else {
1997 //                                      VSM_CLR(vsmap[j]);
1998                                 }
1999                         }
2000                 }
2001                 if (vsmap_all_clear) {
2002                         ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
2003 //                      kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
2004                 }
2005         }
2006 out:
2007         VS_MAP_UNLOCK(vs);
2008 }
2009
2010 int ps_map_extend(vstruct_t, unsigned int);     /* forward */
2011
2012 int ps_map_extend(
2013         vstruct_t       vs,
2014         unsigned int    new_size)
2015 {
2016         struct vs_map   **new_imap;
2017         struct vs_map   *new_dmap = NULL;
2018         int             newdsize;
2019         int             i;
2020         void            *old_map = NULL;
2021         int             old_map_size = 0;
2022
2023         if (vs->vs_size >= new_size) {
2024                 /*
2025                  * Someone has already done the work.
2026                  */
2027                 return 0;
2028         }
2029
2030         /*
2031          * If the new size extends into the indirect range, then we have one
2032          * of two cases: we are going from indirect to indirect, or we are
2033          * going from direct to indirect.  If we are going from indirect to
2034          * indirect, then it is possible that the new size will fit in the old
2035          * indirect map.  If this is the case, then just reset the size of the
2036          * vstruct map and we are done.  If the new size will not
2037          * fit into the old indirect map, then we have to allocate a new
2038          * indirect map and copy the old map pointers into this new map.
2039          *
2040          * If we are going from direct to indirect, then we have to allocate a
2041          * new indirect map and copy the old direct pages into the first
2042          * indirect page of the new map.
2043          * NOTE: allocating memory here is dangerous, as we're in the
2044          * pageout path.
2045          */
2046         if (INDIRECT_CLMAP(new_size)) {
2047                 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
2048
2049                 /*
2050                  * Get a new indirect map and zero it.
2051                  */
2052                 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
2053                 if (vs->vs_indirect &&
2054                     (new_map_size == old_map_size)) {
2055                         bs_commit(new_size - vs->vs_size);
2056                         vs->vs_size = new_size;
2057                         return 0;
2058                 }
2059
2060                 new_imap = (struct vs_map **)kalloc(new_map_size);
2061                 if (new_imap == NULL) {
2062                         return -1;
2063                 }
2064                 memset(new_imap, 0, new_map_size);
2065
2066                 if (vs->vs_indirect) {
2067                         /* Copy old entries into new map */
2068                         memcpy(new_imap, vs->vs_imap, old_map_size);
2069                         /* Arrange to free the old map */
2070                         old_map = (void *) vs->vs_imap;
2071                         newdsize = 0;
2072                 } else {        /* Old map was a direct map */
2073                         /* Allocate an indirect page */
2074                         if ((new_imap[0] = (struct vs_map *)
2075                              kalloc(CLMAP_THRESHOLD)) == NULL) {
2076                                 kfree(new_imap, new_map_size);
2077                                 return -1;
2078                         }
2079                         new_dmap = new_imap[0];
2080                         newdsize = CLMAP_ENTRIES;
2081                 }
2082         } else {
2083                 new_imap = NULL;
2084                 newdsize = new_size;
2085                 /*
2086                  * If the new map is a direct map, then the old map must
2087                  * also have been a direct map.  All we have to do is
2088                  * to allocate a new direct map, copy the old entries
2089                  * into it and free the old map.
2090                  */
2091                 if ((new_dmap = (struct vs_map *)
2092                      kalloc(CLMAP_SIZE(new_size))) == NULL) {
2093                         return -1;
2094                 }
2095         }
2096         if (newdsize) {
2097
2098                 /* Free the old map */
2099                 old_map = (void *) vs->vs_dmap;
2100                 old_map_size = CLMAP_SIZE(vs->vs_size);
2101
2102                 /* Copy info from the old map into the new map */
2103                 memcpy(new_dmap, vs->vs_dmap, old_map_size);
2104
2105                 /* Initialize the rest of the new map */
2106                 for (i = vs->vs_size; i < newdsize; i++)
2107                         VSM_CLR(new_dmap[i]);
2108         }
2109         if (new_imap) {
2110                 vs->vs_imap = new_imap;
2111                 vs->vs_indirect = TRUE;
2112         } else
2113                 vs->vs_dmap = new_dmap;
2114         bs_commit(new_size - vs->vs_size);
2115         vs->vs_size = new_size;
2116         if (old_map)
2117                 kfree(old_map, old_map_size);
2118         return 0;
2119 }
2120
2121 dp_offset_t
2122 ps_clmap(
2123         vstruct_t       vs,
2124         dp_offset_t     offset,
2125         struct clmap    *clmap,
2126         int             flag,
2127         dp_size_t       size,
2128         int             error)
2129 {
2130         dp_offset_t     cluster;        /* The cluster of offset.       */
2131         dp_offset_t     newcl;          /* The new cluster allocated.   */
2132         dp_offset_t     newoff;
2133         unsigned int    i;
2134         struct vs_map   *vsmap;
2135
2136         VS_MAP_LOCK(vs);
2137
2138         ASSERT(vs->vs_dmap);
2139         cluster = atop_32(offset) >> vs->vs_clshift;
2140
2141         /*
2142          * Initialize cluster error value
2143          */
2144         clmap->cl_error = 0;
2145
2146         /*
2147          * If the object has grown, extend the page map.
2148          */
2149         if (cluster >= vs->vs_size) {
2150                 if (flag == CL_FIND) {
2151                         /* Do not allocate if just doing a lookup */
2152                         VS_MAP_UNLOCK(vs);
2153                         return (dp_offset_t) -1;
2154                 }
2155                 if (ps_map_extend(vs, cluster + 1)) {
2156                         VS_MAP_UNLOCK(vs);
2157                         return (dp_offset_t) -1;
2158                 }
2159         }
2160
2161         /*
2162          * Look for the desired cluster.  If the map is indirect, then we
2163          * have a two level lookup.  First find the indirect block, then
2164          * find the actual cluster.  If the indirect block has not yet
2165          * been allocated, then do so.  If the cluster has not yet been
2166          * allocated, then do so.
2167          *
2168          * If any of the allocations fail, then return an error.
2169          * Don't allocate if just doing a lookup.
2170          */
2171         if (vs->vs_indirect) {
2172                 long    ind_block = cluster/CLMAP_ENTRIES;
2173
2174                 /* Is the indirect block allocated? */
2175                 vsmap = vs->vs_imap[ind_block];
2176                 if (vsmap == NULL) {
2177                         if (flag == CL_FIND) {
2178                                 VS_MAP_UNLOCK(vs);
2179                                 return (dp_offset_t) -1;
2180                         }
2181
2182                         /* Allocate the indirect block */
2183                         vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
2184                         if (vsmap == NULL) {
2185                                 VS_MAP_UNLOCK(vs);
2186                                 return (dp_offset_t) -1;
2187                         }
2188                         /* Initialize the cluster offsets */
2189                         for (i = 0; i < CLMAP_ENTRIES; i++)
2190                                 VSM_CLR(vsmap[i]);
2191                         vs->vs_imap[ind_block] = vsmap;
2192                 }
2193         } else
2194                 vsmap = vs->vs_dmap;
2195
2196         ASSERT(vsmap);
2197         vsmap += cluster%CLMAP_ENTRIES;
2198
2199         /*
2200          * At this point, vsmap points to the struct vs_map desired.
2201          *
2202          * Look in the map for the cluster, if there was an error on a
2203          * previous write, flag it and return.  If it is not yet
2204          * allocated, then allocate it, if we're writing; if we're
2205          * doing a lookup and the cluster's not allocated, return error.
2206          */
2207         if (VSM_ISERR(*vsmap)) {
2208                 clmap->cl_error = VSM_GETERR(*vsmap);
2209                 VS_MAP_UNLOCK(vs);
2210                 return (dp_offset_t) -1;
2211         } else if (VSM_ISCLR(*vsmap)) {
2212                 int psindex;
2213
2214                 if (flag == CL_FIND) {
2215                         /*
2216                          * If there's an error and the entry is clear, then
2217                          * we've run out of swap space.  Record the error
2218                          * here and return.
2219                          */
2220                         if (error) {
2221                                 VSM_SETERR(*vsmap, error);
2222                         }
2223                         VS_MAP_UNLOCK(vs);
2224                         return (dp_offset_t) -1;
2225                 } else {
2226                         /*
2227                          * Attempt to allocate a cluster from the paging segment
2228                          */
2229                         newcl = ps_allocate_cluster(vs, &psindex,
2230                                                     PAGING_SEGMENT_NULL);
2231                         if (newcl == (dp_offset_t) -1) {
2232                                 VS_MAP_UNLOCK(vs);
2233                                 return (dp_offset_t) -1;
2234                         }
2235                         VSM_CLR(*vsmap);
2236                         VSM_SETCLOFF(*vsmap, newcl);
2237                         VSM_SETPS(*vsmap, psindex);
2238                 }
2239         } else
2240                 newcl = VSM_CLOFF(*vsmap);
2241
2242         /*
2243          * Fill in pertinent fields of the clmap
2244          */
2245         clmap->cl_ps = VSM_PS(*vsmap);
2246         clmap->cl_numpages = VSCLSIZE(vs);
2247         clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2248
2249         /*
2250          * Byte offset in paging segment is byte offset to cluster plus
2251          * byte offset within cluster.  It looks ugly, but should be
2252          * relatively quick.
2253          */
2254         ASSERT(trunc_page(offset) == offset);
2255         newcl = ptoa_32(newcl) << vs->vs_clshift;
2256         newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2257         if (flag == CL_ALLOC) {
2258                 /*
2259                  * set bits in the allocation bitmap according to which
2260                  * pages were requested.  size is in bytes.
2261                  */
2262                 i = atop_32(newoff);
2263                 while ((size > 0) && (i < VSCLSIZE(vs))) {
2264                         VSM_SETALLOC(*vsmap, i);
2265                         i++;
2266                         size -= vm_page_size;
2267                 }
2268         }
2269         clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2270         if (newoff) {
2271                 /*
2272                  * Offset is not cluster aligned, so number of pages
2273                  * and bitmaps must be adjusted
2274                  */
2275                 clmap->cl_numpages -= atop_32(newoff);
2276                 CLMAP_SHIFT(clmap, vs);
2277                 CLMAP_SHIFTALLOC(clmap, vs);
2278         }
2279
2280         /*
2281          *
2282          * The setting of valid bits and handling of write errors
2283          * must be done here, while we hold the lock on the map.
2284          * It logically should be done in ps_vs_write_complete().
2285          * The size and error information has been passed from
2286          * ps_vs_write_complete().  If the size parameter is non-zero,
2287          * then there is work to be done.  If error is also non-zero,
2288          * then the error number is recorded in the cluster and the
2289          * entire cluster is in error.
2290          */
2291         if (size && flag == CL_FIND) {
2292                 dp_offset_t off = (dp_offset_t) 0;
2293
2294                 if (!error) {
2295                         for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2296                              i++) {
2297                                 VSM_SETPG(*vsmap, i);
2298                                 size -= vm_page_size;
2299                         }
2300                         ASSERT(i <= VSCLSIZE(vs));
2301                 } else {
2302                         BS_STAT(clmap->cl_ps->ps_bs,
2303                                 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
2304                                         atop_32(size));
2305                         off = VSM_CLOFF(*vsmap);
2306                         VSM_SETERR(*vsmap, error);
2307                 }
2308                 /*
2309                  * Deallocate cluster if error, and no valid pages
2310                  * already present.
2311                  */
2312                 if (off != (dp_offset_t) 0)
2313                         ps_deallocate_cluster(clmap->cl_ps, off);
2314                 VS_MAP_UNLOCK(vs);
2315                 return (dp_offset_t) 0;
2316         } else
2317                 VS_MAP_UNLOCK(vs);
2318
2319         DP_DEBUG(DEBUG_VS_INTERNAL,
2320                  ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2321                   newcl+newoff, (int) vs, (int) vsmap, flag));
2322         DP_DEBUG(DEBUG_VS_INTERNAL,
2323                  ("     clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2324                   (int) clmap->cl_ps, clmap->cl_numpages,
2325                   (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
2326
2327         return (newcl + newoff);
2328 }
2329
2330 void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t);     /* forward */
2331
     /*
      * ps_clunmap:
      *
      * Walk the byte range [offset, offset + length) of vstruct "vs" one
      * page (vm_page_size) at a time, holding the vstruct map lock, and
      * apply VSM_CLRPG and VSM_CLRALLOC to every page index visited.
      * When VSM_BMAP() of a map entry becomes zero, the cluster is handed
      * to ps_vnode_trim_more(), released with ps_deallocate_cluster() and
      * the entry is reset with VSM_CLR().
      *
      *      vs      vstruct whose cluster map is edited.
      *      offset  byte offset of the first page to unmap.
      *      length  number of bytes to unmap.
      *
      * Returns nothing.  The walk stops early, leaving the rest of the
      * range untouched, if the map block covering the current cluster is
      * NULL.
      *
      * NOTE(review): "length" is only ever reduced by vm_page_size, so the
      * loops appear to assume it is a multiple of the page size -- confirm
      * against callers.
      */
2332 void
2333 ps_clunmap(
2334         vstruct_t       vs,
2335         dp_offset_t     offset,
2336         dp_size_t       length)
2337 {
2338         dp_offset_t             cluster; /* The cluster number of offset */
2339         struct vs_map           *vsmap;
2340         struct ps_vnode_trim_data trim_data;
2341
2342         ps_vnode_trim_init(&trim_data);
2343
2344         VS_MAP_LOCK(vs);
2345
2346         /*
2347          * Loop through all clusters in this range, freeing paging segment
2348          * clusters and map entries as encountered.
2349          */
2350         while (length > 0) {
2351                 dp_offset_t     newoff;
2352                 unsigned int    i;
2353
                     /* page number of offset, scaled down to a cluster number */
2354                 cluster = atop_32(offset) >> vs->vs_clshift;
2355                 if (vs->vs_indirect)    /* indirect map */
2356                         vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2357                 else
2358                         vsmap = vs->vs_dmap;
2359                 if (vsmap == NULL) {
                             /* no map block here: pass on pending trim data and give up */
2360                         ps_vnode_trim_now(&trim_data);
2361                         VS_MAP_UNLOCK(vs);
2362                         return;
2363                 }
2364                 vsmap += cluster%CLMAP_ENTRIES;
2365                 if (VSM_ISCLR(*vsmap)) {
                             /*
                              * Entry is already clear; step over a single page
                              * (not the whole cluster) and re-evaluate.
                              */
2366                         ps_vnode_trim_now(&trim_data);
2367                         length -= vm_page_size;
2368                         offset += vm_page_size;
2369                         continue;
2370                 }
2371                 /*
2372                  * We've got a valid mapping.  Clear it and deallocate
2373                  * paging segment cluster pages.
2374                  * Optimize for entire cluster clearing.
2375                  */
2376                 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2377                         /*
2378                          * Not cluster aligned.
2379                          */
2380                         ASSERT(trunc_page(newoff) == newoff);
2381                         i = atop_32(newoff);
2382                 } else
2383                         i = 0;
                     /* clear page bits from index i to the end of the cluster or range */
2384                 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2385                         VSM_CLRPG(*vsmap, i);
2386                         VSM_CLRALLOC(*vsmap, i);
2387                         length -= vm_page_size;
2388                         offset += vm_page_size;
2389                         i++;
2390                 }
2391
2392                 /*
2393                  * If map entry is empty, clear and deallocate cluster.
2394                  */
2395                 if (!VSM_BMAP(*vsmap)) {
2396                         ps_vnode_trim_more(&trim_data,
2397                                               vsmap,
2398                                               vs->vs_clshift,
2399                                               VSCLSIZE(vs) * vm_page_size);
2400                         ps_deallocate_cluster(VSM_PS(*vsmap),
2401                                               VSM_CLOFF(*vsmap));
2402                         VSM_CLR(*vsmap);
2403                 } else {
2404                         ps_vnode_trim_now(&trim_data);
2405                 }
2406         }
             /* pass on whatever trim data is still pending before unlocking */
2407         ps_vnode_trim_now(&trim_data);
2408
2409         VS_MAP_UNLOCK(vs);
2410 }
2411
2412 void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
2413
     /*
      * ps_vs_write_complete:
      *
      * Report the outcome of a write of "size" bytes at "offset" in "vs"
      * to the cluster map.  All of the work is delegated to ps_clmap(),
      * called with CL_FIND plus the size and error of the write; the
      * value ps_clmap() returns is deliberately discarded.
      *
      *      vs      vstruct that was written.
      *      offset  byte offset of the write within the vstruct.
      *      size    number of bytes written.
      *      error   0 on success, otherwise the error to record.
      */
2414 void
2415 ps_vs_write_complete(
2416         vstruct_t       vs,
2417         dp_offset_t     offset,
2418         dp_size_t       size,
2419         int             error)
2420 {
2421         struct clmap    clmap;
2422
2423         /*
2424          * Get the struct vsmap for this cluster.
2425          * Use a lookup (CL_FIND), even though it was written, because
2426          * the cluster MUST be present, unless there was an error
2427          * in the original ps_clmap (e.g. no space), in which
2428          * case, nothing happens.
2429          *
2430          * Must pass enough information to ps_clmap to allow it
2431          * to set the vs_map structure bitmap under lock.
2432          */
2433         (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2434 }
2435
2436 void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int);    /* forward */
2437
     /*
      * vs_cl_write_complete:
      *
      * Completion handling for a cluster write.  A failed write is
      * logged with dprintf; a successful one adds atop_32(size) to
      * global_stats.gs_pages_out (through GSTAT).  In both cases
      * ps_vs_write_complete() is then called, and for asynchronous
      * writes "size" is subtracted from vs->vs_async_pending under
      * VS_LOCK, waking any thread waiting on &vs->vs_async_pending once
      * the count reaches zero.
      *
      *      vs      vstruct the write belonged to.
      *      ps      unused.
      *      offset  byte offset of the write within the vstruct.
      *      addr    unused.
      *      size    number of bytes in the write.
      *      async   TRUE if the write was counted in vs_async_pending.
      *      error   0 on success, non-zero on failure.
      */
2438 void
2439 vs_cl_write_complete(
2440         vstruct_t                       vs,
2441         __unused paging_segment_t       ps,
2442         dp_offset_t                     offset,
2443         __unused vm_offset_t            addr,
2444         dp_size_t                       size,
2445         boolean_t                       async,
2446         int                             error)
2447 {
2448 //      kern_return_t   kr;
2449
2450         if (error) {
2451                 /*
2452                  * For internal objects, the error is recorded on a
2453                  * per-cluster basis by ps_clmap() which is called
2454                  * by ps_vs_write_complete() below.
2455                  */
2456                 dprintf(("write failed error = 0x%x\n", error));
2457                 /* add upl_abort code here */
2458         } else
2459                 GSTAT(global_stats.gs_pages_out += atop_32(size));
2460         /*
2461          * Notify the vstruct mapping code, so it can do its accounting.
2462          */
2463         ps_vs_write_complete(vs, offset, size, error);
2464
2465         if (async) {
2466                 VS_LOCK(vs);
2467                 ASSERT(vs->vs_async_pending > 0);
                     /* vs_async_pending is reduced by the byte count of this write */
2468                 vs->vs_async_pending -= size;
2469                 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2470                         vs->vs_waiting_async = FALSE;
                             /* the lock is dropped before the wakeup is issued */
2471                         VS_UNLOCK(vs);
2472                         thread_wakeup(&vs->vs_async_pending);
2473                 } else {
2474                         VS_UNLOCK(vs);
2475                 }
2476         }
2477 }
2478
2479 #ifdef DEVICE_PAGING
2480 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2481
     /*
      * device_write_reply:
      *
      * Completion routine for a device write.  The struct vs_async that
      * describes the request is recovered from the reply port's alias.
      * A "successful" write that moved a different number of bytes than
      * vsa_size is converted to KERN_FAILURE.  The result is stored in
      * vsa_error and forwarded to ps_vs_write_complete() (VSA_TRANSFER
      * requests) or vs_cl_write_complete() (all others, as an async
      * write); the vs_async is then released with VS_FREE_ASYNC.
      *
      *      reply_port      port whose alias leads to the vs_async.
      *      device_code     status reported for the write.
      *      bytes_written   number of bytes the device reports written.
      *
      * Always returns KERN_SUCCESS.
      */
2482 kern_return_t
2483 device_write_reply(
2484         MACH_PORT_FACE  reply_port,
2485         kern_return_t   device_code,
2486         io_buf_len_t    bytes_written)
2487 {
2488         struct vs_async *vsa;
2489
2490         vsa = (struct vs_async *)
2491                 ((struct vstruct_alias *)(reply_port->alias))->vs;
2492
             /* a short write is treated as a failed write */
2493         if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2494                 device_code = KERN_FAILURE;
2495         }
2496
2497         vsa->vsa_error = device_code;
2498
2499
2500         ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2501         if(vsa->vsa_flags & VSA_TRANSFER) {
2502                 /* revisit when async disk segments redone */
2503                 if(vsa->vsa_error) {
2504                    /* need to consider error condition.  re-write data or */
2505                    /* throw it away here. */
2506                    vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2507                 }
2508                 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2509                                                 vsa->vsa_size, vsa->vsa_error);
2510         } else {
2511                 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2512                              vsa->vsa_addr, vsa->vsa_size, TRUE,
2513                              vsa->vsa_error);
2514         }
2515         VS_FREE_ASYNC(vsa);
2516
2517         return KERN_SUCCESS;
2518 }
2519
2520 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
     /*
      * device_write_reply_inband:
      *
      * Not supported: panics unconditionally.  None of the arguments are
      * examined; the return statement only satisfies the prototype.
      */
2521 kern_return_t
2522 device_write_reply_inband(
2523         MACH_PORT_FACE          reply_port,
2524         kern_return_t           return_code,
2525         io_buf_len_t            bytes_written)
2526 {
2527         panic("device_write_reply_inband: illegal");
2528         return KERN_SUCCESS;
2529 }
2530
2531 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
     /*
      * device_read_reply:
      *
      * Completion routine for a device read.  Recovers the struct
      * vs_async from the reply port's alias, stores the returned buffer,
      * byte count and status in it, and issues a thread_wakeup().
      *
      *      reply_port      port whose alias leads to the vs_async.
      *      return_code     status of the read, copied to vsa_error.
      *      data            buffer returned by the device, copied to vsa_addr.
      *      dataCnt         byte count, copied to vsa_size.
      *
      * Always returns KERN_SUCCESS.
      *
      * NOTE(review): the wakeup event is &vsa, i.e. the address of this
      * function's own local pointer.  ps_read_device() waits on the
      * address of *its* local "vsa", which is a different address, so the
      * two do not look like they can ever match -- confirm whether the
      * vs_async pointer itself was the intended event.
      */
2532 kern_return_t
2533 device_read_reply(
2534         MACH_PORT_FACE          reply_port,
2535         kern_return_t           return_code,
2536         io_buf_ptr_t            data,
2537         mach_msg_type_number_t  dataCnt)
2538 {
2539         struct vs_async *vsa;
2540         vsa = (struct vs_async *)
2541                 ((struct vstruct_alias *)(reply_port->alias))->vs;
2542         vsa->vsa_addr = (vm_offset_t)data;
2543         vsa->vsa_size = (vm_size_t)dataCnt;
2544         vsa->vsa_error = return_code;
2545         thread_wakeup(&vsa);
2546         return KERN_SUCCESS;
2547 }
2548
2549 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
     /*
      * device_read_reply_inband:
      *
      * Not supported: panics unconditionally.  None of the arguments are
      * examined; the return statement only satisfies the prototype.
      */
2550 kern_return_t
2551 device_read_reply_inband(
2552         MACH_PORT_FACE          reply_port,
2553         kern_return_t           return_code,
2554         io_buf_ptr_inband_t     data,
2555         mach_msg_type_number_t  dataCnt)
2556 {
2557         panic("device_read_reply_inband: illegal");
2558         return KERN_SUCCESS;
2559 }
2560
2561 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
     /*
      * device_read_reply_overwrite:
      *
      * Not supported: panics unconditionally.  None of the arguments are
      * examined; the return statement only satisfies the prototype.
      */
2562 kern_return_t
2563 device_read_reply_overwrite(
2564         MACH_PORT_FACE          reply_port,
2565         kern_return_t           return_code,
2566         io_buf_len_t            bytes_read)
2567 {
2568         panic("device_read_reply_overwrite: illegal\n");
2569         return KERN_SUCCESS;
2570 }
2571
2572 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
     /*
      * device_open_reply:
      *
      * Not supported: panics unconditionally.  None of the arguments are
      * examined; the return statement only satisfies the prototype.
      */
2573 kern_return_t
2574 device_open_reply(
2575         MACH_PORT_FACE          reply_port,
2576         kern_return_t           return_code,
2577         MACH_PORT_FACE          device_port)
2578 {
2579         panic("device_open_reply: illegal\n");
2580         return KERN_SUCCESS;
2581 }
2582
     /*
      * ps_read_device (DEVICE_PAGING variant):
      *
      * Read "size" bytes from the device behind paging segment "ps",
      * starting at the record computed from ps->ps_offset and "offset".
      * Reads are issued with ds_device_read_common() until the request is
      * satisfied, an error occurs, or a read returns zero bytes.  If the
      * first read returns the whole range, the device's buffer is used
      * directly; otherwise each piece is copied into a buffer obtained
      * from get_read_buffer() and the device buffer is vm_deallocate'd.
      * The collected data is finally converted with
      * vm_map_copyin_page_list() and the resulting vm_map_copy_t is
      * returned through *bufferp (NULL when nothing could be returned).
      *
      *      ps          paging segment to read from.
      *      offset      offset within the paging segment.
      *      bufferp     out: vm_map_copy_t holding the data, cast to
      *                  vm_offset_t, or NULL.
      *      size        number of bytes requested.
      *      residualp   out: size minus the number of bytes read.
      *      flags       not referenced by this function.
      *
      * Always returns KERN_SUCCESS; a failed read is only visible to the
      * caller through *residualp and a NULL *bufferp.
      *
      * NOTE(review): several spots below look suspicious and are marked
      * individually; none has been changed.
      */
2583 kern_return_t
2584 ps_read_device(
2585         paging_segment_t        ps,
2586         dp_offset_t             offset,
2587         vm_offset_t             *bufferp,
2588         unsigned int            size,
2589         unsigned int            *residualp,
2590         int                     flags)
2591 {
2592         kern_return_t   kr;
2593         recnum_t        dev_offset;
2594         unsigned int    bytes_wanted;
2595         unsigned int    bytes_read;
2596         unsigned int    total_read;
2597         vm_offset_t     dev_buffer;
2598         vm_offset_t     buf_ptr;
2599         unsigned int    records_read;
2600         struct vs_async *vsa;
2601
2602         device_t        device;
2603         vm_map_copy_t   device_data = NULL;
2604         default_pager_thread_t *dpt = NULL;
2605
2606         device = dev_port_lookup(ps->ps_device);
             /* count this request in the per-size (in pages) read table */
2607         clustered_reads[atop_32(size)]++;
2608
2609         dev_offset = (ps->ps_offset +
2610                       (offset >> (vm_page_shift - ps->ps_record_shift)));
2611         bytes_wanted = size;
2612         total_read = 0;
2613         *bufferp = (vm_offset_t)NULL;
2614
2615         do {
2616                 vsa = VS_ALLOC_ASYNC();
2617                 if (vsa) {
2618                         vsa->vsa_vs = NULL;
2619                         vsa->vsa_addr = 0;
2620                         vsa->vsa_offset = 0;
2621                         vsa->vsa_size = 0;
2622                         vsa->vsa_ps = NULL;
2623                 }
                     /*
                      * NOTE(review): vsa is dereferenced below even when the
                      * check above found it NULL -- confirm whether
                      * VS_ALLOC_ASYNC() can fail here.
                      */
2624                 ip_lock(vsa->reply_port);
2625                 vsa->reply_port->ip_sorights++;
2626                 ip_reference(vsa->reply_port);
2627                 ip_unlock(vsa->reply_port);
2628                 kr = ds_device_read_common(device,
2629                                  vsa->reply_port,
2630                                  (mach_msg_type_name_t)
2631                                         MACH_MSG_TYPE_MOVE_SEND_ONCE,
2632                                  (dev_mode_t) 0,
2633                                  dev_offset,
2634                                  bytes_wanted,
2635                                  (IO_READ | IO_CALL),
2636                                  (io_buf_ptr_t *) &dev_buffer,
2637                                  (mach_msg_type_number_t *) &bytes_read);
2638                 if(kr == MIG_NO_REPLY) {
                             /*
                              * The result will arrive through device_read_reply(),
                              * which fills in the vs_async fields read below.
                              * NOTE(review): the wait event is the address of the
                              * local "vsa", while device_read_reply() wakes the
                              * address of its own local -- verify the events match.
                              */
2639                         assert_wait(&vsa, THREAD_UNINT);
2640                         thread_block(THREAD_CONTINUE_NULL);
2641
2642                         dev_buffer = vsa->vsa_addr;
2643                         bytes_read = (unsigned int)vsa->vsa_size;
2644                         kr = vsa->vsa_error;
2645                 }
2646                 VS_FREE_ASYNC(vsa);
2647                 if (kr != KERN_SUCCESS || bytes_read == 0) {
2648                         break;
2649                 }
2650                 total_read += bytes_read;
2651
2652                 /*
2653                  * If we got the entire range, use the returned dev_buffer.
2654                  */
2655                 if (bytes_read == size) {
2656                         *bufferp = (vm_offset_t)dev_buffer;
2657                         break;
2658                 }
2659
2660 #if 1
2661                 dprintf(("read only %d bytes out of %d\n",
2662                          bytes_read, bytes_wanted));
2663 #endif
                     /* first partial read: check out a read buffer to assemble into */
2664                 if(dpt == NULL) {
2665                         dpt = get_read_buffer();
2666                         buf_ptr = dpt->dpt_buffer;
2667                         *bufferp = (vm_offset_t)buf_ptr;
2668                 }
2669                 /*
2670                  * Otherwise, copy the data into the provided buffer (*bufferp)
2671                  * and append the rest of the range as it comes in.
2672                  */
2673                 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2674                 buf_ptr += bytes_read;
2675                 bytes_wanted -= bytes_read;
2676                 records_read = (bytes_read >>
2677                                 (vm_page_shift - ps->ps_record_shift));
2678                 dev_offset += records_read;
2679                 DP_DEBUG(DEBUG_VS_INTERNAL,
2680                          ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2681                           dev_buffer, bytes_read));
2682                 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2683                     != KERN_SUCCESS)
2684                         Panic("dealloc buf");
2685         } while (bytes_wanted);
2686
2687         *residualp = size - total_read;
             /*
              * Data was assembled in the read buffer: copy it to freshly
              * allocated memory and turn that into a vm_map_copy_t.
              */
2688         if((dev_buffer != *bufferp) && (total_read != 0)) {
2689                 vm_offset_t temp_buffer;
                     /* NOTE(review): the result of vm_allocate() is not checked */
2690                 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2691                 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2692                 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2693                         VM_MAP_COPYIN_OPT_SRC_DESTROY |
2694                         VM_MAP_COPYIN_OPT_STEAL_PAGES |
2695                         VM_MAP_COPYIN_OPT_PMAP_ENTER,
2696                         (vm_map_copy_t *)&device_data, FALSE))
2697                                 panic("ps_read_device: cannot copyin locally provided buffer\n");
2698         }
             /* the single device buffer holds everything: convert it directly */
2699         else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2700                 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2701                         VM_MAP_COPYIN_OPT_SRC_DESTROY |
2702                         VM_MAP_COPYIN_OPT_STEAL_PAGES |
2703                         VM_MAP_COPYIN_OPT_PMAP_ENTER,
2704                         (vm_map_copy_t *)&device_data, FALSE))
2705                                 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2706         }
2707         else {
2708                 device_data = NULL;
2709         }
2710         *bufferp = (vm_offset_t)device_data;
2711
2712         if(dpt != NULL) {
2713                 /* Free the receive buffer */
2714                 dpt->checked_out = 0;
2715                 thread_wakeup(&dpt_array);
2716         }
2717         return KERN_SUCCESS;
2718 }
2719
     /*
      * ps_write_device (DEVICE_PAGING variant):
      *
      * Write "size" bytes starting at "addr" to the device behind paging
      * segment "ps", at the record computed from ps->ps_offset and
      * "offset".
      *
      *      ps      paging segment to write to.
      *      offset  offset within the paging segment.
      *      addr    address of the data to write.
      *      size    number of bytes to write.
      *      vsa     non-NULL: issue one asynchronous write whose completion
      *              is delivered to device_write_reply() through
      *              vsa->reply_port.
      *              NULL: write synchronously, re-issuing the request for
      *              the remaining bytes until everything is written.
      *
      * Returns PAGER_SUCCESS, or PAGER_ERROR if the device rejected the
      * request.  On an asynchronous failure device_write_reply() is called
      * here, and it releases the vs_async.
      */
2720 kern_return_t
2721 ps_write_device(
2722         paging_segment_t        ps,
2723         dp_offset_t             offset,
2724         vm_offset_t             addr,
2725         unsigned int            size,
2726         struct vs_async         *vsa)
2727 {
2728         recnum_t        dev_offset;
2729         io_buf_len_t    bytes_to_write, bytes_written;
2730         recnum_t        records_written;
2731         kern_return_t   kr;
2732         MACH_PORT_FACE  reply_port;
2733
2734
2735
             /* count this request in the per-size (in pages) write table */
2736         clustered_writes[atop_32(size)]++;
2737
2738         dev_offset = (ps->ps_offset +
2739                       (offset >> (vm_page_shift - ps->ps_record_shift)));
2740         bytes_to_write = size;
2741
2742         if (vsa) {
2743                 /*
2744                  * Asynchronous write.
2745                  */
2746                 reply_port = vsa->reply_port;
2747                 ip_lock(reply_port);
2748                 reply_port->ip_sorights++;
2749                 ip_reference(reply_port);
2750                 ip_unlock(reply_port);
2751                 {
2752                 device_t        device;
2753                 device = dev_port_lookup(ps->ps_device);
2754
2755                 vsa->vsa_addr = addr;
2756                 kr=ds_device_write_common(device,
2757                         reply_port,
2758                         (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2759                         (dev_mode_t) 0,
2760                         dev_offset,
2761                         (io_buf_ptr_t)  addr,
2762                         size,
2763                         (IO_WRITE | IO_CALL),
2764                         &bytes_written);
2765                 }
2766                 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2767                         if (verbose)
2768                                 dprintf(("%s0x%x, addr=0x%x,"
2769                                          "size=0x%x,offset=0x%x\n",
2770                                          "device_write_request returned ",
2771                                          kr, addr, size, offset));
2772                         BS_STAT(ps->ps_bs,
2773                                 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2774                         /* do the completion notification to free resources */
2775                         device_write_reply(reply_port, kr, 0);
2776                         return PAGER_ERROR;
2777                 }
2778         } else do {
2779                 /*
2780                  * Synchronous write.
2781                  */
2782                 {
2783                 device_t        device;
2784                 device = dev_port_lookup(ps->ps_device);
                     /*
                      * Ask only for what is still outstanding.  After a
                      * partial write "addr" has already been advanced, so
                      * requesting the original "size" again would run past
                      * the end of the caller's buffer.
                      */
2785                 kr=ds_device_write_common(device,
2786                         IP_NULL, 0,
2787                         (dev_mode_t) 0,
2788                         dev_offset,
2789                         (io_buf_ptr_t)  addr,
2790                         bytes_to_write,
2791                         (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2792                         &bytes_written);
2793                 }
2794                 if (kr != KERN_SUCCESS) {
2795                         dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2796                                  "device_write returned ",
2797                                  kr, addr, size, offset));
2798                         BS_STAT(ps->ps_bs,
2799                                 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2800                         return PAGER_ERROR;
2801                 }
2802                 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2803                         Panic("fragmented write");
2804                 records_written = (bytes_written >>
2805                                    (vm_page_shift - ps->ps_record_shift));
2806                 dev_offset += records_written;
2807 #if 1
2808                 if (bytes_written != bytes_to_write) {
2809                         dprintf(("wrote only %d bytes out of %d\n",
2810                                  bytes_written, bytes_to_write));
2811                 }
2812 #endif
                     /* advance past what was written and retry the remainder */
2813                 bytes_to_write -= bytes_written;
2814                 addr += bytes_written;
2815         } while (bytes_to_write > 0);
2816
2817         return PAGER_SUCCESS;
2818 }
2819
2820
2821 #else /* !DEVICE_PAGING */
2822
     /*
      * ps_read_device (variant built when DEVICE_PAGING is not defined):
      *
      * Device paging is not available in this configuration, so the
      * function panics unconditionally.  Every parameter is unused; the
      * return statement only satisfies the prototype.
      */
2823 kern_return_t
2824 ps_read_device(
2825         __unused paging_segment_t       ps,
2826         __unused dp_offset_t            offset,
2827         __unused vm_offset_t            *bufferp,
2828         __unused unsigned int           size,
2829         __unused unsigned int           *residualp,
2830         __unused int                            flags)
2831 {
2832   panic("ps_read_device not supported");
2833   return KERN_FAILURE;
2834 }
2835
     /*
      * ps_write_device (variant built when DEVICE_PAGING is not defined):
      *
      * Device paging is not available in this configuration, so the
      * function panics unconditionally.  Every parameter is unused; the
      * return statement only satisfies the prototype.
      */
2836 kern_return_t
2837 ps_write_device(
2838         __unused paging_segment_t       ps,
2839         __unused dp_offset_t            offset,
2840         __unused vm_offset_t            addr,
2841         __unused unsigned int           size,
2842         __unused struct vs_async        *vsa)
2843 {
2844   panic("ps_write_device not supported");
2845   return KERN_FAILURE;
2846 }
2847
2848 #endif /* DEVICE_PAGING */
2849 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t);      /* forward */
2850
     /*
      * pvs_object_data_provided:
      *
      * Account for "size" bytes of page-in data: emit a debug trace and
      * add atop_32(size) to global_stats.gs_pages_in (through GSTAT).
      *
      * Only when RECLAIM_SWAP is enabled does the function also touch the
      * UPL: if "size" differs from upl->size the UPL is aborted with
      * UPL_ABORT_ERROR, otherwise the range is released from the backing
      * store with ps_clunmap() and the UPL is committed; in both cases the
      * UPL is then deallocated.
      *
      *      vs      vstruct the data came from.
      *      upl     UPL describing the pages.
      *      offset  offset of the data within the vstruct.
      *      size    number of bytes provided; asserted to be non-zero.
      *
      * NOTE(review): vs, upl and offset are marked __unused although the
      * DP_DEBUG arguments and the RECLAIM_SWAP branch reference them.
      */
2851 void
2852 pvs_object_data_provided(
2853         __unused vstruct_t              vs,
2854         __unused upl_t                  upl,
2855         __unused upl_offset_t   offset,
2856         upl_size_t                              size)
2857 {
2858
2859         DP_DEBUG(DEBUG_VS_INTERNAL,
2860                  ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2861                   upl, offset, size));
2862
2863         ASSERT(size > 0);
2864         GSTAT(global_stats.gs_pages_in += atop_32(size));
2865
2866 /* check upl iosync flag instead of using RECLAIM_SWAP*/
2867 #if     RECLAIM_SWAP
2868         if (size != upl->size) {
2869                 upl_abort(upl, UPL_ABORT_ERROR);
2870                 upl_deallocate(upl);
2871         } else {
2872                 ps_clunmap(vs, offset, size);
2873                 upl_commit(upl, NULL, 0);
2874                 upl_deallocate(upl);
2875         }
2876 #endif  /* RECLAIM_SWAP */
2877
2878 }
2879
2880 static memory_object_offset_t   last_start;     /* file-scope, starts at zero. NOTE(review): no use is visible in this part of the file; presumably the start of the last cluster read -- confirm */
2881 static vm_size_t                last_length;    /* file-scope, starts at zero. NOTE(review): presumably the length paired with last_start -- confirm */
2882
2883 /*
2884  * A "cnt" of 0 means that the caller just wants to check if the page at
2885  * offset "vs_offset" exists in the backing store.  That page hasn't been
2886  * prepared, so no need to release it.
2887  *
2888  * A "cnt" of -1 means that the caller wants to bring back from the backing
2889  * store all existing pages in the cluster containing "vs_offset".
2890  */
2891 kern_return_t
2892 pvs_cluster_read(
2893         vstruct_t       vs,
2894         dp_offset_t     vs_offset,
2895         dp_size_t       cnt,
2896         void            *fault_info)
2897 {
2898         kern_return_t           error = KERN_SUCCESS;
2899         unsigned int            size;
2900         unsigned int            residual;
2901         unsigned int            request_flags;
2902         int                     io_flags = 0;
2903         int                     seg_index;
2904         int                     pages_in_cl;
2905         int                     cl_size;
2906         int                     cl_mask;
2907         int                     cl_index;
2908         unsigned int            xfer_size;
2909         dp_offset_t             orig_vs_offset;
2910         dp_offset_t       ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2911         paging_segment_t        psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2912         struct clmap            clmap;
2913         upl_t                   upl;
2914         unsigned int            page_list_count;
2915         memory_object_offset_t  cluster_start;
2916         vm_size_t               cluster_length;
2917         uint32_t                io_streaming;
2918         int                     i;
2919         boolean_t               io_sync = FALSE;
2920
2921         pages_in_cl = 1 << vs->vs_clshift;
2922         cl_size = pages_in_cl * vm_page_size;
2923         cl_mask = cl_size - 1;
2924
2925         request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2926
2927         if (cnt == (dp_size_t) -1) {
2928                 /*
2929                  * We've been called from ps_vstruct_reclaim() to move all
2930                  * the object's swapped pages back to VM pages.
2931                  * This can put memory pressure on the system, so we do want
2932                  * to wait for free pages, to avoid getting in the way of the
2933                  * vm_pageout_scan() thread.
2934                  * Let's not use UPL_NOBLOCK in this case.
2935                  */
2936                 vs_offset &= ~cl_mask;
2937                 i = pages_in_cl;
2938         } else {
2939                 i = 1;
2940                 request_flags |= UPL_NOBLOCK;
2941         }
2942
2943 again:
2944         cl_index = (vs_offset & cl_mask) / vm_page_size;
2945
2946         if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2947             !CLMAP_ISSET(clmap, cl_index)) {
2948                 /*
2949                  * the needed page doesn't exist in the backing store...
2950                  * we don't want to try to do any I/O, just abort the
2951                  * page and let the fault handler provide a zero-fill
2952                  */
2953                 if (cnt == 0) {
2954                         /*
2955                          * The caller was just poking at us to see if
2956                          * the page has been paged out.  No need to
2957                          * mess with the page at all.
2958                          * Just let the caller know we don't have that page.
2959                          */
2960                         return KERN_FAILURE;
2961                 }
2962                 if (cnt == (dp_size_t) -1) {
2963                         i--;
2964                         if (i == 0) {
2965                                 /* no more pages in this cluster */
2966                                 return KERN_FAILURE;
2967                         }
2968                         /* try the next page in this cluster */
2969                         vs_offset += vm_page_size;
2970                         goto again;
2971                 }
2972
2973                 page_list_count = 0;
2974
2975                 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2976                                                 PAGE_SIZE, PAGE_SIZE,
2977                                                 &upl, NULL, &page_list_count,
2978                                                 request_flags);
2979
2980                 if (clmap.cl_error)
2981                         upl_abort(upl, UPL_ABORT_ERROR);
2982                 else
2983                         upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2984                 upl_deallocate(upl);
2985
2986                 return KERN_SUCCESS;
2987         }
2988
2989         if (cnt == 0) {
2990                 /*
2991                  * The caller was just poking at us to see if
2992                  * the page has been paged out.  No need to
2993                  * mess with the page at all.
2994                  * Just let the caller know we do have that page.
2995                  */
2996                 return KERN_SUCCESS;
2997         }
2998
2999         if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) {
3000                 io_sync = TRUE;
3001         } else {
3002 #if RECLAIM_SWAP
3003                 io_sync = TRUE;
3004 #endif  /* RECLAIM_SWAP */
3005         }
3006
3007         if( io_sync == TRUE ) {
3008
3009                 io_flags |= UPL_IOSYNC | UPL_NOCOMMIT;
3010 #if USE_PRECIOUS
3011                 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
3012 #else   /* USE_PRECIOUS */
3013                 request_flags |= UPL_REQUEST_SET_DIRTY;
3014 #endif  /* USE_PRECIOUS */
3015         }
3016
3017         assert(dp_encryption_inited);
3018         if (dp_encryption) {
3019                 /*
3020                  * ENCRYPTED SWAP:
3021                  * request that the UPL be prepared for
3022                  * decryption.
3023                  */
3024                 request_flags |= UPL_ENCRYPT;
3025                 io_flags |= UPL_PAGING_ENCRYPTED;
3026         }
3027         orig_vs_offset = vs_offset;
3028
3029         assert(cnt != 0);
3030         cnt = VM_SUPER_CLUSTER;
3031         cluster_start = (memory_object_offset_t) vs_offset;
3032         cluster_length = (vm_size_t) cnt;
3033         io_streaming = 0;
3034
3035         /*
3036          * determine how big a speculative I/O we should try for...
3037          */
3038         if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
3039                 assert(vs_offset >= (dp_offset_t) cluster_start &&
3040                        vs_offset < (dp_offset_t) (cluster_start + cluster_length));
3041                 vs_offset = (dp_offset_t) cluster_start;
3042                 cnt = (dp_size_t) cluster_length;
3043         } else {
3044                 cluster_length = PAGE_SIZE;
3045                 cnt = PAGE_SIZE;
3046         }
3047
3048         if (io_streaming)
3049                 io_flags |= UPL_IOSTREAMING;
3050
3051         last_start = cluster_start;
3052         last_length = cluster_length;
3053
3054         /*
3055          * This loop will be executed multiple times until the entire
3056          * range has been looked at or we issue an I/O... if the request spans cluster
3057          * boundaries, the clusters will be checked for logical continuity,
3058          * if contiguous the I/O request will span multiple clusters...
3059          * at most only 1 I/O will be issued... it will encompass the original offset
3060          */
3061         while (cnt && error == KERN_SUCCESS) {
3062                 int     ps_info_valid;
3063
3064                 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
3065                         size = VM_SUPER_CLUSTER;
3066                         size -= vs_offset & cl_mask;
3067                 } else if (cnt > VM_SUPER_CLUSTER)
3068                         size = VM_SUPER_CLUSTER;
3069                 else
3070                         size = cnt;
3071
3072                 cnt -= size;
3073
3074                 ps_info_valid = 0;
3075                 seg_index     = 0;
3076
3077                 while (size > 0 && error == KERN_SUCCESS) {
3078                         unsigned int  abort_size;
3079                         int           failed_size;
3080                         int           beg_pseg;
3081                         int           beg_indx;
3082                         dp_offset_t   cur_offset;
3083
3084                         if ( !ps_info_valid) {
3085                                 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3086                                 psp[seg_index]       = CLMAP_PS(clmap);
3087                                 ps_info_valid = 1;
3088                         }
3089                         /*
3090                          * skip over unallocated physical segments
3091                          */
3092                         if (ps_offset[seg_index] == (dp_offset_t) -1) {
3093                                 abort_size = cl_size - (vs_offset & cl_mask);
3094                                 abort_size = MIN(abort_size, size);
3095
3096                                 size      -= abort_size;
3097                                 vs_offset += abort_size;
3098
3099                                 seg_index++;
3100                                 ps_info_valid = 0;
3101
3102                                 continue;
3103                         }
3104                         cl_index = (vs_offset & cl_mask) / vm_page_size;
3105
3106                         for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
3107                                 /*
3108                                  * skip over unallocated pages
3109                                  */
3110                                 if (CLMAP_ISSET(clmap, cl_index))
3111                                         break;
3112                                 abort_size += vm_page_size;
3113                         }
3114                         if (abort_size) {
3115                                 size      -= abort_size;
3116                                 vs_offset += abort_size;
3117
3118                                 if (cl_index == pages_in_cl) {
3119                                         /*
3120                                          * if we're at the end of this physical cluster
3121                                          * then bump to the next one and continue looking
3122                                          */
3123                                         seg_index++;
3124                                         ps_info_valid = 0;
3125
3126                                         continue;
3127                                 }
3128                                 if (size == 0)
3129                                         break;
3130                         }
3131                         /*
3132                          * remember the starting point of the first allocated page
3133                          * for the I/O we're about to issue
3134                          */
3135                         beg_pseg   = seg_index;
3136                         beg_indx   = cl_index;
3137                         cur_offset = vs_offset;
3138
3139                         /*
3140                          * calculate the size of the I/O that we can do...
3141                          * this may span multiple physical segments if
3142                          * they are contiguous
3143                          */
3144                         for (xfer_size = 0; xfer_size < size; ) {
3145
3146                                 while (cl_index < pages_in_cl && xfer_size < size) {
3147                                         /*
3148                                          * accumulate allocated pages within
3149                                          * a physical segment
3150                                          */
3151                                         if (CLMAP_ISSET(clmap, cl_index)) {
3152                                                 xfer_size  += vm_page_size;
3153                                                 cur_offset += vm_page_size;
3154                                                 cl_index++;
3155
3156                                                 BS_STAT(psp[seg_index]->ps_bs,
3157                                                         psp[seg_index]->ps_bs->bs_pages_in++);
3158                                         } else
3159                                                 break;
3160                                 }
3161                                 if (cl_index < pages_in_cl || xfer_size >= size) {
3162                                         /*
3163                                          * we've hit an unallocated page or
3164                                          * the end of this request... see if
3165                                          * it's time to fire the I/O
3166                                          */
3167                                         break;
3168                                 }
3169                                 /*
3170                                  * we've hit the end of the current physical
3171                                  * segment and there's more to do, so try
3172                                  * moving to the next one
3173                                  */
3174                                 seg_index++;
3175
3176                                 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3177                                 psp[seg_index] = CLMAP_PS(clmap);
3178                                 ps_info_valid = 1;
3179
3180                                 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
3181                                         /*
3182                                          * if the physical segment we're about
3183                                          * to step into is not contiguous to
3184                                          * the one we're currently in, or it's
3185                                          * in a different paging file, or
3186                                          * it hasn't been allocated....
3187                                          * we stop this run and go check
3188                                          * to see if it's time to fire the I/O
3189                                          */
3190                                         break;
3191                                 }
3192                                 /*
3193                                  * start with first page of the next physical
3194                                  * segment
3195                                  */
3196                                 cl_index = 0;
3197                         }
3198                         if (xfer_size == 0) {
3199                                 /*
3200                                  * no I/O to generate for this segment
3201                                  */
3202                                 continue;
3203                         }
3204                         if (cur_offset <= orig_vs_offset) {
3205                                 /*
3206                                  * we've hit a hole in our speculative cluster
3207                                  * before the offset that we're really after...
3208                                  * don't issue the I/O since it doesn't encompass
3209                                  * the original offset and we're looking to only
3210                                  * pull in the speculative pages if they can be
3211                                  * made part of a single I/O
3212                                  */
3213                                 size      -= xfer_size;
3214                                 vs_offset += xfer_size;
3215
3216                                 continue;
3217                         }
3218                         /*
3219                          * we have a contiguous range of allocated pages
3220                          * to read from that encompasses the original offset
3221                          */
3222                         page_list_count = 0;
3223                         memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3224                                                         xfer_size, xfer_size,
3225                                                         &upl, NULL, &page_list_count,
3226                                                         request_flags | UPL_SET_INTERNAL);
3227
3228                         error = ps_read_file(psp[beg_pseg],
3229                                              upl, (upl_offset_t) 0,
3230                                              ps_offset[beg_pseg] + (beg_indx * vm_page_size),
3231                                              xfer_size, &residual, io_flags);
3232
3233                         failed_size = 0;
3234
3235                         /*
3236                          * Adjust counts and send response to VM.  Optimize
3237                          * for the common case, i.e. no error and/or partial
3238                          * data. If there was an error, then we need to error
3239                          * the entire range, even if some data was successfully
3240                          * read. If there was a partial read we may supply some
3241                          * data and may error some as well.  In all cases the
3242                          * VM must receive some notification for every page
3243                          * in the range.
3244                          */
3245                         if ((error == KERN_SUCCESS) && (residual == 0)) {
3246                                 /*
3247                                  * Got everything we asked for, supply the data
3248                                  * to the VM.  Note that as a side effect of
3249                                  * supplying the data, the buffer holding the
3250                                  * supplied data is deallocated from the pager's
3251                                  *  address space.
3252                                  */
3253                                 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
3254                         } else {
3255                                 failed_size = xfer_size;
3256
3257                                 if (error == KERN_SUCCESS) {
3258                                         if (residual == xfer_size) {
3259                                                 /*
3260                                                  * If a read operation returns no error
3261                                                  * and no data moved, we turn it into
3262                                                  * an error, assuming we're reading at
3263                                                  * or beyond EOF.
3264                                                  * Fall through and error the entire range.
3265                                                  */
3266                                                 error = KERN_FAILURE;
3267                                         } else {
3268                                                 /*
3269                                                  * Otherwise, we have partial read. If
3270                                                  * the part read is an integral number
3271                                                  * of pages supply it. Otherwise round
3272                                                  * it up to a page boundary, zero fill
3273                                                  * the unread part, and supply it.
3274                                                  * Fall through and error the remainder
3275                                                  * of the range, if any.
3276                                                  */
3277                                                 int fill;
3278                                                 unsigned int lsize;
3279
3280                                                 fill = residual & ~vm_page_size;
3281                                                 lsize = (xfer_size - residual) + fill;
3282
3283                                                 pvs_object_data_provided(vs, upl, vs_offset, lsize);
3284
3285                                                 if (lsize < xfer_size) {
3286                                                         failed_size = xfer_size - lsize;
3287                                                         error = KERN_FAILURE;
3288                                                 }
3289                                         }
3290                                 }
3291                         }
3292                         if (error != KERN_SUCCESS) {
3293                                 /*
3294                                  * There was an error in some part of the range, tell
3295                                  * the VM. Note that error is explicitly checked again
3296                                  * since it can be modified above.
3297                                  */
3298                                 BS_STAT(psp[beg_pseg]->ps_bs,
3299                                         psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
3300                         }
3301                         /*
3302                          * we've issued a single I/O that encompassed the original offset
3303                          * at this point we either met our speculative request length or
3304                          * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3305                          * not present or not physically contiguous to the previous one), so
3306                          * we're done issuing I/O at this point
3307                          */
3308                         return (error);
3309                 }
3310         }
3311         return error;
3312 }
3313
3314 int vs_do_async_write = 1;      /* global flag, on (1) by default; NOTE(review): presumably selects asynchronous writes in the pageout path -- confirm at its users */
3315
3316 kern_return_t
3317 vs_cluster_write(
3318         vstruct_t       vs,
3319         upl_t           internal_upl,
3320         upl_offset_t    offset,
3321         upl_size_t      cnt,
3322         boolean_t       dp_internal,
3323         int             flags)
3324 {
3325         upl_size_t      transfer_size;
3326         int             error = 0;
3327         struct clmap    clmap;
3328
3329         dp_offset_t     actual_offset;  /* Offset within paging segment */
3330         paging_segment_t ps;
3331         dp_offset_t     mobj_base_addr;
3332         dp_offset_t     mobj_target_addr;
3333
3334         upl_t           upl;
3335         upl_page_info_t *pl;
3336         int             page_index;
3337         unsigned int    page_max_index;
3338         int             list_size;
3339         int             pages_in_cl;
3340         unsigned int    cl_size;
3341         int             base_index;
3342         unsigned int    seg_size;
3343         unsigned int    upl_offset_in_object;
3344         boolean_t       minimal_clustering = FALSE;
3345         boolean_t       found_dirty;
3346
3347         if (!dp_encryption_inited) {
3348                 /*
3349                  * ENCRYPTED SWAP:
3350                  * Once we've started using swap, we
3351                  * can't change our mind on whether
3352                  * it needs to be encrypted or
3353                  * not.
3354                  */
3355                 dp_encryption_inited = TRUE;
3356         }
3357         if (dp_encryption) {
3358                 /*
3359                  * ENCRYPTED SWAP:
3360                  * the UPL will need to be encrypted...
3361                  */
3362                 flags |= UPL_PAGING_ENCRYPTED;
3363         }
3364
3365         pages_in_cl = 1 << vs->vs_clshift;
3366         cl_size = pages_in_cl * vm_page_size;
3367
3368 #if CONFIG_FREEZE
3369         minimal_clustering = TRUE;
3370 #else
3371         if (dp_isssd == TRUE)
3372                 minimal_clustering = TRUE;
3373 #endif
3374         if (!dp_internal) {
3375                 unsigned int page_list_count;
3376                 int          request_flags;
3377                 unsigned int super_size;
3378                 int          first_dirty;
3379                 int          num_dirty;
3380                 int          num_of_pages;
3381                 int          seg_index;
3382                 upl_offset_t  upl_offset;
3383                 upl_offset_t  upl_offset_aligned;
3384                 dp_offset_t  seg_offset;
3385                 dp_offset_t  ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3386                 paging_segment_t   psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3387
3388
3389                 if (bs_low)
3390                         super_size = cl_size;
3391                 else
3392                         super_size = VM_SUPER_CLUSTER;
3393
3394                 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3395                                 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3396                                 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3397
3398                 if (dp_encryption) {
3399                         /*
3400                          * ENCRYPTED SWAP:
3401                          * request that the UPL be prepared for
3402                          * encryption.
3403                          */
3404                         request_flags |= UPL_ENCRYPT;
3405                         flags |= UPL_PAGING_ENCRYPTED;
3406                 }
3407
3408                 page_list_count = 0;
3409                 memory_object_super_upl_request(vs->vs_control,
3410                                 (memory_object_offset_t)offset,
3411                                 cnt, super_size,
3412                                 &upl, NULL, &page_list_count,
3413                                 request_flags | UPL_FOR_PAGEOUT);
3414
3415                 /*
3416                  * The default pager does not handle objects larger than
3417                  * 4GB, so it does not deal with offset that don't fit in
3418                  * 32-bit.  Cast down upl->offset now and make sure we
3419                  * did not lose any valuable bits.
3420                  */
3421                 upl_offset_in_object = (unsigned int) upl->offset;
3422                 assert(upl->offset == upl_offset_in_object);
3423
3424                 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3425
3426                 seg_size = cl_size - (upl_offset_in_object % cl_size);
3427                 upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
3428                 page_index = 0;
3429                 page_max_index = upl->size / PAGE_SIZE;
3430                 found_dirty = TRUE;
3431
3432                 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
3433
3434                         unsigned int    seg_pgcnt;
3435
3436                         seg_pgcnt = seg_size / PAGE_SIZE;
3437
3438                         if (minimal_clustering == TRUE) {
3439                                 unsigned int    non_dirty;
3440
3441                                 non_dirty = 0;
3442                                 found_dirty = FALSE;
3443
3444                                 for (; non_dirty < seg_pgcnt; non_dirty++) {
3445                                         if ((page_index + non_dirty) >= page_max_index)
3446                                                 break;
3447
3448                                         if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
3449                                             UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
3450                                                 found_dirty = TRUE;
3451                                                 break;
3452                                         }
3453                                 }
3454                         }
3455                         if (found_dirty == TRUE) {
3456                                 ps_offset[seg_index] =
3457                                         ps_clmap(vs,
3458                                                  upl_offset_aligned,
3459                                                  &clmap, CL_ALLOC,
3460                                                  cl_size, 0);
3461
3462                                 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3463                                         upl_abort(upl, 0);
3464                                         upl_deallocate(upl);
3465
3466                                         return KERN_FAILURE;
3467                                 }
3468                                 psp[seg_index] = CLMAP_PS(clmap);
3469                         }
3470                         if (transfer_size > seg_size) {
3471                                 page_index += seg_pgcnt;
3472                                 transfer_size -= seg_size;
3473                                 upl_offset_aligned += cl_size;
3474                                 seg_size = cl_size;
3475                                 seg_index++;
3476                         } else
3477                                 transfer_size = 0;
3478                 }
3479                 /*
3480                  * Ignore any non-present pages at the end of the
3481                  * UPL.
3482                  */
3483                 for (page_index = upl->size / vm_page_size; page_index > 0;)
3484                         if (UPL_PAGE_PRESENT(pl, --page_index))
3485                                 break;
3486                 num_of_pages = page_index + 1;
3487
3488                 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
3489
3490                 for (page_index = 0; page_index < num_of_pages; ) {
3491                         /*
3492                          * skip over non-dirty pages
3493                          */
3494                         for ( ; page_index < num_of_pages; page_index++) {
3495                                 if (UPL_DIRTY_PAGE(pl, page_index)
3496                                         || UPL_PRECIOUS_PAGE(pl, page_index))
3497                                         /*
3498                                          * this is a page we need to write
3499                                          * go see if we can buddy it up with
3500                                          * others that are contiguous to it
3501                                          */
3502                                         break;
3503                                 /*
3504                                  * if the page is not-dirty, but present we
3505                                  * need to commit it...  This is an unusual
3506                                  * case since we only asked for dirty pages
3507                                  */
3508                                 if (UPL_PAGE_PRESENT(pl, page_index)) {
3509                                         boolean_t empty = FALSE;
3510                                         upl_commit_range(upl,
3511                                                  page_index * vm_page_size,
3512                                                  vm_page_size,
3513                                                  UPL_COMMIT_NOTIFY_EMPTY,
3514                                                  pl,
3515                                                  page_list_count,
3516                                                  &empty);
3517                                         if (empty) {
3518                                                 assert(page_index ==
3519                                                        num_of_pages - 1);
3520                                                 upl_deallocate(upl);
3521                                         }
3522                                 }
3523                         }
3524                         if (page_index == num_of_pages)
3525                                 /*
3526                                  * no more pages to look at, we're out of here
3527                                  */
3528                                 break;
3529
3530                         /*
3531                          * gather up contiguous dirty pages... we have at
3532                          * least 1, otherwise we would have bailed above
3533                          * make sure that each physical segment that we step
3534                          * into is contiguous to the one we're currently in
3535                          * if it's not, we have to stop and write what we have
3536                          */
3537                         for (first_dirty = page_index;
3538                                         page_index < num_of_pages; ) {
3539                                 if ( !UPL_DIRTY_PAGE(pl, page_index)
3540                                         && !UPL_PRECIOUS_PAGE(pl, page_index))
3541                                         break;
3542                                 page_index++;
3543                                 /*
3544                                  * if we just looked at the last page in the UPL
3545                                  * we don't need to check for physical segment
3546                                  * continuity
3547                                  */
3548                                 if (page_index < num_of_pages) {
3549                                         int cur_seg;
3550                                         int nxt_seg;
3551
3552                                         cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3553                                         nxt_seg = (base_index + page_index)/pages_in_cl;
3554
3555                                         if (cur_seg != nxt_seg) {
3556                                                 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3557                                                 /*
3558                                                  * if the segment we're about
3559                                                  * to step into is not
3560                                                  * contiguous to the one we're
3561                                                  * currently in, or it's in a
3562                                                  * different paging file....
3563                                                  * we stop here and generate
3564                                                  * the I/O
3565                                                  */
3566                                                         break;
3567                                         }
3568                                 }
3569                         }
3570                         num_dirty = page_index - first_dirty;
3571
3572                         if (num_dirty) {
3573                                 upl_offset = first_dirty * vm_page_size;
3574                                 transfer_size = num_dirty * vm_page_size;
3575
3576                                 while (transfer_size) {
3577
3578                                         if ((seg_size = cl_size -
3579                                                 ((upl_offset_in_object +
3580                                                   upl_offset) % cl_size))
3581                                                         > transfer_size)
3582                                                 seg_size = transfer_size;
3583
3584                                         ps_vs_write_complete(
3585                                                 vs,
3586                                                 (upl_offset_in_object +
3587                                                  upl_offset),
3588                                                 seg_size, error);
3589
3590                                         transfer_size -= seg_size;
3591                                         upl_offset += seg_size;
3592                                 }
3593                                 upl_offset = first_dirty * vm_page_size;
3594                                 transfer_size = num_dirty * vm_page_size;
3595
3596                                 seg_index  = (base_index + first_dirty) / pages_in_cl;
3597                                 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
3598
3599                                 error = ps_write_file(psp[seg_index],
3600                                                 upl, upl_offset,
3601                                                 ps_offset[seg_index]
3602                                                                 + seg_offset,
3603                                                 transfer_size, flags);
3604                         } else {
3605                                 boolean_t empty = FALSE;
3606                                 upl_abort_range(upl,
3607                                                 first_dirty * vm_page_size,
3608                                                 num_dirty   * vm_page_size,
3609                                                 UPL_ABORT_NOTIFY_EMPTY,
3610                                                 &empty);
3611                                 if (empty) {
3612                                         assert(page_index == num_of_pages);
3613                                         upl_deallocate(upl);
3614                                 }
3615                         }
3616                 }
3617
3618         } else {
3619                 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
3620                 list_size = cnt;
3621
3622                 page_index = 0;
3623                 /* The caller provides a mapped_data which is derived  */
3624                 /* from a temporary object.  The targeted pages are    */
3625                 /* guaranteed to be set at offset 0 in the mapped_data */
3626                 /* The actual offset however must still be derived     */
3627                 /* from the offset in the vs in question               */
3628                 mobj_base_addr = offset;
3629                 mobj_target_addr = mobj_base_addr;
3630
3631                 for (transfer_size = list_size; transfer_size != 0;) {
3632                         actual_offset = ps_clmap(vs, mobj_target_addr,
3633                                 &clmap, CL_ALLOC,
3634                                 transfer_size < cl_size ?
3635                                         transfer_size : cl_size, 0);
3636                         if(actual_offset == (dp_offset_t) -1) {
3637                                 error = 1;
3638                                 break;
3639                         }
3640                         cnt = MIN(transfer_size,
3641                                   (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
3642                         ps = CLMAP_PS(clmap);
3643                         /* Assume that the caller has given us contiguous */
3644                         /* pages */
3645                         if(cnt) {
3646                                 ps_vs_write_complete(vs, mobj_target_addr,
3647                                                                 cnt, error);
3648                                 error = ps_write_file(ps, internal_upl,
3649                                                 0, actual_offset,
3650                                                 cnt, flags);
3651                                 if (error)
3652                                         break;
3653                            }
3654                         if (error)
3655                                 break;
3656                         actual_offset += cnt;
3657                         mobj_target_addr += cnt;
3658                         transfer_size -= cnt;
3659                         cnt = 0;
3660
3661                         if (error)
3662                                 break;
3663                 }
3664         }
3665         if(error)
3666                 return KERN_FAILURE;
3667         else
3668                 return KERN_SUCCESS;
3669 }
3670
3671 vm_size_t
3672 ps_vstruct_allocated_size(
3673         vstruct_t       vs)
3674 {
3675         int             num_pages;
3676         struct vs_map   *vsmap;
3677         unsigned int    i, j, k;
3678
3679         num_pages = 0;
3680         if (vs->vs_indirect) {
3681                 /* loop on indirect maps */
3682                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3683                         vsmap = vs->vs_imap[i];
3684                         if (vsmap == NULL)
3685                                 continue;
3686                         /* loop on clusters in this indirect map */
3687                         for (j = 0; j < CLMAP_ENTRIES; j++) {
3688                                 if (VSM_ISCLR(vsmap[j]) ||
3689                                     VSM_ISERR(vsmap[j]))
3690                                         continue;
3691                                 /* loop on pages in this cluster */
3692                                 for (k = 0; k < VSCLSIZE(vs); k++) {
3693                                         if ((VSM_BMAP(vsmap[j])) & (1 << k))
3694                                                 num_pages++;
3695                                 }
3696                         }
3697                 }
3698         } else {
3699                 vsmap = vs->vs_dmap;
3700                 if (vsmap == NULL)
3701                         return 0;
3702                 /* loop on clusters in the direct map */
3703                 for (j = 0; j < CLMAP_ENTRIES; j++) {
3704                         if (VSM_ISCLR(vsmap[j]) ||
3705                             VSM_ISERR(vsmap[j]))
3706                                 continue;
3707                         /* loop on pages in this cluster */
3708                         for (k = 0; k < VSCLSIZE(vs); k++) {
3709                                 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3710                                         num_pages++;
3711                         }
3712                 }
3713         }
3714
3715         return ptoa_32(num_pages);
3716 }
3717
3718 unsigned int
3719 ps_vstruct_allocated_pages(
3720         vstruct_t               vs,
3721         default_pager_page_t    *pages,
3722         unsigned int            pages_size)
3723 {
3724         unsigned int    num_pages;
3725         struct vs_map   *vsmap;
3726         dp_offset_t     offset;
3727         unsigned int    i, j, k;
3728
3729         num_pages = 0;
3730         offset = 0;
3731         if (vs->vs_indirect) {
3732                 /* loop on indirect maps */
3733                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3734                         vsmap = vs->vs_imap[i];
3735                         if (vsmap == NULL) {
3736                                 offset += (vm_page_size * CLMAP_ENTRIES *
3737                                            VSCLSIZE(vs));
3738                                 continue;
3739                         }
3740                         /* loop on clusters in this indirect map */
3741                         for (j = 0; j < CLMAP_ENTRIES; j++) {
3742                                 if (VSM_ISCLR(vsmap[j]) ||
3743                                     VSM_ISERR(vsmap[j])) {
3744                                         offset += vm_page_size * VSCLSIZE(vs);
3745                                         continue;
3746                                 }
3747                                 /* loop on pages in this cluster */
3748                                 for (k = 0; k < VSCLSIZE(vs); k++) {
3749                                         if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3750                                                 num_pages++;
3751                                                 if (num_pages < pages_size)
3752                                                         pages++->dpp_offset =
3753                                                                 offset;
3754                                         }
3755                                         offset += vm_page_size;
3756                                 }
3757                         }
3758                 }
3759         } else {
3760                 vsmap = vs->vs_dmap;
3761                 if (vsmap == NULL)
3762                         return 0;
3763                 /* loop on clusters in the direct map */
3764                 for (j = 0; j < CLMAP_ENTRIES; j++) {
3765                         if (VSM_ISCLR(vsmap[j]) ||
3766                             VSM_ISERR(vsmap[j])) {
3767                                 offset += vm_page_size * VSCLSIZE(vs);
3768                                 continue;
3769                         }
3770                         /* loop on pages in this cluster */
3771                         for (k = 0; k < VSCLSIZE(vs); k++) {
3772                                 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3773                                         num_pages++;
3774                                         if (num_pages < pages_size)
3775                                                 pages++->dpp_offset = offset;
3776                                 }
3777                                 offset += vm_page_size;
3778                         }
3779                 }
3780         }
3781
3782         return num_pages;
3783 }
3784
3785
/*
 * ps_vstruct_transfer_from_segment:
 *
 * Scan the cluster map(s) of "vs" and, for every cluster entry that is
 * neither clear nor in error and whose paging segment (VSM_PS) is
 * "segment", call vs_cluster_transfer() on that cluster's byte range
 * (one cluster is vm_page_size << vs->vs_clshift bytes).
 *
 *   vs       - vstruct whose clusters are to be moved
 *   segment  - paging segment the clusters are being moved away from
 *   upl      - passed through unchanged to vs_cluster_transfer()
 *
 * Exclusion protocol: vs_xfer_pending is set and the write claim is taken
 * (vs_wait_for_sync_writers / vs_start_write / vs_wait_for_readers) under
 * VS_LOCK before scanning; the claim is dropped and re-taken after every
 * cluster that was transferred.  Because the claim is dropped in between,
 * vs->vs_indirect is re-checked afterwards and the whole scan restarts at
 * vs_changed if it differs from the branch currently being executed.
 *
 * Returns KERN_FAILURE if vs_cluster_transfer() fails or (indirect path
 * only) if compaction has been asked to abort or stop; KERN_SUCCESS
 * otherwise.  On every return path vs_xfer_pending has been cleared and
 * vs_finish_write() has been called.
 */
3786 kern_return_t
3787 ps_vstruct_transfer_from_segment(
3788         vstruct_t        vs,
3789         paging_segment_t segment,
3790         upl_t            upl)
3791 {
3792         struct vs_map   *vsmap;
3793 //      struct vs_map   old_vsmap;
3794 //      struct vs_map   new_vsmap;
3795         unsigned int    i, j;
3796
3797         VS_LOCK(vs);    /* block all work on this vstruct */
3798                         /* can't allow the normal multiple write */
3799                         /* semantic because writes may conflict */
3800         vs->vs_xfer_pending = TRUE;
3801         vs_wait_for_sync_writers(vs);
3802         vs_start_write(vs);
3803         vs_wait_for_readers(vs);
3804         /* we will unlock the vs to allow other writes while transferring */
3805         /* and will be guaranteed of the persistence of the vs struct     */
3806         /* because the caller of  ps_vstruct_transfer_from_segment bumped */
3807         /* vs_async_pending */
3808         /* OK we now have guaranteed no other parties are accessing this */
3809         /* vs.  Now that we are also supporting simple lock versions of  */
3810         /* vs_lock we cannot hold onto VS_LOCK as we may block below.    */
3811         /* our purpose in holding it before was the multiple write case */
3812         /* we now use the boolean xfer_pending to do that.  We can use  */
3813         /* a boolean instead of a count because we have guaranteed single */
3814         /* file access to this code in its caller */
3815         VS_UNLOCK(vs);
     /* The scan restarts from here whenever vs->vs_indirect is seen to  */
     /* have changed after the write claim was re-taken.                 */
3816 vs_changed:
3817         if (vs->vs_indirect) {
3818                 unsigned int    vsmap_size;
3819                 int             clmap_off;
3820                 /* loop on indirect maps */
3821                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3822                         vsmap = vs->vs_imap[i];
3823                         if (vsmap == NULL)
3824                                 continue;
3825                         /* loop on clusters in this indirect map */
                             /* clmap_off: byte offset of the first cluster */
                             /* described by indirect block i.              */
                             /* NOTE(review): clmap_off is a plain int; for */
                             /* very large objects this product may not fit */
                             /* - confirm the range of vs_size.             */
3826                         clmap_off = (vm_page_size * CLMAP_ENTRIES *
3827                                            VSCLSIZE(vs) * i);
                             /* the last indirect block only covers the     */
                             /* clusters left over past CLMAP_ENTRIES * i;  */
                             /* every other block is scanned in full        */
3828                         if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3829                                 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3830                         else
3831                                 vsmap_size = CLMAP_ENTRIES;
3832                         for (j = 0; j < vsmap_size; j++) {
                                     /* only clusters that currently live on */
                                     /* "segment" are of interest            */
3833                                 if (VSM_ISCLR(vsmap[j]) ||
3834                                     VSM_ISERR(vsmap[j]) ||
3835                                     (VSM_PS(vsmap[j]) != segment))
3836                                         continue;
3837                                 if(vs_cluster_transfer(vs,
3838                                         (vm_page_size * (j << vs->vs_clshift))
3839                                         + clmap_off,
3840                                         vm_page_size << vs->vs_clshift,
3841                                         upl)
3842                                                 != KERN_SUCCESS) {
                                        /* transfer failed: drop the claim   */
                                        /* before reporting the failure      */
3843                                    VS_LOCK(vs);
3844                                    vs->vs_xfer_pending = FALSE;
3845                                    VS_UNLOCK(vs);
3846                                    vs_finish_write(vs);
3847                                    return KERN_FAILURE;
3848                                 }
3849                                 /* allow other readers/writers during transfer*/
3850                                 VS_LOCK(vs);
3851                                 vs->vs_xfer_pending = FALSE;
3852                                 VS_UNLOCK(vs);
3853                                 vs_finish_write(vs);
3854
                                     /* The claim is already released here,  */
                                     /* so an abort/stop request can return  */
                                     /* directly.  Only the abort flag is    */
                                     /* reset; the stop flag is left as is.  */
3855                                 if (backing_store_abort_compaction || backing_store_stop_compaction) {
3856                                         backing_store_abort_compaction = FALSE;
3857                                         dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n"));
3858                                         return KERN_FAILURE;
3859                                 }
3860                                 vnode_pager_throttle();
3861
                                     /* re-take the exclusive claim before   */
                                     /* looking at the next cluster          */
3862                                 VS_LOCK(vs);
3863                                 vs->vs_xfer_pending = TRUE;
3864                                 vs_wait_for_sync_writers(vs);
3865                                 vs_start_write(vs);
3866                                 vs_wait_for_readers(vs);
3867                                 VS_UNLOCK(vs);
                                     /* the vstruct stopped being indirect   */
                                     /* while unclaimed: rescan from the top */
3868                                 if (!(vs->vs_indirect)) {
3869                                         goto vs_changed;
3870                                 }
3871                         }
3872                 }
3873         } else {
3874                 vsmap = vs->vs_dmap;
3875                 if (vsmap == NULL) {
                             /* no direct map: nothing to move, release     */
                             /* the claim and report success                */
3876                         VS_LOCK(vs);
3877                         vs->vs_xfer_pending = FALSE;
3878                         VS_UNLOCK(vs);
3879                         vs_finish_write(vs);
3880                         return KERN_SUCCESS;
3881                 }
3882                 /* loop on clusters in the direct map */
                     /* NOTE(review): unlike the indirect loop above, this   */
                     /* path neither checks the compaction abort/stop flags  */
                     /* nor calls vnode_pager_throttle() between clusters -  */
                     /* confirm that this difference is intentional.         */
3883                 for (j = 0; j < vs->vs_size; j++) {
3884                         if (VSM_ISCLR(vsmap[j]) ||
3885                             VSM_ISERR(vsmap[j]) ||
3886                             (VSM_PS(vsmap[j]) != segment))
3887                                 continue;
3888                         if(vs_cluster_transfer(vs,
3889                                 vm_page_size * (j << vs->vs_clshift),
3890                                 vm_page_size << vs->vs_clshift,
3891                                 upl) != KERN_SUCCESS) {
                                /* transfer failed: drop the claim before    */
                                /* reporting the failure                     */
3892                            VS_LOCK(vs);
3893                            vs->vs_xfer_pending = FALSE;
3894                            VS_UNLOCK(vs);
3895                            vs_finish_write(vs);
3896                            return KERN_FAILURE;
3897                         }
3898                         /* allow other readers/writers during transfer*/
3899                         VS_LOCK(vs);
3900                         vs->vs_xfer_pending = FALSE;
3901                         VS_UNLOCK(vs);
3902                         vs_finish_write(vs);
                             /* immediately re-take the exclusive claim     */
3903                         VS_LOCK(vs);
3904                         vs->vs_xfer_pending = TRUE;
3905                         vs_wait_for_sync_writers(vs);
3906                         vs_start_write(vs);
3907                         vs_wait_for_readers(vs);
3908                         VS_UNLOCK(vs);
                             /* the vstruct became indirect while unclaimed: */
                             /* rescan from the top using the indirect path  */
3909                         if (vs->vs_indirect) {
3910                                 goto vs_changed;
3911                         }
3912                 }
3913         }
3914
             /* scan complete: release the claim taken at entry or re-taken  */
             /* after the last transferred cluster                           */
3915         VS_LOCK(vs);
3916         vs->vs_xfer_pending = FALSE;
3917         VS_UNLOCK(vs);
3918         vs_finish_write(vs);
3919         return KERN_SUCCESS;
3920 }
3921
3922
3923
3924 vs_map_t
3925 vs_get_map_entry(
3926         vstruct_t       vs,
3927         dp_offset_t     offset)
3928 {
3929         struct vs_map   *vsmap;
3930         dp_offset_t     cluster;
3931
3932         cluster = atop_32(offset) >> vs->vs_clshift;
3933         if (vs->vs_indirect) {
3934                 long    ind_block = cluster/CLMAP_ENTRIES;
3935
3936                 /* Is the indirect block allocated? */
3937                 vsmap = vs->vs_imap[ind_block];
3938                 if(vsmap == (vs_map_t) NULL)
3939                         return vsmap;
3940         } else
3941                 vsmap = vs->vs_dmap;
3942         vsmap += cluster%CLMAP_ENTRIES;
3943         return vsmap;
3944 }
3945
/*
 * vs_cluster_transfer:
 *
 * Move the data backing the byte range [offset, offset + cnt) of "vs" to
 * new backing store: each run of present pages is read with
 * ps_read_file() and then written back out with vs_cluster_write().
 *
 *   vs      - vstruct whose cluster map entries are rewritten
 *   offset  - byte offset within the vstruct at which to start
 *   cnt     - number of bytes to process
 *   upl     - UPL handed to both ps_read_file() (at UPL offset 0) and
 *             vs_cluster_write() as the data buffer
 *
 * Map-entry bookkeeping used below:
 *   original_read_vsmap - copy of the cluster's map entry taken before
 *                         the first read of that cluster; put back into
 *                         the map when the write or a short read fails
 *   read_vsmap          - copy of the map entry taken right after the
 *                         read, i.e. the entry describing the old store
 *   write_vsmap         - entry describing the new backing store built
 *                         up so far for the current cluster; starts out
 *                         cleared and is cleared again whenever a
 *                         cluster is finished or abandoned
 * The live entry (*vsmap_ptr) is temporarily switched between these
 * copies so that vs_cluster_write() and ps_clunmap() operate on the
 * intended (new resp. old) backing store.
 *
 * Returns KERN_SUCCESS when the whole range was processed; otherwise
 * the error from ps_read_file() or KERN_FAILURE.
 *
 * NOTE(review): vs_get_map_entry() can return NULL for an unallocated
 * indirect block and vsmap_ptr is dereferenced without a check -
 * presumably callers only pass offsets inside mapped clusters; confirm.
 */
3946 kern_return_t
3947 vs_cluster_transfer(
3948         vstruct_t       vs,
3949         dp_offset_t     offset,
3950         dp_size_t       cnt,
3951         upl_t           upl)
3952 {
3953         dp_offset_t             actual_offset;
3954         paging_segment_t        ps;
3955         struct clmap            clmap;
3956         kern_return_t           error = KERN_SUCCESS;
3957         unsigned int            size, size_wanted;
3958         int                     i;
3959         unsigned int            residual = 0;
3960         unsigned int            unavail_size;
3961 //      default_pager_thread_t  *dpt;
3962 //      boolean_t               dealloc;
3963         struct  vs_map          *vsmap_ptr = NULL;
3964         struct  vs_map          read_vsmap;
3965         struct  vs_map          original_read_vsmap;
3966         struct  vs_map          write_vsmap;
3967 //      upl_t                           sync_upl;
3968 //      vm_offset_t                     ioaddr;
3969
3970         /* vs_cluster_transfer reads in the pages of a cluster and
3971          * then writes these pages back to new backing store.  The
3972          * segment the pages are being read from is assumed to have
3973          * been taken off-line and is no longer considered for new
3974          * space requests.
3975          */
3976
3977         /*
3978          * This loop will be executed once per cluster referenced.
3979          * Typically this means once, since it's unlikely that the
3980          * VM system will ask for anything spanning cluster boundaries.
3981          *
3982          * If there are holes in a cluster (in a paging segment), we stop
3983          * reading at the hole, then loop again, hoping to
3984          * find valid pages later in the cluster.  This continues until
3985          * the entire range has been examined, and read, if present.  The
3986          * pages are written as they are read.  If a failure occurs after
3987          * some pages are written the unmap call at the bottom of the loop
3988          * recovers the backing store and the old backing store remains
3989          * in effect.
3990          */
3991
3992         VSM_CLR(write_vsmap);
3993         VSM_CLR(original_read_vsmap);
3994         /* grab the actual object's pages to sync with I/O */
3995         while (cnt && (error == KERN_SUCCESS)) {
                     /* map entry for the cluster containing "offset", and  */
                     /* the location of "offset" in its paging segment      */
3996                 vsmap_ptr = vs_get_map_entry(vs, offset);
3997                 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3998
3999                 if (actual_offset == (dp_offset_t) -1) {
4000
4001                         /*
4002                          * Nothing left to write in this cluster at least
4003                          * set write cluster information for any previous
4004                          * write, clear for next cluster, if there is one
4005                          */
4006                         unsigned int local_size, clmask, clsize;
4007
                             /* local_size: bytes from "offset" to the end   */
                             /* of its cluster, limited to what is left      */
4008                         clsize = vm_page_size << vs->vs_clshift;
4009                         clmask = clsize - 1;
4010                         local_size = clsize - (offset & clmask);
4011                         ASSERT(local_size);
4012                         local_size = MIN(local_size, cnt);
4013
4014                         /* This cluster has no data in it beyond what may */
4015                         /* have been found on a previous iteration through */
4016                         /* the loop "write_vsmap" */
4017                         *vsmap_ptr = write_vsmap;
4018                         VSM_CLR(write_vsmap);
4019                         VSM_CLR(original_read_vsmap);
4020
4021                         cnt -= local_size;
4022                         offset += local_size;
4023                         continue;
4024                 }
4025
4026                 /*
4027                  * Count up contiguous available or unavailable
4028                  * pages.
4029                  */
4030                 ps = CLMAP_PS(clmap);
4031                 ASSERT(ps);
4032                 size = 0;
4033                 unavail_size = 0;
                     /* The scan stops at the first transition, so on exit  */
                     /* exactly one of size / unavail_size is non-zero: a   */
                     /* run of present pages or a run of missing pages.     */
4034                 for (i = 0;
4035                      (size < cnt) && (unavail_size < cnt) &&
4036                      (i < CLMAP_NPGS(clmap)); i++) {
4037                         if (CLMAP_ISSET(clmap, i)) {
4038                                 if (unavail_size != 0)
4039                                         break;
4040                                 size += vm_page_size;
4041                                 BS_STAT(ps->ps_bs,
4042                                         ps->ps_bs->bs_pages_in++);
4043                         } else {
4044                                 if (size != 0)
4045                                         break;
4046                                 unavail_size += vm_page_size;
4047                         }
4048                 }
4049
4050                 if (size == 0) {
                             /* a run of missing pages: unmap it from the   */
                             /* current entry and step over it              */
4051                         ASSERT(unavail_size);
4052                         ps_clunmap(vs, offset, unavail_size);
4053                         cnt -= unavail_size;
4054                         offset += unavail_size;
4055                         if((offset & ((vm_page_size << vs->vs_clshift) - 1))
4056                                 == 0) {
4057                                 /* There is no more to transfer in this
4058                                    cluster
4059                                 */
4060                                 *vsmap_ptr = write_vsmap;
4061                                 VSM_CLR(write_vsmap);
4062                                 VSM_CLR(original_read_vsmap);
4063                         }
4064                         continue;
4065                 }
4066
                     /* first read in this cluster: remember the entry as   */
                     /* it was so it can be restored on failure             */
4067                 if(VSM_ISCLR(original_read_vsmap))
4068                         original_read_vsmap = *vsmap_ptr;
4069
4070                 if(ps->ps_segtype == PS_PARTITION) {
4071                         panic("swap partition not supported\n");
4072                         /*NOTREACHED*/
4073                         error = KERN_FAILURE;
4074                         residual = size;
4075 /*
4076                         NEED TO ISSUE WITH SYNC & NO COMMIT
4077                         error = ps_read_device(ps, actual_offset, &buffer,
4078                                        size, &residual, flags);
4079 */
4080                 } else {
4081                         /* NEED TO ISSUE WITH SYNC & NO COMMIT */
4082                         error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
4083                                         size, &residual,
4084                                         (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0)));
4085                 }
4086
                     /* snapshot of the entry describing the old store      */
4087                 read_vsmap = *vsmap_ptr;
4088
4089
4090                 /*
4091                  * Adjust counts and put data in new BS.  Optimize for the
4092                  * common case, i.e. no error and/or partial data.
4093                  * If there was an error, then we need to error the entire
4094                  * range, even if some data was successfully read.
4095                  *
4096                  */
4097                 if ((error == KERN_SUCCESS) && (residual == 0)) {
4098
4099                         /*
4100                          * Got everything we asked for, supply the data to
4101                          * the new BS.  Note that as a side effect of supplying
4102                          * the data, the buffer holding the supplied data is
4103                          * deallocated from the pager's address space unless
4104                          * the write is unsuccessful.
4105                          */
4106
4107                         /* note buffer will be cleaned up in all cases by */
4108                         /* internal_cluster_write or if an error on write */
4109                         /* the vm_map_copy_page_discard call              */
                             /* install the new-store entry built so far so */
                             /* that vs_cluster_write() works against it    */
4110                         *vsmap_ptr = write_vsmap;
4111
4112                         if(vs_cluster_write(vs, upl, offset,
4113                                         size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
4114                                 error = KERN_FAILURE;
4115                                 if(!(VSM_ISCLR(*vsmap_ptr))) {
4116                                         /* unmap the new backing store object */
4117                                         ps_clunmap(vs, offset, size);
4118                                 }
4119                                 /* original vsmap */
4120                                 *vsmap_ptr = original_read_vsmap;
4121                                 VSM_CLR(write_vsmap);
4122                         } else {
4123                                if((offset + size) &
4124                                         ((vm_page_size << vs->vs_clshift)
4125                                         - 1)) {
4126                                         /* There is more to transfer in this
4127                                            cluster
4128                                         */
                                             /* keep the new entry aside,    */
                                             /* put the old entry back and   */
                                             /* unmap the range just copied  */
                                             /* from the old store           */
4129                                         write_vsmap = *vsmap_ptr;
4130                                         *vsmap_ptr = read_vsmap;
4131                                         ps_clunmap(vs, offset, size);
4132                                 } else {
4133                                         /* discard the old backing object */
                                             /* same unmap against the old   */
                                             /* entry, then the new entry    */
                                             /* becomes the live one         */
4134                                         write_vsmap = *vsmap_ptr;
4135                                         *vsmap_ptr = read_vsmap;
4136                                         ps_clunmap(vs, offset, size);
4137                                         *vsmap_ptr = write_vsmap;
4138                                         VSM_CLR(write_vsmap);
4139                                         VSM_CLR(original_read_vsmap);
4140                                 }
4141                         }
4142                 } else {
                             /* NOTE(review): size_wanted is assigned here  */
                             /* but never read in this function.            */
                             /* NOTE(review): when ps_read_file() itself    */
                             /* returned an error, nothing is unmapped or   */
                             /* restored here; the loop then ends and any   */
                             /* non-clear write_vsmap is installed at the   */
                             /* bottom - confirm this is intended.          */
4143                         size_wanted = size;
4144                         if (error == KERN_SUCCESS) {
                                     /* NOTE(review): both branches below   */
                                     /* execute the same statements.        */
4145                                 if (residual == size) {
4146                                         /*
4147                                          * If a read operation returns no error
4148                                          * and no data moved, we turn it into
4149                                          * an error, assuming we're reading at
4150                                          * or beyond EOF.
4151                                          * Fall through and error the entire
4152                                          * range.
4153                                          */
4154                                         error = KERN_FAILURE;
4155                                         *vsmap_ptr = write_vsmap;
4156                                         if(!(VSM_ISCLR(*vsmap_ptr))) {
4157                                         /* unmap the new backing store object */
4158                                         ps_clunmap(vs, offset, size);
4159                                         }
4160                                         *vsmap_ptr = original_read_vsmap;
4161                                         VSM_CLR(write_vsmap);
                                             /* error is set, so this        */
                                             /* continue ends the loop       */
4162                                         continue;
4163                                 } else {
4164                                         /*
4165                                          * Otherwise, we have partial read.
4166                                          * This is also considered an error
4167                                          * for the purposes of cluster transfer
4168                                          */
4169                                         error = KERN_FAILURE;
4170                                         *vsmap_ptr = write_vsmap;
4171                                         if(!(VSM_ISCLR(*vsmap_ptr))) {
4172                                         /* unmap the new backing store object */
4173                                         ps_clunmap(vs, offset, size);
4174                                         }
4175                                         *vsmap_ptr = original_read_vsmap;
4176                                         VSM_CLR(write_vsmap);
                                             /* error is set, so this        */
                                             /* continue ends the loop       */
4177                                         continue;
4178                                 }
4179                         }
4180
4181                 }
4182                 cnt -= size;
4183                 offset += size;
4184
4185         } /* END while (cnt && (error == 0)) */
             /* a partially built new-store entry that was not installed    */
             /* inside the loop becomes the live entry now                  */
4186         if(!VSM_ISCLR(write_vsmap))
4187                 *vsmap_ptr = write_vsmap;
4188
4189         return error;
4190 }
4191
4192 kern_return_t
4193 default_pager_add_file(
4194         MACH_PORT_FACE  backing_store,
4195         vnode_ptr_t     vp,
4196         int             record_size,
4197         vm_size_t       size)
4198 {
4199         backing_store_t         bs;
4200         paging_segment_t        ps;
4201         int                     i;
4202         unsigned int            j;
4203         int                     error;
4204
4205         if ((bs = backing_store_lookup(backing_store))
4206             == BACKING_STORE_NULL)
4207                 return KERN_INVALID_ARGUMENT;
4208
4209         PSL_LOCK();
4210         for (i = 0; i <= paging_segment_max; i++) {
4211                 ps = paging_segments[i];
4212                 if (ps == PAGING_SEGMENT_NULL)
4213                         continue;
4214                 if (ps->ps_segtype != PS_FILE)
4215                         continue;
4216
4217                 /*
4218                  * Check for overlap on same device.
4219                  */
4220                 if (ps->ps_vnode == (struct vnode *)vp) {
4221                         PSL_UNLOCK();
4222                         BS_UNLOCK(bs);
4223                         return KERN_INVALID_ARGUMENT;
4224                 }
4225         }
4226         PSL_UNLOCK();
4227
4228         /*
4229          * Set up the paging segment
4230          */
4231         ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
4232         if (ps == PAGING_SEGMENT_NULL) {
4233                 BS_UNLOCK(bs);
4234                 return KERN_RESOURCE_SHORTAGE;
4235         }
4236
4237         ps->ps_segtype = PS_FILE;
4238         ps->ps_vnode = (struct vnode *)vp;
4239         ps->ps_offset = 0;
4240         ps->ps_record_shift = local_log2(vm_page_size / record_size);
4241         assert((dp_size_t) size == size);
4242         ps->ps_recnum = (dp_size_t) size;
4243         ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
4244
4245         ps->ps_pgcount = ps->ps_pgnum;
4246         ps->ps_clshift = local_log2(bs->bs_clsize);
4247         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
4248         ps->ps_special_clusters = 0;
4249         ps->ps_hint = 0;
4250
4251         PS_LOCK_INIT(ps);
4252         ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
4253         if (!ps->ps_bmap) {
4254                 kfree(ps, sizeof *ps);
4255                 BS_UNLOCK(bs);
4256                 return KERN_RESOURCE_SHORTAGE;
4257         }
4258         for (j = 0; j < ps->ps_ncls; j++) {
4259                 clrbit(ps->ps_bmap, j);
4260         }
4261
4262         if(paging_segment_count == 0) {
4263                 ps->ps_state = PS_EMERGENCY_SEGMENT;
4264                 if(use_emergency_swap_file_first) {
4265                         ps->ps_state |= PS_CAN_USE;
4266                 }
4267                 emergency_segment_backing_store = backing_store;
4268         } else {
4269                 ps->ps_state = PS_CAN_USE;
4270         }
4271
4272         ps->ps_bs = bs;
4273
4274         if ((error = ps_enter(ps)) != 0) {
4275                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
4276                 kfree(ps, sizeof *ps);
4277                 BS_UNLOCK(bs);
4278                 return KERN_RESOURCE_SHORTAGE;
4279         }
4280
4281         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
4282         bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
4283         PSL_LOCK();
4284         if(IS_PS_OK_TO_USE(ps)) {
4285                 dp_pages_free += ps->ps_pgcount;
4286         } else {
4287                 dp_pages_reserve += ps->ps_pgcount;
4288         }
4289         PSL_UNLOCK();
4290
4291         BS_UNLOCK(bs);
4292
4293         bs_more_space(ps->ps_clcount);
4294
4295         /*
4296          * If the paging segment being activated is not the emergency
4297          * segment and we notice that the emergency segment is being
4298          * used then we help recover it. If all goes well, the
4299          * emergency segment will be back to its original state of
4300          * online but not activated (till it's needed the next time).
4301          */
4302 #if CONFIG_FREEZE
4303         if (!vm_freeze_enabled)
4304 #endif
4305         {
4306                 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4307                 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4308                         if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4309                                 dprintf(("Failed to recover emergency paging segment\n"));
4310                         } else {
4311                                 dprintf(("Recovered emergency paging segment\n"));
4312                         }
4313                 }
4314         }
4315
4316         DP_DEBUG(DEBUG_BS_INTERNAL,
4317                  ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
4318                   device, offset, (dp_size_t) size, record_size,
4319                   ps->ps_record_shift, ps->ps_pgnum));
4320
4321         return KERN_SUCCESS;
4322 }
4323
4324
4325
4326 kern_return_t
4327 ps_read_file(
4328         paging_segment_t        ps,
4329         upl_t                   upl,
4330         upl_offset_t            upl_offset,
4331         dp_offset_t             offset,
4332         upl_size_t              size,
4333         unsigned int            *residualp,
4334         int                     flags)
4335 {
4336         vm_object_offset_t      f_offset;
4337         int                     error = 0;
4338         int                     result;
4339
4340         assert(dp_encryption_inited);
4341
4342         clustered_reads[atop_32(size)]++;
4343
4344         f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4345
4346         /*
4347          * for transfer case we need to pass uploffset and flags
4348          */
4349         assert((upl_size_t) size == size);
4350         error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
4351
4352         /* The vnode_pagein semantic is somewhat at odds with the existing   */
4353         /* device_read semantic.  Partial reads are not experienced at this  */
4354         /* level.  It is up to the bit map code and cluster read code to     */
4355         /* check that requested data locations are actually backed, and the  */
4356         /* pagein code to either read all of the requested data or return an */
4357         /* error. */
4358
4359         if (error)
4360                 result = KERN_FAILURE;
4361         else {
4362                 *residualp = 0;
4363                 result = KERN_SUCCESS;
4364         }
4365         return result;
4366 }
4367
4368 kern_return_t
4369 ps_write_file(
4370         paging_segment_t        ps,
4371         upl_t                   upl,
4372         upl_offset_t            upl_offset,
4373         dp_offset_t             offset,
4374         unsigned int            size,
4375         int                     flags)
4376 {
4377         vm_object_offset_t      f_offset;
4378         kern_return_t           result;
4379
4380         assert(dp_encryption_inited);
4381
4382         clustered_writes[atop_32(size)]++;
4383         f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4384
4385         if (flags & UPL_PAGING_ENCRYPTED) {
4386                 /*
4387                  * ENCRYPTED SWAP:
4388                  * encrypt all the pages that we're going
4389                  * to pageout.
4390                  */
4391                 upl_encrypt(upl, upl_offset, size);
4392         }
4393         assert((upl_size_t) size == size);
4394         if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
4395                 result = KERN_FAILURE;
4396         else
4397                 result = KERN_SUCCESS;
4398
4399         return result;
4400 }
4401
/*
 * Reset a pending-trim record to "nothing pending": no vnode, zero
 * offset and zero length.  Compiles to a no-op unless CONFIG_EMBEDDED
 * is set.
 */
static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data)
{
#if !CONFIG_EMBEDDED
#pragma unused(data)
#else
        data->length = 0;
        data->offset = 0;
        data->vp = NULL;
#endif
}
4412
/*
 * Issue the trim described by `data', if any, and then reset the
 * record.  A record whose vnode pointer is NULL is treated as empty
 * and left alone.  Compiles to a no-op unless CONFIG_EMBEDDED is set.
 */
static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data)
{
#if CONFIG_EMBEDDED
        /* Nothing accumulated yet: nothing to hand to vnode_trim(). */
        if (data->vp == NULL)
                return;

        vnode_trim(data->vp, data->offset, data->length);
        ps_vnode_trim_init(data);
#else
#pragma unused(data)
#endif
}
4426
4427 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
4428 {
4429 #if CONFIG_EMBEDDED
4430         struct vnode *vp = VSM_PS(*map)->ps_vnode;
4431         dp_offset_t offset = ptoa_32(VSM_CLOFF(*map)) << shift;
4432
4433         if ((vp != data->vp) || (offset) != (data->offset + data->length)) {
4434                 ps_vnode_trim_now(data);
4435                 data->vp = vp;
4436                 data->offset = offset;
4437                 data->length = 0;
4438         }
4439         data->length += (length);
4440 #else
4441 #pragma unused(data, map, shift, length)
4442 #endif
4443 }
4444
4445 kern_return_t
4446 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
4447         int             hi_wat,
4448         int             lo_wat,
4449         int             flags,
4450         MACH_PORT_FACE  trigger_port)
4451 {
4452         MACH_PORT_FACE release = IPC_PORT_NULL;
4453         kern_return_t kr;
4454         clock_sec_t now;
4455         clock_nsec_t nanoseconds_dummy;
4456         static clock_sec_t error_notify = 0;
4457
4458         PSL_LOCK();
4459         if (flags == SWAP_ENCRYPT_ON) {
4460                 /* ENCRYPTED SWAP: turn encryption on */
4461                 release = trigger_port;
4462                 if (!dp_encryption_inited) {
4463                         dp_encryption_inited = TRUE;
4464                         dp_encryption = TRUE;
4465                         kr = KERN_SUCCESS;
4466                 } else {
4467                         kr = KERN_FAILURE;
4468                 }
4469         } else if (flags == SWAP_ENCRYPT_OFF) {
4470                 /* ENCRYPTED SWAP: turn encryption off */
4471                 release = trigger_port;
4472                 if (!dp_encryption_inited) {
4473                         dp_encryption_inited = TRUE;
4474                         dp_encryption = FALSE;
4475                         kr = KERN_SUCCESS;
4476                 } else {
4477                         kr = KERN_FAILURE;
4478                 }
4479         } else if (flags == HI_WAT_ALERT) {
4480                 release = min_pages_trigger_port;
4481 #if CONFIG_FREEZE
4482                 /* High and low water signals aren't applicable when freeze is */
4483                 /* enabled, so release the trigger ports here and return       */
4484                 /* KERN_FAILURE.                                               */
4485                 if (vm_freeze_enabled) {
4486                         if (IP_VALID( trigger_port )){
4487                                 ipc_port_release_send( trigger_port );
4488                         }
4489                         min_pages_trigger_port = IPC_PORT_NULL;
4490                         kr = KERN_FAILURE;
4491                 }
4492                 else
4493 #endif
4494                 {
4495                         min_pages_trigger_port = trigger_port;
4496                         minimum_pages_remaining = hi_wat/vm_page_size;
4497                         bs_low = FALSE;
4498                         kr = KERN_SUCCESS;
4499                 }
4500         } else if (flags ==  LO_WAT_ALERT) {
4501                 release = max_pages_trigger_port;
4502 #if CONFIG_FREEZE
4503                 if (vm_freeze_enabled) {
4504                         if (IP_VALID( trigger_port )){
4505                                 ipc_port_release_send( trigger_port );
4506                         }
4507                         max_pages_trigger_port = IPC_PORT_NULL;
4508                         kr = KERN_FAILURE;
4509                 }
4510                 else
4511 #endif
4512                 {
4513                         max_pages_trigger_port = trigger_port;
4514                         maximum_pages_free = lo_wat/vm_page_size;
4515                         kr = KERN_SUCCESS;
4516                 }
4517         } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4518                 use_emergency_swap_file_first = TRUE;
4519                 release = trigger_port;
4520                 kr = KERN_SUCCESS;
4521         } else if (flags == SWAP_FILE_CREATION_ERROR) {
4522                 release = trigger_port;
4523                 kr = KERN_SUCCESS;
4524                 if( paging_segment_count == 1) {
4525                         use_emergency_swap_file_first = TRUE;
4526                 }
4527                 no_paging_space_action();
4528                 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4529                 if (now > error_notify + 5) {
4530                         dprintf(("Swap File Error.\n"));
4531                         error_notify = now;
4532                 }
4533         } else {
4534                 release = trigger_port;
4535                 kr =  KERN_INVALID_ARGUMENT;
4536         }
4537         PSL_UNLOCK();
4538
4539         if (IP_VALID(release))
4540                 ipc_port_release_send(release);
4541
4542         return kr;
4543 }
4544
4545 /*
4546  * Monitor the amount of available backing store vs. the amount of
4547  * required backing store, notify a listener (if present) when
4548  * backing store may safely be removed.
4549  *
4550  * We attempt to avoid the situation where backing store is
4551  * discarded en masse, as this can lead to thrashing as the
4552  * backing store is compacted.
4553  */
4554
4555 #define PF_INTERVAL     3       /* time between free level checks */
4556 #define PF_LATENCY      10      /* number of intervals before release */
4557
4558 static int dp_pages_free_low_count = 0;
4559 thread_call_t default_pager_backing_store_monitor_callout;
4560
4561 void
4562 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4563                                                                         __unused thread_call_param_t p2)
4564 {
4565 //      unsigned long long      average;
4566         ipc_port_t              trigger;
4567         uint64_t                deadline;
4568
4569         /*
4570          * We determine whether it will be safe to release some
4571          * backing store by watching the free page level.  If
4572          * it remains below the maximum_pages_free threshold for
4573          * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
4574          * then we deem it safe.
4575          *
4576          * Note that this establishes a maximum rate at which backing
4577          * store will be released, as each notification (currently)
4578          * only results in a single backing store object being
4579          * released.
4580          */
4581         if (dp_pages_free > maximum_pages_free) {
4582                 dp_pages_free_low_count++;
4583         } else {
4584                 dp_pages_free_low_count = 0;
4585         }
4586
4587         /* decide whether to send notification */
4588         trigger = IP_NULL;
4589         if (max_pages_trigger_port &&
4590             (backing_store_release_trigger_disable == 0) &&
4591             (dp_pages_free_low_count > PF_LATENCY)) {
4592                 trigger = max_pages_trigger_port;
4593                 max_pages_trigger_port = NULL;
4594         }
4595
4596         /* send notification */
4597         if (trigger != IP_NULL) {
4598                 VSL_LOCK();
4599                 if(backing_store_release_trigger_disable != 0) {
4600                         assert_wait((event_t)
4601                                     &backing_store_release_trigger_disable,
4602                                     THREAD_UNINT);
4603                         VSL_UNLOCK();
4604                         thread_block(THREAD_CONTINUE_NULL);
4605                 } else {
4606                         VSL_UNLOCK();
4607                 }
4608                 dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));
4609
4610                 default_pager_space_alert(trigger, LO_WAT_ALERT);
4611                 ipc_port_release_send(trigger);
4612                 dp_pages_free_low_count = 0;
4613         }
4614
4615         clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4616         thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4617 }
4618
#if CONFIG_FREEZE
/*
 * default_pager_swap_pages_free:
 *
 * Return the current value of the dp_pages_free counter.  The value is
 * read without taking any lock.  Only built when CONFIG_FREEZE is set.
 *
 * The parameter list is spelled (void) so that the definition is a
 * real prototype; an empty () in C leaves the arguments unspecified
 * and disables argument checking at call sites.
 */
unsigned int
default_pager_swap_pages_free(void)
{
        return dp_pages_free;
}
#endif