osfmk/default_pager/dp_backing_store.c

   1 /*
   2  * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56
  57 /*
  58  *      Default Pager.
  59  *              Paging File Management.
  60  */
  61
  62 #include <mach/host_priv.h>
  63 #include <mach/memory_object_control.h>
  64 #include <mach/memory_object_server.h>
  65 #include <mach/upl.h>
  66 #include <default_pager/default_pager_internal.h>
  67 #include <default_pager/default_pager_alerts.h>
  68 #include <default_pager/default_pager_object_server.h>
  69
  70 #include <ipc/ipc_types.h>
  71 #include <ipc/ipc_port.h>
  72 #include <ipc/ipc_space.h>
  73
  74 #include <kern/kern_types.h>
  75 #include <kern/host.h>
  76 #include <kern/queue.h>
  77 #include <kern/counters.h>
  78 #include <kern/sched_prim.h>
  79
  80 #include <vm/vm_kern.h>
  81 #include <vm/vm_pageout.h>
  82 #include <vm/vm_map.h>
  83 #include <vm/vm_object.h>
  84 #include <vm/vm_protos.h>
  85
  86
  87 /* todo - need large internal object support */
  88
  89 /*
  90  * ALLOC_STRIDE... the maximum number of bytes allocated from
  91  * a swap file before moving on to the next swap file... if
  92  * all swap files reside on a single disk, this value should
  93  * be very large (this is the default assumption)... if the
  94  * swap files are spread across multiple disks, then this value
  95  * should be small (128 * 1024)...
  96  *
  97  * This should be determined dynamically in the future
  98  */
  99
 100 #define ALLOC_STRIDE  (1024 * 1024 * 1024)
 101 int physical_transfer_cluster_count = 0;
 102
     /*
      * VM_SUPER_CLUSTER is 0x40000 bytes (256 KB); VM_SUPER_PAGES is the same
      * span expressed in pages.  ps_delete() uses VM_SUPER_CLUSTER as the size
      * of both its transfer object and the UPL it requests on that object.
      */
 103 #define VM_SUPER_CLUSTER        0x40000
 104 #define VM_SUPER_PAGES          (VM_SUPER_CLUSTER / PAGE_SIZE)
 105
 106 /*
 107  * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 108  * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 109  */
 110 #define VSTRUCT_MIN_CLSHIFT     0
 111
 112 #define VSTRUCT_DEF_CLSHIFT     2
     /*
      * Cluster size in pages.  Stays 0 until bs_get_global_clsize() fixes it
      * to (1 << vstruct_def_clshift); after that it is never changed there.
      */
 113 int default_pager_clsize = 0;
 114
     /*
      * log2 of the default cluster size in pages.  May be overridden by
      * bs_set_default_clsize() / bs_get_global_clsize() while
      * default_pager_clsize is still 0.
      */
 115 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
 116
 117 /* statistics (indices 0..VM_SUPER_PAGES; zeroed by bs_initialize()) */
 118 unsigned int clustered_writes[VM_SUPER_PAGES+1];
 119 unsigned int clustered_reads[VM_SUPER_PAGES+1];
 120
 121 /*
 122  * Globals used for asynchronous paging operations:
 123  *      vs_async_list:  head of list of to-be-completed I/O ops
 124  *      async_num_queued: number of pages completed, but not yet
 125  *              processed by async thread.
 126  *      async_requests_out: number of pages of requests not completed.
 127  */
 128
     /* The three globals described above are currently compiled out. */
 129 #if 0
 130 struct vs_async *vs_async_list;
 131 int     async_num_queued;
 132 int     async_requests_out;
 133 #endif
 134
 135
     /*
      * With VS_ASYNC_REUSE set, bs_initialize() starts vs_async_free_list
      * out as NULL.
      */
 136 #define VS_ASYNC_REUSE 1
 137 struct vs_async *vs_async_free_list;
 138
 139 lck_mtx_t       default_pager_async_lock;       /* Protects globals above */
 140
 141
 142 int vs_alloc_async_failed = 0;                  /* statistics */
 143 int vs_alloc_async_count = 0;                   /* statistics */
 144 struct vs_async *vs_alloc_async(void);          /* forward */
 145 void vs_free_async(struct vs_async *vsa);       /* forward */
 146
 147
     /* Thin wrappers around the allocator/free routines prototyped above. */
 148 #define VS_ALLOC_ASYNC()        vs_alloc_async()
 149 #define VS_FREE_ASYNC(vsa)      vs_free_async(vsa)
 150
     /*
      * Lock, unlock, init, destroy and address-of helpers for
      * default_pager_async_lock.  Init and destroy use the lock group
      * default_pager_lck_grp; init also uses default_pager_lck_attr.
      */
 151 #define VS_ASYNC_LOCK()         lck_mtx_lock(&default_pager_async_lock)
 152 #define VS_ASYNC_UNLOCK()       lck_mtx_unlock(&default_pager_async_lock)
 153 #define VS_ASYNC_LOCK_INIT()    lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
 154 #define VS_ASYNC_LOCK_DESTROY() lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp)
 155 #define VS_ASYNC_LOCK_ADDR()    (&default_pager_async_lock)
 156 /*
 157  *  Paging Space Hysteresis triggers and the target notification port
 158  *
 159  */
 160 unsigned int    dp_pages_free_drift_count = 0;
 161 unsigned int    dp_pages_free_drifted_max = 0;
 162 unsigned int    minimum_pages_remaining = 0;
 163 unsigned int    maximum_pages_free = 0;
 164 ipc_port_t      min_pages_trigger_port = NULL;
 165 ipc_port_t      max_pages_trigger_port = NULL;
 166
     /* Build-time default: TRUE only when CONFIG_FREEZE is configured. */
 167 #if CONFIG_FREEZE
 168 boolean_t       use_emergency_swap_file_first = TRUE;
 169 #else
 170 boolean_t       use_emergency_swap_file_first = FALSE;
 171 #endif
     /* While bs_low is set, ps_delete() leaves backing_store_abort_compaction alone. */
 172 boolean_t       bs_low = FALSE;
     /* ps_delete() sleeps (under the vstruct list lock) while this is nonzero. */
 173 int             backing_store_release_trigger_disable = 0;
 174 boolean_t       backing_store_stop_compaction = FALSE;
 175 boolean_t       backing_store_abort_compaction = FALSE;
 176
 177 /* Have we decided if swap needs to be encrypted yet ? */
 178 boolean_t       dp_encryption_inited = FALSE;
 179 /* Should we encrypt swap ?  (ps_delete() adds UPL_ENCRYPT when set) */
 180 boolean_t       dp_encryption = FALSE;
 181
 182 boolean_t       dp_isssd = FALSE;
 183
 184 /*
 185  * Object sizes are rounded up to the next power of 2,
 186  * unless they are bigger than a given maximum size.
 187  */
 188 vm_size_t       max_doubled_size = 4 * 1024 * 1024;     /* 4 meg */
 189
 190 /*
 191  * List of all backing store and segments.
 192  *
 193  * Readers in this file (bs_global_info(), default_pager_backing_store_info())
 194  * scan paging_segments[0..paging_segment_max] inclusive under PSL_LOCK and
 195  * skip PAGING_SEGMENT_NULL slots.
 196  */
 193 MACH_PORT_FACE          emergency_segment_backing_store;
 194 struct backing_store_list_head backing_store_list;
 195 paging_segment_t        paging_segments[MAX_NUM_PAGING_SEGMENTS];
 196 lck_mtx_t                       paging_segments_lock;
 197 int                     paging_segment_max = 0;
 198 int                     paging_segment_count = 0;
     /*
      * NOTE(review): the initializer supplies exactly five -1 entries, so it
      * presumably relies on BS_MAXPRI being 4; with a larger BS_MAXPRI the
      * extra elements would be 0, not -1 -- confirm against the header.
      */
 199 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
 200
 201
 202 /*
 203  * Total pages free in system
 204  * This differs from clusters committed/avail which is a measure of the
 205  * over commitment of paging segments to backing store.  An idea which is
 206  * likely to be deprecated.
 207  */
 208 unsigned  int   dp_pages_free = 0;
 209 unsigned  int   dp_pages_reserve = 0;
     /* ps_delete() gives up with KERN_FAILURE once dp_pages_free drops below this. */
 210 unsigned  int   cluster_transfer_minimum = 100;
 211
 212 /*
 213  * Trim state
 214  *
 215  * Carried through the ps_vnode_trim_init() / ps_vnode_trim_now() /
 216  * ps_vnode_trim_more() helpers prototyped below.  Presumably describes a
 217  * pending range (offset, length) on vnode vp -- confirm against those helpers.
 218  */
 215 struct ps_vnode_trim_data {
 216         struct vnode *vp;
 217         dp_offset_t   offset;
 218         dp_size_t     length;
 219 };
 220
 221 /* forward declarations */
 222 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int);     /* forward */
 223 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int);     /* forward */
 224 default_pager_thread_t *get_read_buffer( void );
 225 kern_return_t ps_vstruct_transfer_from_segment(
 226         vstruct_t        vs,
 227         paging_segment_t segment,
 228         upl_t            upl);
 229 kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);  /* forward */
 230 kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *);     /* forward */
 231 kern_return_t vs_cluster_transfer(
 232         vstruct_t       vs,
 233         dp_offset_t     offset,
 234         dp_size_t       cnt,
 235         upl_t           upl);
 236 vs_map_t vs_get_map_entry(
 237         vstruct_t       vs,
 238         dp_offset_t     offset);
 239
 240 kern_return_t
 241 default_pager_backing_store_delete_internal( MACH_PORT_FACE );
 242
 243 static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
 244 static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
 245 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);
 246
 247 default_pager_thread_t *
 248 get_read_buffer( void )
 249 {
 250         int     i;
 251
 252         DPT_LOCK(dpt_lock);
 253         while(TRUE) {
 254                 for (i=0; i<default_pager_internal_count; i++) {
 255                         if(dpt_array[i]->checked_out == FALSE) {
 256                           dpt_array[i]->checked_out = TRUE;
 257                           DPT_UNLOCK(dpt_lock);
 258                           return  dpt_array[i];
 259                         }
 260                 }
 261                 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
 262         }
 263 }
 264
 265 void
 266 bs_initialize(void)
 267 {
 268         int i;
 269
 270         /*
 271          * List of all backing store.
 272          */
 273         BSL_LOCK_INIT();
 274         queue_init(&backing_store_list.bsl_queue);
 275         PSL_LOCK_INIT();
 276
 277         VS_ASYNC_LOCK_INIT();
 278 #if     VS_ASYNC_REUSE
 279         vs_async_free_list = NULL;
 280 #endif  /* VS_ASYNC_REUSE */
 281
 282         for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
 283                 clustered_writes[i] = 0;
 284                 clustered_reads[i] = 0;
 285         }
 286
 287 }
 288
 289 /*
 290  * When things do not quite workout...
 291  */
 292 void bs_no_paging_space(boolean_t);     /* forward */
 293
 294 void
 295 bs_no_paging_space(
 296         boolean_t out_of_memory)
 297 {
 298
 299         if (out_of_memory)
 300                 dprintf(("*** OUT OF MEMORY ***\n"));
 301         panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
 302 }
 303
 304 void bs_more_space(int);        /* forward */
 305 void bs_commit(int);            /* forward */
 306
     /*
      * Cluster over-commit accounting.  bs_more_space() grows
      * clusters_available, bs_commit() grows clusters_committed; both do so
      * under BSL_LOCK.  user_warned is set by bs_commit() once the over-commit
      * warning has been printed and cleared again when the "OK now" message is
      * printed.  clusters_committed_peak tracks the highest committed count
      * seen while over-committed.
      */
 307 boolean_t       user_warned = FALSE;
 308 unsigned int    clusters_committed = 0;
 309 unsigned int    clusters_available = 0;
 310 unsigned int    clusters_committed_peak = 0;
 310 unsigned int    clusters_committed_peak = 0;
 311
 312 void
 313 bs_more_space(
 314         int     nclusters)
 315 {
 316         BSL_LOCK();
 317         /*
 318          * Account for new paging space.
 319          */
 320         clusters_available += nclusters;
 321
 322         if (clusters_available >= clusters_committed) {
 323                 if (verbose && user_warned) {
 324                         printf("%s%s - %d excess clusters now.\n",
 325                                my_name,
 326                                "paging space is OK now",
 327                                clusters_available - clusters_committed);
 328                         user_warned = FALSE;
 329                         clusters_committed_peak = 0;
 330                 }
 331         } else {
 332                 if (verbose && user_warned) {
 333                         printf("%s%s - still short of %d clusters.\n",
 334                                my_name,
 335                                "WARNING: paging space over-committed",
 336                                clusters_committed - clusters_available);
 337                         clusters_committed_peak -= nclusters;
 338                 }
 339         }
 340         BSL_UNLOCK();
 341
 342         return;
 343 }
 344
 345 void
 346 bs_commit(
 347         int     nclusters)
 348 {
 349         BSL_LOCK();
 350         clusters_committed += nclusters;
 351         if (clusters_committed > clusters_available) {
 352                 if (verbose && !user_warned) {
 353                         user_warned = TRUE;
 354                         printf("%s%s - short of %d clusters.\n",
 355                                my_name,
 356                                "WARNING: paging space over-committed",
 357                                clusters_committed - clusters_available);
 358                 }
 359                 if (clusters_committed > clusters_committed_peak) {
 360                         clusters_committed_peak = clusters_committed;
 361                 }
 362         } else {
 363                 if (verbose && user_warned) {
 364                         printf("%s%s - was short of up to %d clusters.\n",
 365                                my_name,
 366                                "paging space is OK now",
 367                                clusters_committed_peak - clusters_available);
 368                         user_warned = FALSE;
 369                         clusters_committed_peak = 0;
 370                 }
 371         }
 372         BSL_UNLOCK();
 373
 374         return;
 375 }
 376
 377 int default_pager_info_verbose = 1;
 378
 379 void
 380 bs_global_info(
 381         uint64_t        *totalp,
 382         uint64_t        *freep)
 383 {
 384         uint64_t                pages_total, pages_free;
 385         paging_segment_t        ps;
 386         int                     i;
 387
 388         PSL_LOCK();
 389         pages_total = pages_free = 0;
 390         for (i = 0; i <= paging_segment_max; i++) {
 391                 ps = paging_segments[i];
 392                 if (ps == PAGING_SEGMENT_NULL)
 393                         continue;
 394
 395                 /*
 396                  * no need to lock: by the time this data
 397                  * gets back to any remote requestor it
 398                  * will be obsolete anyways
 399                  */
 400                 pages_total += ps->ps_pgnum;
 401                 pages_free += ps->ps_clcount << ps->ps_clshift;
 402                 DP_DEBUG(DEBUG_BS_INTERNAL,
 403                          ("segment #%d: %d total, %d free\n",
 404                           i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
 405         }
 406         *totalp = pages_total;
 407         *freep = pages_free;
 408         if (verbose && user_warned && default_pager_info_verbose) {
 409                 if (clusters_available < clusters_committed) {
 410                         printf("%s %d clusters committed, %d available.\n",
 411                                my_name,
 412                                clusters_committed,
 413                                clusters_available);
 414                 }
 415         }
 416         PSL_UNLOCK();
 417 }
 418
 419 backing_store_t backing_store_alloc(void);      /* forward */
 420
 421 backing_store_t
 422 backing_store_alloc(void)
 423 {
 424         backing_store_t bs;
 425
 426         bs = (backing_store_t) kalloc(sizeof (struct backing_store));
 427         if (bs == BACKING_STORE_NULL)
 428                 panic("backing_store_alloc: no memory");
 429
 430         BS_LOCK_INIT(bs);
 431         bs->bs_port = MACH_PORT_NULL;
 432         bs->bs_priority = 0;
 433         bs->bs_clsize = 0;
 434         bs->bs_pages_total = 0;
 435         bs->bs_pages_in = 0;
 436         bs->bs_pages_in_fail = 0;
 437         bs->bs_pages_out = 0;
 438         bs->bs_pages_out_fail = 0;
 439
 440         return bs;
 441 }
 442
 443 backing_store_t backing_store_lookup(MACH_PORT_FACE);   /* forward */
 444
 445 /* Even in both the component space and external versions of this pager, */
 446 /* backing_store_lookup will be called from tasks in the application space */
 447 backing_store_t
 448 backing_store_lookup(
 449         MACH_PORT_FACE port)
 450 {
 451         backing_store_t bs;
 452
 453 /*
 454         port is currently backed with a vs structure in the alias field
 455         we could create an ISBS alias and a port_is_bs call but frankly
 456         I see no reason for the test, the bs->port == port check below
 457         will work properly on junk entries.
 458
 459         if ((port == MACH_PORT_NULL) || port_is_vs(port))
 460 */
 461         if (port == MACH_PORT_NULL)
 462                 return BACKING_STORE_NULL;
 463
 464         BSL_LOCK();
 465         queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
 466                       bs_links) {
 467                 BS_LOCK(bs);
 468                 if (bs->bs_port == port) {
 469                         BSL_UNLOCK();
 470                         /* Success, return it locked. */
 471                         return bs;
 472                 }
 473                 BS_UNLOCK(bs);
 474         }
 475         BSL_UNLOCK();
 476         return BACKING_STORE_NULL;
 477 }
 478
 479 void backing_store_add(backing_store_t);        /* forward */
 480
 481 void
 482 backing_store_add(
 483         __unused backing_store_t bs)
 484 {
 485 //      MACH_PORT_FACE          port = bs->bs_port;
 486 //      MACH_PORT_FACE          pset = default_pager_default_set;
 487         kern_return_t           kr = KERN_SUCCESS;
 488
 489         if (kr != KERN_SUCCESS)
 490                 panic("backing_store_add: add to set");
 491
 492 }
 493
 494 /*
 495  * Set up default page shift, but only if not already
 496  * set and argument is within range.
 497  */
 498 boolean_t
 499 bs_set_default_clsize(unsigned int npages)
 500 {
 501         switch(npages){
 502             case 1:
 503             case 2:
 504             case 4:
 505             case 8:
 506                 if (default_pager_clsize == 0)  /* if not yet set */
 507                         vstruct_def_clshift = local_log2(npages);
 508                 return(TRUE);
 509         }
 510         return(FALSE);
 511 }
 512
 513 int bs_get_global_clsize(int clsize);   /* forward */
 514
 515 int
 516 bs_get_global_clsize(
 517         int     clsize)
 518 {
 519         int                     i;
 520         memory_object_default_t dmm;
 521         kern_return_t           kr;
 522
 523         /*
 524          * Only allow setting of cluster size once. If called
 525          * with no cluster size (default), we use the compiled-in default
 526          * for the duration. The same cluster size is used for all
 527          * paging segments.
 528          */
 529         if (default_pager_clsize == 0) {
 530                 /*
 531                  * Keep cluster size in bit shift because it's quicker
 532                  * arithmetic, and easier to keep at a power of 2.
 533                  */
 534                 if (clsize != NO_CLSIZE) {
 535                         for (i = 0; (1 << i) < clsize; i++);
 536                         if (i > MAX_CLUSTER_SHIFT)
 537                                 i = MAX_CLUSTER_SHIFT;
 538                         vstruct_def_clshift = i;
 539                 }
 540                 default_pager_clsize = (1 << vstruct_def_clshift);
 541
 542                 /*
 543                  * Let the user know the new (and definitive) cluster size.
 544                  */
 545                 if (verbose)
 546                         printf("%scluster size = %d page%s\n",
 547                                 my_name, default_pager_clsize,
 548                                 (default_pager_clsize == 1) ? "" : "s");
 549
 550                 /*
 551                  * Let the kernel know too, in case it hasn't used the
 552                  * default value provided in main() yet.
 553                  */
 554                 dmm = default_pager_object;
 555                 clsize = default_pager_clsize * vm_page_size;   /* in bytes */
 556                 kr = host_default_memory_manager(host_priv_self(),
 557                                                  &dmm,
 558                                                  clsize);
 559                 memory_object_default_deallocate(dmm);
 560
 561                 if (kr != KERN_SUCCESS) {
 562                    panic("bs_get_global_cl_size:host_default_memory_manager");
 563                 }
 564                 if (dmm != default_pager_object) {
 565                   panic("bs_get_global_cl_size:there is another default pager");
 566                 }
 567         }
 568         ASSERT(default_pager_clsize > 0 &&
 569                (default_pager_clsize & (default_pager_clsize - 1)) == 0);
 570
 571         return default_pager_clsize;
 572 }
 573
 574 kern_return_t
 575 default_pager_backing_store_create(
 576         memory_object_default_t pager,
 577         int                     priority,
 578         int                     clsize,         /* in bytes */
 579         MACH_PORT_FACE          *backing_store)
 580 {
 581         backing_store_t bs;
 582         MACH_PORT_FACE  port;
 583 //      kern_return_t   kr;
 584         struct vstruct_alias *alias_struct;
 585
 586         if (pager != default_pager_object)
 587                 return KERN_INVALID_ARGUMENT;
 588
 589         bs = backing_store_alloc();
 590         port = ipc_port_alloc_kernel();
 591         ipc_port_make_send(port);
 592         assert (port != IP_NULL);
 593
 594         DP_DEBUG(DEBUG_BS_EXTERNAL,
 595                  ("priority=%d clsize=%d bs_port=0x%x\n",
 596                   priority, clsize, (int) backing_store));
 597
 598         alias_struct = (struct vstruct_alias *)
 599                                 kalloc(sizeof (struct vstruct_alias));
 600         if(alias_struct != NULL) {
 601                 alias_struct->vs = (struct vstruct *)bs;
 602                 alias_struct->name = &default_pager_ops;
 603                 port->alias = (uintptr_t) alias_struct;
 604         }
 605         else {
 606                 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
 607
 608                 BS_LOCK_DESTROY(bs);
 609                 kfree(bs, sizeof (struct backing_store));
 610
 611                 return KERN_RESOURCE_SHORTAGE;
 612         }
 613
 614         bs->bs_port = port;
 615         if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
 616                 priority = BS_MAXPRI;
 617         else if (priority == BS_NOPRI)
 618                 priority = BS_MAXPRI;
 619         else
 620                 priority = BS_MINPRI;
 621         bs->bs_priority = priority;
 622
 623         bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
 624
 625         BSL_LOCK();
 626         queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
 627                     bs_links);
 628         BSL_UNLOCK();
 629
 630         backing_store_add(bs);
 631
 632         *backing_store = port;
 633         return KERN_SUCCESS;
 634 }
 635
 636 kern_return_t
 637 default_pager_backing_store_info(
 638         MACH_PORT_FACE          backing_store,
 639         backing_store_flavor_t  flavour,
 640         backing_store_info_t    info,
 641         mach_msg_type_number_t  *size)
 642 {
 643         backing_store_t                 bs;
 644         backing_store_basic_info_t      basic;
 645         int                             i;
 646         paging_segment_t                ps;
 647
 648         if (flavour != BACKING_STORE_BASIC_INFO ||
 649             *size < BACKING_STORE_BASIC_INFO_COUNT)
 650                 return KERN_INVALID_ARGUMENT;
 651
 652         basic = (backing_store_basic_info_t)info;
 653         *size = BACKING_STORE_BASIC_INFO_COUNT;
 654
 655         VSTATS_LOCK(&global_stats.gs_lock);
 656         basic->pageout_calls    = global_stats.gs_pageout_calls;
 657         basic->pagein_calls     = global_stats.gs_pagein_calls;
 658         basic->pages_in         = global_stats.gs_pages_in;
 659         basic->pages_out        = global_stats.gs_pages_out;
 660         basic->pages_unavail    = global_stats.gs_pages_unavail;
 661         basic->pages_init       = global_stats.gs_pages_init;
 662         basic->pages_init_writes= global_stats.gs_pages_init_writes;
 663         VSTATS_UNLOCK(&global_stats.gs_lock);
 664
 665         if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
 666                 return KERN_INVALID_ARGUMENT;
 667
 668         basic->bs_pages_total   = bs->bs_pages_total;
 669         PSL_LOCK();
 670         bs->bs_pages_free = 0;
 671         for (i = 0; i <= paging_segment_max; i++) {
 672                 ps = paging_segments[i];
 673                 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
 674                         PS_LOCK(ps);
 675                         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
 676                         PS_UNLOCK(ps);
 677                 }
 678         }
 679         PSL_UNLOCK();
 680         basic->bs_pages_free    = bs->bs_pages_free;
 681         basic->bs_pages_in      = bs->bs_pages_in;
 682         basic->bs_pages_in_fail = bs->bs_pages_in_fail;
 683         basic->bs_pages_out     = bs->bs_pages_out;
 684         basic->bs_pages_out_fail= bs->bs_pages_out_fail;
 685
 686         basic->bs_priority      = bs->bs_priority;
 687         basic->bs_clsize        = ptoa_32(bs->bs_clsize);       /* in bytes */
 688
 689         BS_UNLOCK(bs);
 690
 691         return KERN_SUCCESS;
 692 }
 693
 694 int ps_delete(paging_segment_t);        /* forward */
 695 boolean_t current_thread_aborted(void);
 696
     /*
      * ps_delete:
      *
      * Walk the global vstruct list and, for each vstruct visited, call
      * ps_vstruct_transfer_from_segment(vs, ps, upl) with a UPL built over a
      * freshly allocated VM_SUPER_CLUSTER-sized transfer object.
      *
      * Returns KERN_SUCCESS when the list is empty or the walk completes.
      * Returns KERN_FAILURE when dp_pages_free is below
      * cluster_transfer_minimum, when the UPL request or the transfer reports
      * an error, or when current_thread_aborted() is true after a transfer.
      * (The return type is int, but the values are kern_return_t codes.)
      *
      * At most vsl_count entries (sampled once, up front) are visited.  Each
      * visited vstruct is pinned by bumping vs_async_pending under its lock
      * before the locks are dropped, and unpinned afterwards; when the count
      * returns to 0 and vs_waiting_async is set, waiters on
      * &vs->vs_async_pending are woken.
      */
 697 int
 698 ps_delete(
 699         paging_segment_t ps)
 700 {
 701         vstruct_t       vs;
 702         kern_return_t   error = KERN_SUCCESS;
 703         int             vs_count;
 704
 705         VSL_LOCK();             /* get the lock on the list of vs's      */
 706
 707         /* The lock relationship and sequence is fairly complicated      */
 708         /* this code looks at a live list, locking and unlocking the list */
 709         /* as it traverses it.  It depends on the locking behavior of    */
 710         /* default_pager_no_senders.  no_senders always locks the vstruct */
 711         /* targeted for removal before locking the vstruct list.  However */
 712         /* it will remove that member of the list without locking its    */
 713         /* neighbors.  We can be sure when we hold a lock on a vstruct   */
 714         /* it cannot be removed from the list but we must hold the list  */
 715         /* lock to be sure that its pointers to its neighbors are valid. */
 716         /* Also, we can hold off destruction of a vstruct when the list  */
 717         /* lock and the vs locks are not being held by bumping the       */
 718         /* vs_async_pending count.      */
 719
 720
             /* wait, with the list lock as the sleep lock, until the trigger is re-enabled */
 721         while(backing_store_release_trigger_disable != 0) {
 722                 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
 723         }
 724
 725         /* we will choose instead to hold a send right */
 726         vs_count = vstruct_list.vsl_count;
 727         vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
             /* first element is the list head itself: the list is empty */
 728         if(vs == (vstruct_t)&vstruct_list)  {
 729                 VSL_UNLOCK();
 730                 return KERN_SUCCESS;
 731         }
 732         VS_LOCK(vs);
 733         vs_async_wait(vs);  /* wait for any pending async writes */
             /*
              * NOTE(review): vs has already been dereferenced by VS_LOCK()
              * above, so the vs != NULL half of this test is always true here.
              */
 734         if ((vs_count != 0) && (vs != NULL))
 735                 vs->vs_async_pending += 1;  /* hold parties calling  */
 736                                             /* vs_async_wait */
 737
             /* reset the abort flag unless bs_low is set */
 738         if (bs_low == FALSE)
 739                 backing_store_abort_compaction = FALSE;
 740
 741         VS_UNLOCK(vs);
 742         VSL_UNLOCK();
             /* from here on no lock is held; vs is kept alive only by its pin */
 743         while((vs_count != 0) && (vs != NULL)) {
 744                 /* We take the count of AMO's before beginning the         */
 745                 /* transfer of the target segment.                         */
 746                 /* We are guaranteed that the target segment cannot get    */
 747                 /* more users.  We also know that queue entries are        */
 748                 /* made at the back of the list.  If some of the entries   */
 749                 /* we would check disappear while we are traversing the    */
 750                 /* list then we will either check new entries which        */
 751                 /* do not have any backing store in the target segment     */
 752                 /* or re-check old entries.  This might not be optimal     */
 753                 /* but it will always be correct. The alternative is to    */
 754                 /* take a snapshot of the list.                            */
 755                 vstruct_t       next_vs;
 756
                     /* refuse to start a transfer when free paging space is too low */
 757                 if(dp_pages_free < cluster_transfer_minimum)
 758                         error = KERN_FAILURE;
 759                 else {
 760                         vm_object_t     transfer_object;
 761                         unsigned int    count;
 762                         upl_t           upl;
 763                         int             upl_flags;
 764
 765                         transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
 766                         count = 0;
 767                         upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
 768                                      UPL_SET_LITE | UPL_SET_INTERNAL);
 769                         if (dp_encryption) {
 770                                 /* mark the pages as "encrypted" when they come in */
 771                                 upl_flags |= UPL_ENCRYPT;
 772                         }
 773                         error = vm_object_upl_request(transfer_object,
 774                                 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
 775                                 &upl, NULL, &count, upl_flags);
 776
 777                         if(error == KERN_SUCCESS) {
 778                                 error = ps_vstruct_transfer_from_segment(
 779                                                         vs, ps, upl);
                                     /* the UPL is committed and released whatever the transfer returned */
 780                                 upl_commit(upl, NULL, 0);
 781                                 upl_deallocate(upl);
 782                         } else {
                                     /* any UPL request failure is reported as plain KERN_FAILURE */
 783                                 error = KERN_FAILURE;
 784                         }
 785                         vm_object_deallocate(transfer_object);
 786                 }
                     /* failure or abort: drop our pin on vs, wake waiters if needed, bail out */
 787                 if(error || current_thread_aborted()) {
 788                         VS_LOCK(vs);
 789                         vs->vs_async_pending -= 1;  /* release vs_async_wait */
 790                         if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
 791                                 vs->vs_waiting_async = FALSE;
 792                                 VS_UNLOCK(vs);
 793                                 thread_wakeup(&vs->vs_async_pending);
 794                         } else {
 795                                 VS_UNLOCK(vs);
 796                         }
 797                         return KERN_FAILURE;
 798                 }
 799
 800                 VSL_LOCK();
 801
 802                 while(backing_store_release_trigger_disable != 0) {
 803                         VSL_SLEEP(&backing_store_release_trigger_disable,
 804                                   THREAD_UNINT);
 805                 }
 806
                     /*
                      * Pin the successor before unpinning vs.  No pin is taken
                      * when the successor is the list head, is vs itself, or
                      * when vs is the last entry this walk will visit
                      * (vs_count == 1): in all three cases the loop ends after
                      * this iteration.
                      */
 807                 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
 808                 if((next_vs != (vstruct_t)&vstruct_list) &&
 809                                 (vs != next_vs) && (vs_count != 1)) {
 810                         VS_LOCK(next_vs);
 811                         vs_async_wait(next_vs);  /* wait for any  */
 812                                                  /* pending async writes */
 813                         next_vs->vs_async_pending += 1; /* hold parties  */
 814                                                 /* calling vs_async_wait */
 815                         VS_UNLOCK(next_vs);
 816                 }
 817                 VSL_UNLOCK();
                     /* now drop the pin on the vstruct just processed */
 818                 VS_LOCK(vs);
 819                 vs->vs_async_pending -= 1;
 820                 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
 821                         vs->vs_waiting_async = FALSE;
 822                         VS_UNLOCK(vs);
 823                         thread_wakeup(&vs->vs_async_pending);
 824                 } else {
 825                         VS_UNLOCK(vs);
 826                 }
                     /* end of list (or a self-linked entry) terminates the walk */
 827                 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
 828                         vs = NULL;
 829                 else
 830                         vs = next_vs;
 831                 vs_count--;
 832         }
 833         return KERN_SUCCESS;
 834 }
 835
 836
     /*
      * default_pager_backing_store_delete_internal:
      *
      * Take every paging segment owned by the backing store named by
      * "backing_store" off-line, hand each one to ps_delete(), and then
      * release the segments and (unless this is the emergency segment's
      * store) the backing store itself.
      *
      * Returns KERN_INVALID_ARGUMENT when backing_store_lookup() finds no
      * store, the failing status when a segment could not be deleted (all
      * of the store's going-away segments are put back on-line first), and
      * KERN_SUCCESS otherwise.
      *
      * NOTE(review): every exit path after the lookup does BS_UNLOCK(bs),
      * so backing_store_lookup() presumably returns the store locked --
      * confirm against its definition.
      */
 837 kern_return_t
 838 default_pager_backing_store_delete_internal(
 839         MACH_PORT_FACE backing_store)
 840 {
 841         backing_store_t         bs;
 842         int                     i;
 843         paging_segment_t        ps;
 844         int                     error;
 845         int                     interim_pages_removed = 0;
 846         boolean_t               dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
 847
 848         if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
 849                 return KERN_INVALID_ARGUMENT;
 850
             /*
              * Pass 1: retire the store's segments one at a time.  PSL_LOCK
              * is released before ps_delete() is called; after a successful
              * delete the scan is restarted from the top.
              */
 851 restart:
 852         PSL_LOCK();
 853         error = KERN_SUCCESS;
 854         for (i = 0; i <= paging_segment_max; i++) {
 855                 ps = paging_segments[i];
 856                 if (ps != PAGING_SEGMENT_NULL &&
 857                     ps->ps_bs == bs &&
 858                     ! IS_PS_GOING_AWAY(ps)) {
                             /* the test above ran without the segment lock; re-test under it */
 859                         PS_LOCK(ps);
 860
 861                         if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
 862                         /*
 863                          * Someone is already busy reclaiming this paging segment.
 864                          * If it's the emergency segment we are looking at then check
 865                          * that someone has not already recovered it and set the right
 866                          * state i.e. online but not activated.
 867                          */
 868                                 PS_UNLOCK(ps);
 869                                 continue;
 870                         }
 871
 872                         /* disable access to this segment */
 873                         ps->ps_state &= ~PS_CAN_USE;
 874                         ps->ps_state |= PS_GOING_AWAY;
 875                         PS_UNLOCK(ps);
 876                         /*
 877                          * The "ps" segment is "off-line" now,
 878                          * we can try and delete it...
 879                          */
                             /*
                              * Refuse when fewer than cluster_transfer_minimum +
                              * ps_pgcount pages are free.
                              */
 880                         if(dp_pages_free < (cluster_transfer_minimum
 881                                                         + ps->ps_pgcount)) {
 882                                 error = KERN_FAILURE;
 883                                 PSL_UNLOCK();
 884                         }
 885                         else {
 886                                 /* remove all pages associated with the  */
 887                                 /* segment from the list of free pages   */
 888                                 /* when transfer is through, all target  */
 889                                 /* segment pages will appear to be free  */
 890
 891                                 dp_pages_free -=  ps->ps_pgcount;
                                     /* remembered so the failure path can add it back */
 892                                 interim_pages_removed += ps->ps_pgcount;
 893                                 PSL_UNLOCK();
 894                                 error = ps_delete(ps);
 895                         }
 896                         if (error != KERN_SUCCESS) {
 897                                 /*
 898                                  * We couldn't delete the segment,
 899                                  * probably because there's not enough
 900                                  * virtual memory left.
 901                                  * Re-enable all the segments.
 902                                  */
                                     /* both branches above dropped PSL_LOCK; retake it for the recovery pass */
 903                                 PSL_LOCK();
 904                                 break;
 905                         }
 906                         goto restart;
 907                 }
 908         }
 909
             /*
              * Failure: every segment of this store still marked going-away is
              * put back on-line, the free-page count taken out above is
              * restored, and the error is returned.
              */
 910         if (error != KERN_SUCCESS) {
 911                 for (i = 0; i <= paging_segment_max; i++) {
 912                         ps = paging_segments[i];
 913                         if (ps != PAGING_SEGMENT_NULL &&
 914                             ps->ps_bs == bs &&
 915                             IS_PS_GOING_AWAY(ps)) {
 916                                 PS_LOCK(ps);
 917
 918                                 if( !IS_PS_GOING_AWAY(ps)) {
 919                                         PS_UNLOCK(ps);
 920                                         continue;
 921                                 }
 922                                 /* Handle the special clusters that came in while we let go the lock*/
 923                                 if( ps->ps_special_clusters) {
 924                                         dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
 925                                         ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
 926                                         ps->ps_clcount += ps->ps_special_clusters;
 927                                         if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
 928                                                 ps_select_array[ps->ps_bs->bs_priority] = 0;
 929                                         }
 930                                         ps->ps_special_clusters = 0;
 931                                 }
 932                                 /* re-enable access to this segment */
 933                                 ps->ps_state &= ~PS_GOING_AWAY;
 934                                 ps->ps_state |= PS_CAN_USE;
 935                                 PS_UNLOCK(ps);
 936                         }
 937                 }
 938                 dp_pages_free += interim_pages_removed;
 939                 PSL_UNLOCK();
 940                 BS_UNLOCK(bs);
 941                 return error;
 942         }
 943
             /*
              * Success: dispose of the retired segments.  The emergency
              * segment is kept, reset to its full size (ps_pgnum pages) and
              * counted in dp_pages_reserve; any other segment is removed from
              * paging_segments[] and freed.
              */
 944         for (i = 0; i <= paging_segment_max; i++) {
 945                 ps = paging_segments[i];
 946                 if (ps != PAGING_SEGMENT_NULL &&
 947                     ps->ps_bs == bs) {
 948                         if(IS_PS_GOING_AWAY(ps)) {
 949                                 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
 950                                         PS_LOCK(ps);
 951                                         ps->ps_state &= ~PS_GOING_AWAY;
 952                                         ps->ps_special_clusters = 0;
 953                                         ps->ps_pgcount = ps->ps_pgnum;
 954                                         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
 955                                         dp_pages_reserve += ps->ps_pgcount;
 956                                         PS_UNLOCK(ps);
 957                                 } else {
 958                                         paging_segments[i] = PAGING_SEGMENT_NULL;
 959                                         paging_segment_count--;
                                             /*
                                              * NOTE(review): the segment is freed with its
                                              * lock held and without a PS_LOCK_DESTROY --
                                              * confirm this is intended.
                                              */
 960                                         PS_LOCK(ps);
 961                                         kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
 962                                         kfree(ps, sizeof *ps);
 963                                 }
 964                         }
 965                 }
 966         }
 967
 968         /* Scan the entire ps array separately to make certain we find the */
 969         /* proper paging_segment_max                                       */
             /* NOTE(review): when no segment is left, paging_segment_max keeps its old value. */
 970         for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
 971                 if(paging_segments[i] != PAGING_SEGMENT_NULL)
 972                    paging_segment_max = i;
 973         }
 974
 975         PSL_UNLOCK();
 976
             /*
              * For the emergency segment's backing store only the segment
              * state was reset above; the store itself is not torn down.
              */
 977         if( dealing_with_emergency_segment ) {
 978                 BS_UNLOCK(bs);
 979                 return KERN_SUCCESS;
 980         }
 981
 982         /*
 983          * All the segments have been deleted.
 984          * We can remove the backing store.
 985          */
 986
 987         /*
 988          * Disable lookups of this backing store.
 989          */
 990         if((void *)bs->bs_port->alias != NULL)
 991                 kfree((void *) bs->bs_port->alias,
 992                       sizeof (struct vstruct_alias));
 993         ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
 994         bs->bs_port = MACH_PORT_NULL;
 995         BS_UNLOCK(bs);
 996
 997         /*
 998          * Remove backing store from backing_store list.
 999          */
1000         BSL_LOCK();
1001         queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
1002                      bs_links);
1003         BSL_UNLOCK();
1004
1005         /*
1006          * Free the backing store structure.
1007          */
1008         BS_LOCK_DESTROY(bs);
1009         kfree(bs, sizeof *bs);
1010
1011         return KERN_SUCCESS;
1012 }
1013
     /*
      * default_pager_backing_store_delete:
      *
      * Public entry point for deleting a backing store.  Unless the caller
      * named the emergency segment's store itself, that store is first run
      * through default_pager_backing_store_delete_internal(); the status
      * of that first call is not examined.  The status returned is the one
      * produced for "backing_store".
      */
1014 kern_return_t
1015 default_pager_backing_store_delete(
1016         MACH_PORT_FACE backing_store)
1017 {
             kern_return_t   result;

             if (emergency_segment_backing_store != backing_store)
                     (void) default_pager_backing_store_delete_internal(
                                     emergency_segment_backing_store);

             result = default_pager_backing_store_delete_internal(backing_store);
             return result;
1022 }
1023
1024 int     ps_enter(paging_segment_t);     /* forward */
1025
     /*
      * ps_enter:
      *
      * Record "ps" in the first unused entry of paging_segments[], all
      * under PSL_LOCK.  paging_segment_max and paging_segment_count are
      * brought up to date, and if the ps_select_array[] entry for the
      * owning backing store's priority holds BS_NOPRI or BS_FULLPRI it is
      * reset to 0.
      *
      * Returns 0 when the segment was entered and KERN_RESOURCE_SHORTAGE
      * when all MAX_NUM_PAGING_SEGMENTS entries are occupied.
      */
1026 int
1027 ps_enter(
1028         paging_segment_t ps)
1029 {
             int     slot = 0;

             PSL_LOCK();

             /* walk forward to the lowest-numbered empty entry */
             while (slot < MAX_NUM_PAGING_SEGMENTS &&
                    paging_segments[slot] != PAGING_SEGMENT_NULL)
                     slot++;

             if (slot >= MAX_NUM_PAGING_SEGMENTS) {
                     /* the table is full */
                     PSL_UNLOCK();
                     return KERN_RESOURCE_SHORTAGE;
             }

             paging_segments[slot] = ps;
             paging_segment_count++;
             if (paging_segment_max < slot)
                     paging_segment_max = slot;

             /* a priority level marked empty or full has a candidate again */
             if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI ||
                 ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI)
                     ps_select_array[ps->ps_bs->bs_priority] = 0;

             PSL_UNLOCK();
             return 0;
1055 }
1056
1057 #ifdef DEVICE_PAGING
1058 kern_return_t
1059 default_pager_add_segment(
1060         MACH_PORT_FACE  backing_store,
1061         MACH_PORT_FACE  device,
1062         recnum_t        offset,
1063         recnum_t        count,
1064         int             record_size)
1065 {
             /*
              * Add the record range [offset, offset + count) of "device" to
              * the backing store named by "backing_store" as a new
              * PS_PARTITION paging segment.
              *
              * Returns KERN_INVALID_ARGUMENT if record_size is not positive,
              * the store is not found, or the range overlaps a segment already
              * registered on the same device; KERN_RESOURCE_SHORTAGE if memory
              * or a paging_segments[] entry cannot be obtained; KERN_SUCCESS
              * otherwise.
              *
              * NOTE(review): every exit path after the lookup does
              * BS_UNLOCK(bs), so backing_store_lookup() presumably returns the
              * store locked -- confirm.
              */
1066         backing_store_t         bs;
1067         paging_segment_t        ps;
1068         int                     i;
1069         int                     error;
1070
             /* record_size is a divisor below; reject it before taking any lock */
             if (record_size <= 0)
                     return KERN_INVALID_ARGUMENT;

1071         if ((bs = backing_store_lookup(backing_store))
1072             == BACKING_STORE_NULL)
1073                 return KERN_INVALID_ARGUMENT;
1074
1075         PSL_LOCK();
1076         for (i = 0; i <= paging_segment_max; i++) {
1077                 ps = paging_segments[i];
1078                 if (ps == PAGING_SEGMENT_NULL)
1079                         continue;
1080
1081                 /*
1082                  * Check for overlap on same device.
1083                  */
1084                 if (!(ps->ps_device != device
1085                       || offset >= ps->ps_offset + ps->ps_recnum
1086                       || offset + count <= ps->ps_offset)) {
1087                         PSL_UNLOCK();
1088                         BS_UNLOCK(bs);
1089                         return KERN_INVALID_ARGUMENT;
1090                 }
1091         }
1092         PSL_UNLOCK();
1093
1094         /*
1095          * Set up the paging segment
1096          */
1097         ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1098         if (ps == PAGING_SEGMENT_NULL) {
1099                 BS_UNLOCK(bs);
1100                 return KERN_RESOURCE_SHORTAGE;
1101         }
1102
1103         ps->ps_segtype = PS_PARTITION;
1104         ps->ps_device = device;
1105         ps->ps_offset = offset;
1106         ps->ps_record_shift = local_log2(vm_page_size / record_size);
1107         ps->ps_recnum = count;
1108         ps->ps_pgnum = count >> ps->ps_record_shift;
1109
1110         ps->ps_pgcount = ps->ps_pgnum;
1111         ps->ps_clshift = local_log2(bs->bs_clsize);
1112         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1113         ps->ps_hint = 0;
             /*
              * The structure is not zero-filled here, and the backing store
              * delete path reads ps_special_clusters, so give it a defined
              * starting value.
              */
             ps->ps_special_clusters = 0;
1114
1115         PS_LOCK_INIT(ps);
1116         ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1117         if (!ps->ps_bmap) {
1118                 PS_LOCK_DESTROY(ps);
1119                 kfree(ps, sizeof *ps);
1120                 BS_UNLOCK(bs);
1121                 return KERN_RESOURCE_SHORTAGE;
1122         }
             /* start with every cluster bit cleared */
1123         for (i = 0; i < ps->ps_ncls; i++) {
1124                 clrbit(ps->ps_bmap, i);
1125         }
1126
             /*
              * A segment added while no segment is registered becomes the
              * emergency segment; it is marked usable only when
              * use_emergency_swap_file_first is set.
              */
1127         if(paging_segment_count == 0) {
1128                 ps->ps_state = PS_EMERGENCY_SEGMENT;
1129                 if(use_emergency_swap_file_first) {
1130                         ps->ps_state |= PS_CAN_USE;
1131                 }
1132         } else {
1133                 ps->ps_state = PS_CAN_USE;
1134         }
1135
1136         ps->ps_bs = bs;
1137
1138         if ((error = ps_enter(ps)) != 0) {
1139                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1140
1141                 PS_LOCK_DESTROY(ps);
1142                 kfree(ps, sizeof *ps);
1143                 BS_UNLOCK(bs);
1144                 return KERN_RESOURCE_SHORTAGE;
1145         }
1146
1147         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1148         bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1149         BS_UNLOCK(bs);
1150
             /* usable segments add to the free pool, others to the reserve */
1151         PSL_LOCK();
1152         if(IS_PS_OK_TO_USE(ps)) {
1153                 dp_pages_free += ps->ps_pgcount;
1154         } else {
1155                 dp_pages_reserve += ps->ps_pgcount;
1156         }
1157         PSL_UNLOCK();
1158
1159         bs_more_space(ps->ps_clcount);
1160
1161         DP_DEBUG(DEBUG_BS_INTERNAL,
1162                  ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1163                   device, offset, count, record_size,
1164                   ps->ps_record_shift, ps->ps_pgnum));
1165
1166         return KERN_SUCCESS;
1167 }
1168
     /*
      * bs_add_device:
      *
      * Open the device "dev_name" through "master", query its size, create
      * a backing store at DEFAULT_PAGER_BACKING_STORE_MAXPRI and add the
      * whole device to it as one paging segment.
      *
      * Returns TRUE when the segment was added.  Returns FALSE when the
      * device cannot be opened, or -- after releasing the send right on the
      * device -- when its size cannot be read, its record size is zero, or
      * the backing store or segment cannot be set up.
      */
1169 boolean_t
1170 bs_add_device(
1171         char            *dev_name,
1172         MACH_PORT_FACE  master)
1173 {
1174         security_token_t        null_security_token = {
1175                 { 0, 0 }
1176         };
1177         MACH_PORT_FACE  device;
1178         int             info[DEV_GET_SIZE_COUNT];
1179         mach_msg_type_number_t info_count;
1180         MACH_PORT_FACE  bs = MACH_PORT_NULL;
1181         unsigned int    rec_size;
1182         recnum_t        count;
1183         int             clsize;
1185
1186         if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1187                         null_security_token, dev_name, &device))
1188                 return FALSE;
1189
1190         info_count = DEV_GET_SIZE_COUNT;
1191         if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1192                 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
                     if (rec_size == 0) {
                             /* the size cannot be expressed in records; do not divide by zero */
                             ipc_port_release_send(device);
                             return FALSE;
                     }
1193                 count = info[DEV_GET_SIZE_DEVICE_SIZE] /  rec_size;
1194                 clsize = bs_get_global_clsize(0);
1195                 if (!default_pager_backing_store_create(
1196                                         default_pager_object,
1197                                         DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1198                                         (clsize * vm_page_size),
1199                                         &bs)) {
1200                         if (!default_pager_add_segment(bs, device,
1201                                                        0, count, rec_size)) {
1202                                 return TRUE;
1203                         }
                             /* the segment was refused: give up the store that was just created */
1204                         ipc_port_release_receive(bs);
1205                 }
1206         }
1207
1208         ipc_port_release_send(device);
1209         return FALSE;
1210 }
1211 #endif /* DEVICE_PAGING */
1212
1213 #if     VS_ASYNC_REUSE
1214
     /*
      * vs_alloc_async (VS_ASYNC_REUSE variant):
      *
      * Hand out a vs_async structure.  The head of vs_async_free_list is
      * reused when the list is not empty.  Otherwise a new structure is
      * allocated together with a kernel reply port and a vstruct_alias;
      * the port's alias field points at the vstruct_alias, whose "vs"
      * member points back at the new vs_async.
      *
      * Returns NULL when the structure, the port or the alias cannot be
      * allocated; port and alias failures are counted in
      * vs_alloc_async_failed and anything already obtained is released.
      */
1215 struct vs_async *
1216 vs_alloc_async(void)
1217 {
1218         struct vs_async *vsa;
1219         MACH_PORT_FACE  reply_port;
1221
1222         VS_ASYNC_LOCK();
1223         if (vs_async_free_list == NULL) {
                     /* nothing to reuse; allocate without holding the list lock */
1224                 VS_ASYNC_UNLOCK();
1225                 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1226                 if (vsa != NULL) {
1227                         /*
1228                          * Try allocating a reply port named after the
1229                          * address of the vs_async structure.
1230                          */
1231                         struct vstruct_alias    *alias_struct;
1232
1233                         reply_port = ipc_port_alloc_kernel();
                             if (reply_port == MACH_PORT_NULL) {
                                     /*
                                      * No port was obtained: it can neither be
                                      * aliased nor deallocated, so only the
                                      * structure is given back.
                                      */
                                     vs_alloc_async_failed++;
                                     kfree(vsa, sizeof (struct vs_async));
                                     return NULL;
                             }
1234                         alias_struct = (struct vstruct_alias *)
1235                                 kalloc(sizeof (struct vstruct_alias));
1236                         if(alias_struct != NULL) {
1237                                 alias_struct->vs = (struct vstruct *)vsa;
1238                                 alias_struct->name = &default_pager_ops;
1239                                 reply_port->alias = (uintptr_t) alias_struct;
1240                                 vsa->reply_port = reply_port;
1241                                 vs_alloc_async_count++;
1242                         }
1243                         else {
1244                                 vs_alloc_async_failed++;
1245                                 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1246                                                                 (reply_port));
1247                                 kfree(vsa, sizeof (struct vs_async));
1248                                 vsa = NULL;
1249                         }
1250                 }
1251         } else {
                     /* pop the first structure off the free list */
1252                 vsa = vs_async_free_list;
1253                 vs_async_free_list = vs_async_free_list->vsa_next;
1254                 VS_ASYNC_UNLOCK();
1255         }
1256
1257         return vsa;
1258 }
1259
     /*
      * vs_free_async (VS_ASYNC_REUSE variant):
      *
      * Put "vsa" at the head of vs_async_free_list, from which
      * vs_alloc_async() takes structures for reuse.  Nothing is
      * deallocated; the list is updated under VS_ASYNC_LOCK.
      */
1260 void
     vs_free_async(struct vs_async *vsa)
1263 {
             VS_ASYNC_LOCK();
             /* push onto the front of the singly linked free list */
             vsa->vsa_next = vs_async_free_list;
             vs_async_free_list = vsa;
             VS_ASYNC_UNLOCK();
1268 }
1269
1270 #else   /* VS_ASYNC_REUSE */
1271
     /*
      * vs_alloc_async (variant used when VS_ASYNC_REUSE is off):
      *
      * Allocate a fresh vs_async structure together with a kernel reply
      * port and a vstruct_alias.  As in the VS_ASYNC_REUSE variant, the
      * port's alias field points at the vstruct_alias, whose "vs" member
      * points back at the vs_async.
      *
      * Returns NULL when the structure, the port or the alias cannot be
      * allocated; port and alias failures are counted in
      * vs_alloc_async_failed and anything already obtained is released.
      */
1272 struct vs_async *
1273 vs_alloc_async(void)
1274 {
             struct vs_async         *vsa;
             struct vstruct_alias    *alias_struct;
             MACH_PORT_FACE          reply_port;

             vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
             if (vsa == NULL)
                     return NULL;

             /*
              * Try allocating a reply port named after the
              * address of the vs_async structure.
              */
             reply_port = ipc_port_alloc_kernel();
             if (reply_port == MACH_PORT_NULL) {
                     /* no port to alias or deallocate; give the structure back */
                     vs_alloc_async_failed++;
                     kfree(vsa, sizeof (struct vs_async));
                     return NULL;
             }

             alias_struct = (struct vstruct_alias *)
                     kalloc(sizeof (struct vstruct_alias));
             if (alias_struct == NULL) {
                     vs_alloc_async_failed++;
                     ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
                     kfree(vsa, sizeof (struct vs_async));
                     return NULL;
             }

             /* port -> alias -> vs_async, so vs_free_async() can free the alias */
             alias_struct->vs = (struct vstruct *)vsa;
             alias_struct->name = &default_pager_ops;
             reply_port->alias = (uintptr_t) alias_struct;
             vsa->reply_port = reply_port;
             vs_alloc_async_count++;

             return vsa;
1305 }
1306
     /*
      * vs_free_async (variant used when VS_ASYNC_REUSE is off):
      *
      * Release everything that belongs to "vsa": the vstruct_alias hanging
      * off its reply port, the vs_async structure itself, and finally the
      * reply port.
      */
1307 void
1308 vs_free_async(
1309         struct vs_async *vsa)
1310 {
1311         MACH_PORT_FACE  reply_port;
1313
             /* keep the port handle; vsa is freed before the port is deallocated */
1314         reply_port = vsa->reply_port;
             kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1316         kfree(vsa, sizeof (struct vs_async));
1317         ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1318 #if 0
1319         VS_ASYNC_LOCK();
1320         vs_alloc_async_count--;
1321         VS_ASYNC_UNLOCK();
1322 #endif
1323 }
1324
1325 #endif  /* VS_ASYNC_REUSE */
1326
     /* Zone handed to zalloc() by ps_vstruct_create() to obtain vstruct structures. */
1327 zone_t  vstruct_zone;
1328
     /*
      * ps_vstruct_create:
      *
      * Allocate a vstruct from vstruct_zone and initialise it for a memory
      * object of "size" bytes.  The size is converted to a cluster count
      * (vs_size) using the global cluster size, a direct or indirect
      * cluster map is allocated and cleared to match, and
      * bs_commit(vs_size) is called.
      *
      * Returns the new vstruct, or VSTRUCT_NULL when the structure or its
      * map cannot be allocated.
      *
      * NOTE(review): for size == 0 the "- 1" below is applied to a page
      * count of zero -- confirm callers never pass 0.
      */
1329 vstruct_t
1330 ps_vstruct_create(
1331         dp_size_t size)
1332 {
1333         vstruct_t       vs;
1334         unsigned int    i;
1335
1336         vs = (vstruct_t) zalloc(vstruct_zone);
1337         if (vs == VSTRUCT_NULL) {
1338                 return VSTRUCT_NULL;
1339         }
1340
1341         VS_LOCK_INIT(vs);
1342
1343         /*
1344          * The following fields will be provided later.
1345          */
1346         vs->vs_pager_ops = NULL;
1347         vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1348         vs->vs_references = 1;
1349         vs->vs_seqno = 0;
1350
1351         vs->vs_waiting_seqno = FALSE;
1352         vs->vs_waiting_read = FALSE;
1353         vs->vs_waiting_write = FALSE;
1354         vs->vs_waiting_async = FALSE;
1355
1356         vs->vs_readers = 0;
1357         vs->vs_writers = 0;
1358
1359         vs->vs_errors = 0;
1360
             /* vs_size: number of clusters needed for "size" bytes, rounded up */
1361         vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1362         vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1363         vs->vs_async_pending = 0;
1364
1365         /*
1366          * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1367          * depending on the size of the memory object.
1368          */
1369         if (INDIRECT_CLMAP(vs->vs_size)) {
1370                 vs->vs_imap = (struct vs_map **)
1371                         kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1372                 vs->vs_indirect = TRUE;
1373         } else {
1374                 vs->vs_dmap = (struct vs_map *)
1375                         kalloc(CLMAP_SIZE(vs->vs_size));
1376                 vs->vs_indirect = FALSE;
1377         }
1378         vs->vs_xfer_pending = FALSE;
1379         DP_DEBUG(DEBUG_VS_INTERNAL,
1380                  ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1381
1382         /*
1383          * Check to see that we got the space.
1384          */
             /*
              * NOTE(review): vs_dmap is tested for both map kinds, so vs_imap
              * and vs_dmap presumably share storage -- confirm.
              */
1385         if (!vs->vs_dmap) {
                     /* vs came from zalloc(vstruct_zone), so it goes back to that zone */
                     zfree(vstruct_zone, vs);
1387                 return VSTRUCT_NULL;
1388         }
1389
1390         /*
1391          * Zero the indirect pointers, or clear the direct pointers.
1392          */
1393         if (vs->vs_indirect)
1394                 memset(vs->vs_imap, 0,
1395                        INDIRECT_CLMAP_SIZE(vs->vs_size));
1396         else
1397                 for (i = 0; i < vs->vs_size; i++)
1398                         VSM_CLR(vs->vs_dmap[i]);
1399
1400         VS_MAP_LOCK_INIT(vs);
1401
1402         bs_commit(vs->vs_size);
1403
1404         return vs;
1405 }
1406
1407 paging_segment_t ps_select_segment(unsigned int, int *);        /* forward */
1408
     /*
      * ps_select_segment:
      *
      * Pick a paging segment that is usable and still has a free cluster,
      * and reserve one cluster in it: ps_clcount is decremented and one
      * cluster's worth of pages is taken off both dp_pages_free and the
      * segment's ps_pgcount.
      *
      *      shift   - in the multi-segment search only segments with
      *                ps_clshift >= shift are accepted; the single-segment
      *                path merely ASSERTs this
      *      psindex - receives the paging_segments[] index of the segment
      *                returned (the single-segment path stores it even when
      *                NULL is returned)
      *
      * Returns the chosen segment, or PAGING_SEGMENT_NULL when nothing
      * qualifies.  In the latter case dp_pages_free is forced to 0; a
      * non-zero value found at that point is recorded in
      * dp_pages_free_drift_count / dp_pages_free_drifted_max.
      *
      * When the reservation takes dp_pages_free below
      * minimum_pages_remaining and min_pages_trigger_port is set, the port
      * is taken from the global, bs_low and backing_store_abort_compaction
      * are set, and a HI_WAT_ALERT is sent once the locks are released.
      */
1409 paging_segment_t
1410 ps_select_segment(
1411         unsigned int    shift,
1412         int             *psindex)
1413 {
1414         paging_segment_t        ps;
1415         int                     i;
1416         int                     j;
1417
1418         /*
1419          * Optimize case where there's only one segment.
1420          * paging_segment_max will index the one and only segment.
1421          */
1422
1423         PSL_LOCK();
1424         if (paging_segment_count == 1) {
1425                 paging_segment_t lps = PAGING_SEGMENT_NULL;     /* used to avoid extra PS_UNLOCK */
1426                 ipc_port_t trigger = IP_NULL;
1427
1428                 ps = paging_segments[paging_segment_max];
1429                 *psindex = paging_segment_max;
1430                 PS_LOCK(ps);
                     /* a lone segment must be the emergency segment */
1431                 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1432                         panic("Emergency paging segment missing\n");
1433                 }
1434                 ASSERT(ps->ps_clshift >= shift);
1435                 if(IS_PS_OK_TO_USE(ps)) {
1436                         if (ps->ps_clcount) {
1437                                 ps->ps_clcount--;
1438                                 dp_pages_free -=  1 << ps->ps_clshift;
1439                                 ps->ps_pgcount -=  1 << ps->ps_clshift;
1440                                 if(min_pages_trigger_port &&
1441                                   (dp_pages_free < minimum_pages_remaining)) {
1442                                         trigger = min_pages_trigger_port;
1443                                         min_pages_trigger_port = NULL;
1444                                         bs_low = TRUE;
1445                                         backing_store_abort_compaction = TRUE;
1446                                 }
1447                                 lps = ps;
1448                         }
1449                 }
1450                 PS_UNLOCK(ps);
1451
                     /* no cluster could be reserved: record any drift, then zero the count */
1452                 if( lps == PAGING_SEGMENT_NULL ) {
1453                         if(dp_pages_free) {
1454                                 dp_pages_free_drift_count++;
1455                                 if(dp_pages_free > dp_pages_free_drifted_max) {
1456                                         dp_pages_free_drifted_max = dp_pages_free;
1457                                 }
1458                                 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1459                         }
1460                         dp_pages_free = 0;
1461                 }
1462
1463                 PSL_UNLOCK();
1464
                     /* the alert is sent only after both locks have been released */
1465                 if (trigger != IP_NULL) {
1466                         dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1467
1468                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1469                         ipc_port_release_send(trigger);
1470                 }
1471                 return lps;
1472         }
1473
1474         if (paging_segment_count == 0) {
1475                 if(dp_pages_free) {
1476                         dp_pages_free_drift_count++;
1477                         if(dp_pages_free > dp_pages_free_drifted_max) {
1478                                 dp_pages_free_drifted_max = dp_pages_free;
1479                         }
1480                         dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1481                 }
1482                 dp_pages_free = 0;
1483                 PSL_UNLOCK();
1484                 return PAGING_SEGMENT_NULL;
1485         }
1486
             /*
              * Several segments: walk the priority levels from BS_MAXPRI down
              * to BS_MINPRI.  ps_select_array[i] holds the index at which the
              * search of level i starts, or BS_NOPRI / BS_FULLPRI, in which
              * case the level is skipped.
              */
1487         for (i = BS_MAXPRI;
1488              i >= BS_MINPRI; i--) {
1489                 int start_index;
1490
1491                 if ((ps_select_array[i] == BS_NOPRI) ||
1492                                 (ps_select_array[i] == BS_FULLPRI))
1493                         continue;
1494                 start_index = ps_select_array[i];
1495
                     /*
                      * Choose where the scan begins (j) and where it gives up
                      * (start_index).  If the remembered entry is empty, or
                      * physical_transfer_cluster_count + 1 has reached
                      * ALLOC_STRIDE >> (ps_clshift + vm_page_shift), the count
                      * is reset and the scan begins at the following entry.
                      * Otherwise the count is bumped, the scan begins at the
                      * remembered entry itself, and the stopping point is moved
                      * to the entry before it.
                      */
1496                 if(!(paging_segments[start_index])) {
1497                         j = start_index+1;
1498                         physical_transfer_cluster_count = 0;
1499                 }
1500                 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1501                                 (((paging_segments[start_index])->ps_clshift)
1502                                 + vm_page_shift))) {
1503                         physical_transfer_cluster_count = 0;
1504                         j = start_index + 1;
1505                 } else {
1506                         physical_transfer_cluster_count+=1;
1507                         j = start_index;
1508                         if(start_index == 0)
1509                                 start_index = paging_segment_max;
1510                         else
1511                                 start_index = start_index - 1;
1512                 }
1513
1514                 while (1) {
                             /* wrap around past the last used table entry */
1515                         if (j > paging_segment_max)
1516                                 j = 0;
1517                         if ((ps = paging_segments[j]) &&
1518                             (ps->ps_bs->bs_priority == i)) {
1519                                 /*
1520                                  * Force the ps cluster size to be
1521                                  * >= that of the vstruct.
1522                                  */
1523                                 PS_LOCK(ps);
1524                                 if (IS_PS_OK_TO_USE(ps)) {
1525                                         if ((ps->ps_clcount) &&
1526                                                    (ps->ps_clshift >= shift)) {
1527                                                 ipc_port_t trigger = IP_NULL;
1528
1529                                                 ps->ps_clcount--;
1530                                                 dp_pages_free -=  1 << ps->ps_clshift;
1531                                                 ps->ps_pgcount -=  1 << ps->ps_clshift;
1532                                                 if(min_pages_trigger_port &&
1533                                                         (dp_pages_free <
1534                                                         minimum_pages_remaining)) {
1535                                                         trigger = min_pages_trigger_port;
1536                                                         min_pages_trigger_port = NULL;
1537                                                         bs_low = TRUE;
1538                                                         backing_store_abort_compaction = TRUE;
1539                                                 }
1540                                                 PS_UNLOCK(ps);
1541                                                 /*
1542                                                  * found one, quit looking.
1543                                                  */
                                                     /* remember this entry as the next starting point for level i */
1544                                                 ps_select_array[i] = j;
1545                                                 PSL_UNLOCK();
1546
1547                                                 if (trigger != IP_NULL) {
1548                                                         dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1549
1550                                                         default_pager_space_alert(
1551                                                                 trigger,
1552                                                                 HI_WAT_ALERT);
1553                                                         ipc_port_release_send(trigger);
1554                                                 }
1555                                                 *psindex = j;
1556                                                 return ps;
1557                                         }
1558                                 }
1559                                 PS_UNLOCK(ps);
1560                         }
1561                         if (j == start_index) {
1562                                 /*
1563                                  * none at this priority -- mark it full
1564                                  */
1565                                 ps_select_array[i] = BS_FULLPRI;
1566                                 break;
1567                         }
1568                         j++;
1569                 }
1570         }
1571
             /* no priority level produced a segment: record any drift and report failure */
1572         if(dp_pages_free) {
1573                 dp_pages_free_drift_count++;
1574                 if(dp_pages_free > dp_pages_free_drifted_max) {
1575                         dp_pages_free_drifted_max = dp_pages_free;
1576                 }
1577                 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1578         }
1579         dp_pages_free = 0;
1580         PSL_UNLOCK();
1581         return PAGING_SEGMENT_NULL;
1582 }
1583
1584 dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1585
     /*
      * ps_allocate_cluster:
      *
      * Reserve one cluster of backing store and return its index in the
      * chosen paging segment's allocation bitmap (ps_bmap).
      *
      *   vs       vstruct requesting space; only vs->vs_clshift is read here,
      *            as the cluster shift handed to ps_select_segment().
      *   psindex  passed through to ps_select_segment(), which stores the
      *            index of the segment it picked.  This function never writes
      *            *psindex itself, so it is left untouched when use_ps is
      *            supplied.
      *   use_ps   if not PAGING_SEGMENT_NULL, take the cluster from this
      *            segment instead of selecting one.
      *
      * Returns the cluster number, or (dp_offset_t) -1 when no segment could
      * supply a cluster, even after trying to switch on the emergency
      * paging segment.
      *
      * When the reservation drops dp_pages_free below minimum_pages_remaining,
      * or when no space is left at all, min_pages_trigger_port is consumed
      * (set to NULL) and a HI_WAT_ALERT is sent on it once the locks taken
      * here have been released.
      */
1586 dp_offset_t
1587 ps_allocate_cluster(
1588         vstruct_t               vs,
1589         int                     *psindex,
1590         paging_segment_t        use_ps)
1591 {
1592         unsigned int            byte_num;
1593         int                     bit_num = 0;
1594         paging_segment_t        ps;
1595         dp_offset_t             cluster;
1596         ipc_port_t              trigger = IP_NULL;
1597
1598         /*
1599          * Find best paging segment.
1600          * ps_select_segment will decrement cluster count on ps.
1601          * Must pass cluster shift to find the most appropriate segment.
1602          */
1603         /* NOTE:  The addition of paging segment delete capability threatened
1604          * to seriously complicate the treatment of paging segments in this
1605          * module and the ones that call it (notably ps_clmap), because of the
1606          * difficulty in assuring that the paging segment would continue to
1607          * exist between being unlocked and locked.   This was
1608          * avoided because all calls to this module are based in either
1609          * dp_memory_object calls which rely on the vs lock, or by
1610          * the transfer function which is part of the segment delete path.
1611          * The transfer function which is part of paging segment delete is
1612          * protected from multiple callers by the backing store lock.
1613          * The paging segment delete function treats mappings to a paging
1614          * segment on a vstruct by vstruct basis, locking the vstruct targeted
1615          * while data is transferred to the remaining segments.  This is in
1616          * line with the view that incomplete or in-transition mappings between
1617          * data, a vstruct, and backing store are protected by the vs lock.
1618          * This and the ordering of the paging segment "going_away" bit setting
1619          * protects us.
1620          */
             /*
              * Re-entered (via goto) after the emergency segment has been
              * switched on, or after free pages were seen to have appeared,
              * so that segment selection is attempted again.
              */
1621 retry:
1622         if (use_ps != PAGING_SEGMENT_NULL) {
                     /*
                      * Caller named the segment: do the same accounting that
                      * ps_select_segment() does for a segment it picks.
                      */
1623                 ps = use_ps;
1624                 PSL_LOCK();
1625                 PS_LOCK(ps);
1626
1627                 ASSERT(ps->ps_clcount != 0);
1628
                     /* One cluster is (1 << ps_clshift) pages. */
1629                 ps->ps_clcount--;
1630                 dp_pages_free -=  1 << ps->ps_clshift;
1631                 ps->ps_pgcount -=  1 << ps->ps_clshift;
                     /* Crossing the low-water mark consumes the trigger port. */
1632                 if(min_pages_trigger_port &&
1633                                 (dp_pages_free < minimum_pages_remaining)) {
1634                         trigger = min_pages_trigger_port;
1635                         min_pages_trigger_port = NULL;
1636                         bs_low = TRUE;
1637                         backing_store_abort_compaction = TRUE;
1638                 }
1639                 PSL_UNLOCK();
1640                 PS_UNLOCK(ps);
                     /* The alert is sent only after both locks are dropped. */
1641                 if (trigger != IP_NULL) {
1642                         dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1643
1644                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1645                         ipc_port_release_send(trigger);
1646                 }
1647
1648         } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1649                    PAGING_SEGMENT_NULL) {
                     /*
                      * No segment could be selected.  lastnotify is static, so
                      * it is shared by all callers.
                      * NOTE(review): it is read and written below with neither
                      * PSL_LOCK nor PS_LOCK held - confirm that is acceptable.
                      */
1650                 static clock_sec_t lastnotify = 0;
1651                 clock_sec_t now;
1652                 clock_nsec_t nanoseconds_dummy;
1653
1654                 /*
1655                  * Don't immediately jump to the emergency segment. Give the
1656                  * dynamic pager a chance to create its first normal swap file.
1657                  * Unless, of course the very first normal swap file can't be
1658                  * created due to some problem and we didn't expect that problem
1659                  * i.e. use_emergency_swap_file_first was never set to true initially.
1660                  * It then gets set in the swap file creation error handling.
1661                  */
1662                 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1663
1664                         ps = paging_segments[EMERGENCY_PSEG_INDEX];
                             /*
                              * Unlocked pre-check; IS_PS_GOING_AWAY is tested
                              * again below once the locks are held.
                              */
1665                         if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1666                                 PSL_LOCK();
1667                                 PS_LOCK(ps);
1668
1669                                 if(IS_PS_GOING_AWAY(ps)) {
1670                                         /* Someone de-activated the emergency paging segment*/
1671                                         PS_UNLOCK(ps);
1672                                         PSL_UNLOCK();
1673
1674                                 } else if(dp_pages_free) {
1675                                         /*
1676                                          * Someone has already activated the emergency paging segment
1677                                          * OR
1678                                          * Between us having rec'd a NULL segment from ps_select_segment
1679                                          * and reaching here a new normal segment could have been added.
1680                                          * E.g. we get NULL segment and another thread just added the
1681                                          * new swap file. Hence check to see if we have more dp_pages_free
1682                                          * before activating the emergency segment.
1683                                          */
1684                                         PS_UNLOCK(ps);
1685                                         PSL_UNLOCK();
1686                                         goto retry;
1687
1688                                 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1689                                         /*
1690                                          * PS_CAN_USE is only reset from the emergency segment when it's
1691                                          * been successfully recovered. So it's legal to have an emergency
1692                                          * segment that has PS_CAN_USE but no clusters because its recovery
1693                                          * failed.
1694                                          */
1695                                         backing_store_t bs = ps->ps_bs;
1696                                         ps->ps_state |= PS_CAN_USE;
                                             /* Make this priority level selectable again. */
1697                                         if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1698                                                 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1699                                                 ps_select_array[bs->bs_priority] = 0;
1700                                         }
                                             /* Move the segment's pages from the reserve to the free pool. */
1701                                         dp_pages_free += ps->ps_pgcount;
1702                                         dp_pages_reserve -= ps->ps_pgcount;
1703                                         PS_UNLOCK(ps);
1704                                         PSL_UNLOCK();
1705                                         dprintf(("Switching ON Emergency paging segment\n"));
1706                                         goto retry;
1707                                 }
1708
1709                                 PS_UNLOCK(ps);
1710                                 PSL_UNLOCK();
1711                         }
1712                 }
1713
1714                 /*
1715                  * Emit a notification of the low-paging resource condition
1716                  * but don't issue it more than once every five seconds.  This
1717                  * prevents us from overflowing logs with thousands of
1718                  * repetitions of the message.
1719                  */
1720                 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1721                 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1722                         /* With an activated emergency paging segment we still
1723                          * didn't get any clusters. This could mean that the
1724                          * emergency paging segment is exhausted.
1725                          */
1726                         dprintf(("System is out of paging space.\n"));
1727                         lastnotify = now;
1728                 }
1729
1730                 PSL_LOCK();
1731
                     /*
                      * Out of space: unlike the other paths, the trigger port is
                      * consumed here without comparing dp_pages_free against
                      * minimum_pages_remaining.
                      */
1732                 if(min_pages_trigger_port) {
1733                         trigger = min_pages_trigger_port;
1734                         min_pages_trigger_port = NULL;
1735                         bs_low = TRUE;
1736                         backing_store_abort_compaction = TRUE;
1737                 }
1738                 PSL_UNLOCK();
1739                 if (trigger != IP_NULL) {
1740                         dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1741
1742                         default_pager_space_alert(trigger, HI_WAT_ALERT);
1743                         ipc_port_release_send(trigger);
1744                 }
1745                 return (dp_offset_t) -1;
1746         }
1747
1748         /*
1749          * Look for an available cluster.  At the end of the loop,
1750          * byte_num is the byte offset and bit_num is the bit offset of the
1751          * first zero bit in the paging segment bitmap.
1752          */
             /* The scan starts at ps_hint and does not wrap back to byte 0. */
1753         PS_LOCK(ps);
1754         byte_num = ps->ps_hint;
1755         for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1756                 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1757                         for (bit_num = 0; bit_num < NBBY; bit_num++) {
1758                                 if (isclr((ps->ps_bmap + byte_num), bit_num))
1759                                         break;
1760                         }
1761                         ASSERT(bit_num != NBBY);
1762                         break;
1763                 }
1764         }
             /* Remember where the search ended so the next one starts there. */
1765         ps->ps_hint = byte_num;
1766         cluster = (byte_num*NBBY) + bit_num;
1767
1768         /* Space was reserved, so this must be true */
             /*
              * NOTE(review): if the scan finds no clear bit, cluster ends up
              * >= ps_ncls and only this ASSERT stands between that and the
              * setbit() below - confirm ASSERT is active in shipping builds.
              */
1769         ASSERT(cluster < ps->ps_ncls);
1770
1771         setbit(ps->ps_bmap, cluster);
1772         PS_UNLOCK(ps);
1773
1774         return cluster;
1775 }
1776
1777 void ps_deallocate_cluster(paging_segment_t, dp_offset_t);      /* forward */
1778
1779 void
1780 ps_deallocate_cluster(
1781         paging_segment_t        ps,
1782         dp_offset_t             cluster)
1783 {
1784
1785         if (cluster >= ps->ps_ncls)
1786                 panic("ps_deallocate_cluster: Invalid cluster number");
1787
1788         /*
1789          * Lock the paging segment, clear the cluster's bitmap and increment the
1790          * number of free cluster.
1791          */
1792         PSL_LOCK();
1793         PS_LOCK(ps);
1794         clrbit(ps->ps_bmap, cluster);
1795         if( IS_PS_OK_TO_USE(ps)) {
1796                 ++ps->ps_clcount;
1797                 ps->ps_pgcount +=  1 << ps->ps_clshift;
1798                 dp_pages_free +=  1 << ps->ps_clshift;
1799         } else {
1800                 ps->ps_special_clusters += 1;
1801         }
1802
1803         /*
1804          * Move the hint down to the freed cluster if it is
1805          * less than the current hint.
1806          */
1807         if ((cluster/NBBY) < ps->ps_hint) {
1808                 ps->ps_hint = (cluster/NBBY);
1809         }
1810
1811
1812         /*
1813          * If we're freeing space on a full priority, reset the array.
1814          */
1815         if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1816                 ps_select_array[ps->ps_bs->bs_priority] = 0;
1817         PS_UNLOCK(ps);
1818         PSL_UNLOCK();
1819
1820         return;
1821 }
1822
1823 void ps_dealloc_vsmap(struct vs_map *, dp_size_t);      /* forward */
1824
1825 void
1826 ps_dealloc_vsmap(
1827         struct vs_map   *vsmap,
1828         dp_size_t       size)
1829 {
1830         unsigned int i;
1831         struct ps_vnode_trim_data trim_data;
1832
1833         ps_vnode_trim_init(&trim_data);
1834
1835         for (i = 0; i < size; i++) {
1836                 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
1837                         ps_vnode_trim_more(&trim_data,
1838                                               &vsmap[i],
1839                                               VSM_PS(vsmap[i])->ps_clshift,
1840                                               vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
1841                         ps_deallocate_cluster(VSM_PS(vsmap[i]),
1842                                               VSM_CLOFF(vsmap[i]));
1843                 } else {
1844                         ps_vnode_trim_now(&trim_data);
1845                 }
1846         }
1847         ps_vnode_trim_now(&trim_data);
1848 }
1849
1850 void
1851 ps_vstruct_dealloc(
1852         vstruct_t vs)
1853 {
1854         unsigned int    i;
1855 //      spl_t   s;
1856
1857         VS_MAP_LOCK(vs);
1858
1859         /*
1860          * If this is an indirect structure, then we walk through the valid
1861          * (non-zero) indirect pointers and deallocate the clusters
1862          * associated with each used map entry (via ps_dealloc_vsmap).
1863          * When all of the clusters in an indirect block have been
1864          * freed, we deallocate the block.  When all of the indirect
1865          * blocks have been deallocated we deallocate the memory
1866          * holding the indirect pointers.
1867          */
1868         if (vs->vs_indirect) {
1869                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1870                         if (vs->vs_imap[i] != NULL) {
1871                                 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1872                                 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1873                         }
1874                 }
1875                 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1876         } else {
1877                 /*
1878                  * Direct map.  Free used clusters, then memory.
1879                  */
1880                 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1881                 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1882         }
1883         VS_MAP_UNLOCK(vs);
1884
1885         bs_commit(- vs->vs_size);
1886
1887         VS_MAP_LOCK_DESTROY(vs);
1888
1889         zfree(vstruct_zone, vs);
1890 }
1891
1892 void
1893 ps_vstruct_reclaim(
1894         vstruct_t vs,
1895         boolean_t return_to_vm,
1896         boolean_t reclaim_backing_store)
1897 {
1898         unsigned int    i, j;
1899         struct vs_map   *vsmap;
1900         boolean_t       vsmap_all_clear, vsimap_all_clear;
1901         struct vm_object_fault_info fault_info;
1902         int             clmap_off;
1903         unsigned int    vsmap_size;
1904         kern_return_t   kr;
1905
1906         VS_MAP_LOCK(vs);
1907
1908         fault_info.cluster_size = VM_SUPER_CLUSTER;
1909         fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
1910         fault_info.user_tag = 0;
1911         fault_info.lo_offset = 0;
1912         fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
1913         fault_info.io_sync = reclaim_backing_store;
1914         fault_info.batch_pmap_op = FALSE;
1915
1916         /*
1917          * If this is an indirect structure, then we walk through the valid
1918          * (non-zero) indirect pointers and deallocate the clusters
1919          * associated with each used map entry (via ps_dealloc_vsmap).
1920          * When all of the clusters in an indirect block have been
1921          * freed, we deallocate the block.  When all of the indirect
1922          * blocks have been deallocated we deallocate the memory
1923          * holding the indirect pointers.
1924          */
1925         if (vs->vs_indirect) {
1926                 vsimap_all_clear = TRUE;
1927                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1928                         vsmap = vs->vs_imap[i];
1929                         if (vsmap == NULL)
1930                                 continue;
1931                         /* loop on clusters in this indirect map */
1932                         clmap_off = (vm_page_size * CLMAP_ENTRIES *
1933                                      VSCLSIZE(vs) * i);
1934                         if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
1935                                 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
1936                         else
1937                                 vsmap_size = CLMAP_ENTRIES;
1938                         vsmap_all_clear = TRUE;
1939                         if (return_to_vm) {
1940                                 for (j = 0; j < vsmap_size;) {
1941                                         if (VSM_ISCLR(vsmap[j]) ||
1942                                             VSM_ISERR(vsmap[j])) {
1943                                                 j++;
1944                                                 clmap_off += vm_page_size * VSCLSIZE(vs);
1945                                                 continue;
1946                                         }
1947                                         VS_MAP_UNLOCK(vs);
1948                                         kr = pvs_cluster_read(
1949                                                 vs,
1950                                                 clmap_off,
1951                                                 (dp_size_t) -1, /* read whole cluster */
1952                                                 &fault_info);
1953                                         VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1954                                         if (kr != KERN_SUCCESS) {
1955                                                 vsmap_all_clear = FALSE;
1956                                                 vsimap_all_clear = FALSE;
1957                                         }
1958                                 }
1959                         }
1960                         if (vsmap_all_clear) {
1961                                 ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
1962                                 kfree(vsmap, CLMAP_THRESHOLD);
1963                                 vs->vs_imap[i] = NULL;
1964                         }
1965                 }
1966                 if (vsimap_all_clear) {
1967 //                      kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1968                 }
1969         } else {
1970                 /*
1971                  * Direct map.  Free used clusters, then memory.
1972                  */
1973                 vsmap = vs->vs_dmap;
1974                 if (vsmap == NULL) {
1975                         goto out;
1976                 }
1977                 vsmap_all_clear = TRUE;
1978                 /* loop on clusters in the direct map */
1979                 if (return_to_vm) {
1980                         for (j = 0; j < vs->vs_size;) {
1981                                 if (VSM_ISCLR(vsmap[j]) ||
1982                                     VSM_ISERR(vsmap[j])) {
1983                                         j++;
1984                                         continue;
1985                                 }
1986                                 clmap_off = vm_page_size * (j << vs->vs_clshift);
1987                                 VS_MAP_UNLOCK(vs);
1988                                 kr = pvs_cluster_read(
1989                                         vs,
1990                                         clmap_off,
1991                                         (dp_size_t) -1, /* read whole cluster */
1992                                         &fault_info);
1993                                 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1994                                 if (kr != KERN_SUCCESS) {
1995                                         vsmap_all_clear = FALSE;
1996                                 } else {
1997 //                                      VSM_CLR(vsmap[j]);
1998                                 }
1999                         }
2000                 }
2001                 if (vsmap_all_clear) {
2002                         ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
2003 //                      kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
2004                 }
2005         }
2006 out:
2007         VS_MAP_UNLOCK(vs);
2008 }
2009
2010 int ps_map_extend(vstruct_t, unsigned int);     /* forward */
2011
2012 int ps_map_extend(
2013         vstruct_t       vs,
2014         unsigned int    new_size)
2015 {
2016         struct vs_map   **new_imap;
2017         struct vs_map   *new_dmap = NULL;
2018         int             newdsize;
2019         int             i;
2020         void            *old_map = NULL;
2021         int             old_map_size = 0;
2022
2023         if (vs->vs_size >= new_size) {
2024                 /*
2025                  * Someone has already done the work.
2026                  */
2027                 return 0;
2028         }
2029
2030         /*
2031          * If the new size extends into the indirect range, then we have one
2032          * of two cases: we are going from indirect to indirect, or we are
2033          * going from direct to indirect.  If we are going from indirect to
2034          * indirect, then it is possible that the new size will fit in the old
2035          * indirect map.  If this is the case, then just reset the size of the
2036          * vstruct map and we are done.  If the new size will not
2037          * fit into the old indirect map, then we have to allocate a new
2038          * indirect map and copy the old map pointers into this new map.
2039          *
2040          * If we are going from direct to indirect, then we have to allocate a
2041          * new indirect map and copy the old direct pages into the first
2042          * indirect page of the new map.
2043          * NOTE: allocating memory here is dangerous, as we're in the
2044          * pageout path.
2045          */
2046         if (INDIRECT_CLMAP(new_size)) {
2047                 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
2048
2049                 /*
2050                  * Get a new indirect map and zero it.
2051                  */
2052                 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
2053                 if (vs->vs_indirect &&
2054                     (new_map_size == old_map_size)) {
2055                         bs_commit(new_size - vs->vs_size);
2056                         vs->vs_size = new_size;
2057                         return 0;
2058                 }
2059
2060                 new_imap = (struct vs_map **)kalloc(new_map_size);
2061                 if (new_imap == NULL) {
2062                         return -1;
2063                 }
2064                 memset(new_imap, 0, new_map_size);
2065
2066                 if (vs->vs_indirect) {
2067                         /* Copy old entries into new map */
2068                         memcpy(new_imap, vs->vs_imap, old_map_size);
2069                         /* Arrange to free the old map */
2070                         old_map = (void *) vs->vs_imap;
2071                         newdsize = 0;
2072                 } else {        /* Old map was a direct map */
2073                         /* Allocate an indirect page */
2074                         if ((new_imap[0] = (struct vs_map *)
2075                              kalloc(CLMAP_THRESHOLD)) == NULL) {
2076                                 kfree(new_imap, new_map_size);
2077                                 return -1;
2078                         }
2079                         new_dmap = new_imap[0];
2080                         newdsize = CLMAP_ENTRIES;
2081                 }
2082         } else {
2083                 new_imap = NULL;
2084                 newdsize = new_size;
2085                 /*
2086                  * If the new map is a direct map, then the old map must
2087                  * also have been a direct map.  All we have to do is
2088                  * to allocate a new direct map, copy the old entries
2089                  * into it and free the old map.
2090                  */
2091                 if ((new_dmap = (struct vs_map *)
2092                      kalloc(CLMAP_SIZE(new_size))) == NULL) {
2093                         return -1;
2094                 }
2095         }
2096         if (newdsize) {
2097
2098                 /* Free the old map */
2099                 old_map = (void *) vs->vs_dmap;
2100                 old_map_size = CLMAP_SIZE(vs->vs_size);
2101
2102                 /* Copy info from the old map into the new map */
2103                 memcpy(new_dmap, vs->vs_dmap, old_map_size);
2104
2105                 /* Initialize the rest of the new map */
2106                 for (i = vs->vs_size; i < newdsize; i++)
2107                         VSM_CLR(new_dmap[i]);
2108         }
2109         if (new_imap) {
2110                 vs->vs_imap = new_imap;
2111                 vs->vs_indirect = TRUE;
2112         } else
2113                 vs->vs_dmap = new_dmap;
2114         bs_commit(new_size - vs->vs_size);
2115         vs->vs_size = new_size;
2116         if (old_map)
2117                 kfree(old_map, old_map_size);
2118         return 0;
2119 }
2120
2121 dp_offset_t
2122 ps_clmap(
2123         vstruct_t       vs,
2124         dp_offset_t     offset,
2125         struct clmap    *clmap,
2126         int             flag,
2127         dp_size_t       size,
2128         int             error)
2129 {
2130         dp_offset_t     cluster;        /* The cluster of offset.       */
2131         dp_offset_t     newcl;          /* The new cluster allocated.   */
2132         dp_offset_t     newoff;
2133         unsigned int    i;
2134         struct vs_map   *vsmap;
2135
2136         VS_MAP_LOCK(vs);
2137
2138         ASSERT(vs->vs_dmap);
2139         cluster = atop_32(offset) >> vs->vs_clshift;
2140
2141         /*
2142          * Initialize cluster error value
2143          */
2144         clmap->cl_error = 0;
2145
2146         /*
2147          * If the object has grown, extend the page map.
2148          */
2149         if (cluster >= vs->vs_size) {
2150                 if (flag == CL_FIND) {
2151                         /* Do not allocate if just doing a lookup */
2152                         VS_MAP_UNLOCK(vs);
2153                         return (dp_offset_t) -1;
2154                 }
2155                 if (ps_map_extend(vs, cluster + 1)) {
2156                         VS_MAP_UNLOCK(vs);
2157                         return (dp_offset_t) -1;
2158                 }
2159         }
2160
2161         /*
2162          * Look for the desired cluster.  If the map is indirect, then we
2163          * have a two level lookup.  First find the indirect block, then
2164          * find the actual cluster.  If the indirect block has not yet
2165          * been allocated, then do so.  If the cluster has not yet been
2166          * allocated, then do so.
2167          *
2168          * If any of the allocations fail, then return an error.
2169          * Don't allocate if just doing a lookup.
2170          */
2171         if (vs->vs_indirect) {
2172                 long    ind_block = cluster/CLMAP_ENTRIES;
2173
2174                 /* Is the indirect block allocated? */
2175                 vsmap = vs->vs_imap[ind_block];
2176                 if (vsmap == NULL) {
2177                         if (flag == CL_FIND) {
2178                                 VS_MAP_UNLOCK(vs);
2179                                 return (dp_offset_t) -1;
2180                         }
2181
2182                         /* Allocate the indirect block */
2183                         vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
2184                         if (vsmap == NULL) {
2185                                 VS_MAP_UNLOCK(vs);
2186                                 return (dp_offset_t) -1;
2187                         }
2188                         /* Initialize the cluster offsets */
2189                         for (i = 0; i < CLMAP_ENTRIES; i++)
2190                                 VSM_CLR(vsmap[i]);
2191                         vs->vs_imap[ind_block] = vsmap;
2192                 }
2193         } else
2194                 vsmap = vs->vs_dmap;
2195
2196         ASSERT(vsmap);
2197         vsmap += cluster%CLMAP_ENTRIES;
2198
2199         /*
2200          * At this point, vsmap points to the struct vs_map desired.
2201          *
2202          * Look in the map for the cluster, if there was an error on a
2203          * previous write, flag it and return.  If it is not yet
2204          * allocated, then allocate it, if we're writing; if we're
2205          * doing a lookup and the cluster's not allocated, return error.
2206          */
2207         if (VSM_ISERR(*vsmap)) {
2208                 clmap->cl_error = VSM_GETERR(*vsmap);
2209                 VS_MAP_UNLOCK(vs);
2210                 return (dp_offset_t) -1;
2211         } else if (VSM_ISCLR(*vsmap)) {
2212                 int psindex;
2213
2214                 if (flag == CL_FIND) {
2215                         /*
2216                          * If there's an error and the entry is clear, then
2217                          * we've run out of swap space.  Record the error
2218                          * here and return.
2219                          */
2220                         if (error) {
2221                                 VSM_SETERR(*vsmap, error);
2222                         }
2223                         VS_MAP_UNLOCK(vs);
2224                         return (dp_offset_t) -1;
2225                 } else {
2226                         /*
2227                          * Attempt to allocate a cluster from the paging segment
2228                          */
2229                         newcl = ps_allocate_cluster(vs, &psindex,
2230                                                     PAGING_SEGMENT_NULL);
2231                         if (newcl == (dp_offset_t) -1) {
2232                                 VS_MAP_UNLOCK(vs);
2233                                 return (dp_offset_t) -1;
2234                         }
2235                         VSM_CLR(*vsmap);
2236                         VSM_SETCLOFF(*vsmap, newcl);
2237                         VSM_SETPS(*vsmap, psindex);
2238                 }
2239         } else
2240                 newcl = VSM_CLOFF(*vsmap);
2241
2242         /*
2243          * Fill in pertinent fields of the clmap
2244          */
2245         clmap->cl_ps = VSM_PS(*vsmap);
2246         clmap->cl_numpages = VSCLSIZE(vs);
2247         clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2248
2249         /*
2250          * Byte offset in paging segment is byte offset to cluster plus
2251          * byte offset within cluster.  It looks ugly, but should be
2252          * relatively quick.
2253          */
2254         ASSERT(trunc_page(offset) == offset);
2255         newcl = ptoa_32(newcl) << vs->vs_clshift;
2256         newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2257         if (flag == CL_ALLOC) {
2258                 /*
2259                  * set bits in the allocation bitmap according to which
2260                  * pages were requested.  size is in bytes.
2261                  */
2262                 i = atop_32(newoff);
2263                 while ((size > 0) && (i < VSCLSIZE(vs))) {
2264                         VSM_SETALLOC(*vsmap, i);
2265                         i++;
2266                         size -= vm_page_size;
2267                 }
2268         }
2269         clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2270         if (newoff) {
2271                 /*
2272                  * Offset is not cluster aligned, so number of pages
2273                  * and bitmaps must be adjusted
2274                  */
2275                 clmap->cl_numpages -= atop_32(newoff);
2276                 CLMAP_SHIFT(clmap, vs);
2277                 CLMAP_SHIFTALLOC(clmap, vs);
2278         }
2279
2280         /*
2281          *
2282          * The setting of valid bits and handling of write errors
2283          * must be done here, while we hold the lock on the map.
2284          * It logically should be done in ps_vs_write_complete().
2285          * The size and error information has been passed from
2286          * ps_vs_write_complete().  If the size parameter is non-zero,
2287          * then there is work to be done.  If error is also non-zero,
2288          * then the error number is recorded in the cluster and the
2289          * entire cluster is in error.
2290          */
2291         if (size && flag == CL_FIND) {
2292                 dp_offset_t off = (dp_offset_t) 0;
2293
2294                 if (!error) {
2295                         for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2296                              i++) {
2297                                 VSM_SETPG(*vsmap, i);
2298                                 size -= vm_page_size;
2299                         }
2300                         ASSERT(i <= VSCLSIZE(vs));
2301                 } else {
2302                         BS_STAT(clmap->cl_ps->ps_bs,
2303                                 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
2304                                         atop_32(size));
2305                         off = VSM_CLOFF(*vsmap);
2306                         VSM_SETERR(*vsmap, error);
2307                 }
2308                 /*
2309                  * Deallocate cluster if error, and no valid pages
2310                  * already present.
2311                  */
2312                 if (off != (dp_offset_t) 0)
2313                         ps_deallocate_cluster(clmap->cl_ps, off);
2314                 VS_MAP_UNLOCK(vs);
2315                 return (dp_offset_t) 0;
2316         } else
2317                 VS_MAP_UNLOCK(vs);
2318
2319         DP_DEBUG(DEBUG_VS_INTERNAL,
2320                  ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2321                   newcl+newoff, (int) vs, (int) vsmap, flag));
2322         DP_DEBUG(DEBUG_VS_INTERNAL,
2323                  ("     clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2324                   (int) clmap->cl_ps, clmap->cl_numpages,
2325                   (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
2326
2327         return (newcl + newoff);
2328 }
2329
2330 void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t);     /* forward */
2331
2332 void
2333 ps_clunmap(
2334         vstruct_t       vs,
2335         dp_offset_t     offset,
2336         dp_size_t       length)
2337 {
2338         dp_offset_t             cluster; /* The cluster number of offset */
2339         struct vs_map           *vsmap;
2340         struct ps_vnode_trim_data trim_data;
2341
2342         ps_vnode_trim_init(&trim_data);
2343
2344         VS_MAP_LOCK(vs);
2345
2346         /*
2347          * Loop through all clusters in this range, freeing paging segment
2348          * clusters and map entries as encountered.
2349          */
2350         while (length > 0) {
2351                 dp_offset_t     newoff;
2352                 unsigned int    i;
2353
2354                 cluster = atop_32(offset) >> vs->vs_clshift;
2355                 if (vs->vs_indirect)    /* indirect map */
2356                         vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2357                 else
2358                         vsmap = vs->vs_dmap;
2359                 if (vsmap == NULL) {
2360                         ps_vnode_trim_now(&trim_data);
2361                         VS_MAP_UNLOCK(vs);
2362                         return;
2363                 }
2364                 vsmap += cluster%CLMAP_ENTRIES;
2365                 if (VSM_ISCLR(*vsmap)) {
2366                         ps_vnode_trim_now(&trim_data);
2367                         length -= vm_page_size;
2368                         offset += vm_page_size;
2369                         continue;
2370                 }
2371                 /*
2372                  * We've got a valid mapping.  Clear it and deallocate
2373                  * paging segment cluster pages.
2374                  * Optimize for entire cluster cleraing.
2375                  */
2376                 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2377                         /*
2378                          * Not cluster aligned.
2379                          */
2380                         ASSERT(trunc_page(newoff) == newoff);
2381                         i = atop_32(newoff);
2382                 } else
2383                         i = 0;
2384                 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2385                         VSM_CLRPG(*vsmap, i);
2386                         VSM_CLRALLOC(*vsmap, i);
2387                         length -= vm_page_size;
2388                         offset += vm_page_size;
2389                         i++;
2390                 }
2391
2392                 /*
2393                  * If map entry is empty, clear and deallocate cluster.
2394                  */
2395                 if (!VSM_BMAP(*vsmap)) {
2396                         ps_vnode_trim_more(&trim_data,
2397                                               vsmap,
2398                                               vs->vs_clshift,
2399                                               VSCLSIZE(vs) * vm_page_size);
2400                         ps_deallocate_cluster(VSM_PS(*vsmap),
2401                                               VSM_CLOFF(*vsmap));
2402                         VSM_CLR(*vsmap);
2403                 } else {
2404                         ps_vnode_trim_now(&trim_data);
2405                 }
2406         }
2407         ps_vnode_trim_now(&trim_data);
2408
2409         VS_MAP_UNLOCK(vs);
2410 }
2411
2412 void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
2413
2414 void
2415 ps_vs_write_complete(
2416         vstruct_t       vs,
2417         dp_offset_t     offset,
2418         dp_size_t       size,
2419         int             error)
2420 {
2421         struct clmap    clmap;
2422
2423         /*
2424          * Get the struct vsmap for this cluster.
2425          * Use READ, even though it was written, because the
2426          * cluster MUST be present, unless there was an error
2427          * in the original ps_clmap (e.g. no space), in which
2428          * case, nothing happens.
2429          *
2430          * Must pass enough information to ps_clmap to allow it
2431          * to set the vs_map structure bitmap under lock.
2432          */
2433         (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2434 }
2435
2436 void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int);    /* forward */
2437
2438 void
2439 vs_cl_write_complete(
2440         vstruct_t                       vs,
2441         __unused paging_segment_t       ps,
2442         dp_offset_t                     offset,
2443         __unused vm_offset_t            addr,
2444         dp_size_t                       size,
2445         boolean_t                       async,
2446         int                             error)
2447 {
2448 //      kern_return_t   kr;
2449
2450         if (error) {
2451                 /*
2452                  * For internal objects, the error is recorded on a
2453                  * per-cluster basis by ps_clmap() which is called
2454                  * by ps_vs_write_complete() below.
2455                  */
2456                 dprintf(("write failed error = 0x%x\n", error));
2457                 /* add upl_abort code here */
2458         } else
2459                 GSTAT(global_stats.gs_pages_out += atop_32(size));
2460         /*
2461          * Notify the vstruct mapping code, so it can do its accounting.
2462          */
2463         ps_vs_write_complete(vs, offset, size, error);
2464
2465         if (async) {
2466                 VS_LOCK(vs);
2467                 ASSERT(vs->vs_async_pending > 0);
2468                 vs->vs_async_pending -= size;
2469                 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2470                         vs->vs_waiting_async = FALSE;
2471                         VS_UNLOCK(vs);
2472                         thread_wakeup(&vs->vs_async_pending);
2473                 } else {
2474                         VS_UNLOCK(vs);
2475                 }
2476         }
2477 }
2478
2479 #ifdef DEVICE_PAGING
2480 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2481
2482 kern_return_t
2483 device_write_reply(
2484         MACH_PORT_FACE  reply_port,
2485         kern_return_t   device_code,
2486         io_buf_len_t    bytes_written)
2487 {
2488         struct vs_async *vsa;
2489
2490         vsa = (struct vs_async *)
2491                 ((struct vstruct_alias *)(reply_port->alias))->vs;
2492
2493         if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2494                 device_code = KERN_FAILURE;
2495         }
2496
2497         vsa->vsa_error = device_code;
2498
2499
2500         ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2501         if(vsa->vsa_flags & VSA_TRANSFER) {
2502                 /* revisit when async disk segments redone */
2503                 if(vsa->vsa_error) {
2504                    /* need to consider error condition.  re-write data or */
2505                    /* throw it away here. */
2506                    vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2507                 }
2508                 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2509                                                 vsa->vsa_size, vsa->vsa_error);
2510         } else {
2511                 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2512                              vsa->vsa_addr, vsa->vsa_size, TRUE,
2513                              vsa->vsa_error);
2514         }
2515         VS_FREE_ASYNC(vsa);
2516
2517         return KERN_SUCCESS;
2518 }
2519
2520 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2521 kern_return_t
2522 device_write_reply_inband(
2523         MACH_PORT_FACE          reply_port,
2524         kern_return_t           return_code,
2525         io_buf_len_t            bytes_written)
2526 {
2527         panic("device_write_reply_inband: illegal");
2528         return KERN_SUCCESS;
2529 }
2530
2531 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2532 kern_return_t
2533 device_read_reply(
2534         MACH_PORT_FACE          reply_port,
2535         kern_return_t           return_code,
2536         io_buf_ptr_t            data,
2537         mach_msg_type_number_t  dataCnt)
2538 {
2539         struct vs_async *vsa;
2540         vsa = (struct vs_async *)
2541                 ((struct vstruct_alias *)(reply_port->alias))->vs;
2542         vsa->vsa_addr = (vm_offset_t)data;
2543         vsa->vsa_size = (vm_size_t)dataCnt;
2544         vsa->vsa_error = return_code;
2545         thread_wakeup(&vsa);
2546         return KERN_SUCCESS;
2547 }
2548
2549 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2550 kern_return_t
2551 device_read_reply_inband(
2552         MACH_PORT_FACE          reply_port,
2553         kern_return_t           return_code,
2554         io_buf_ptr_inband_t     data,
2555         mach_msg_type_number_t  dataCnt)
2556 {
2557         panic("device_read_reply_inband: illegal");
2558         return KERN_SUCCESS;
2559 }
2560
2561 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2562 kern_return_t
2563 device_read_reply_overwrite(
2564         MACH_PORT_FACE          reply_port,
2565         kern_return_t           return_code,
2566         io_buf_len_t            bytes_read)
2567 {
2568         panic("device_read_reply_overwrite: illegal\n");
2569         return KERN_SUCCESS;
2570 }
2571
2572 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2573 kern_return_t
2574 device_open_reply(
2575         MACH_PORT_FACE          reply_port,
2576         kern_return_t           return_code,
2577         MACH_PORT_FACE          device_port)
2578 {
2579         panic("device_open_reply: illegal\n");
2580         return KERN_SUCCESS;
2581 }
2582
2583 kern_return_t
2584 ps_read_device(
2585         paging_segment_t        ps,
2586         dp_offset_t             offset,
2587         vm_offset_t             *bufferp,
2588         unsigned int            size,
2589         unsigned int            *residualp,
2590         int                     flags)
2591 {
2592         kern_return_t   kr;
2593         recnum_t        dev_offset;
2594         unsigned int    bytes_wanted;
2595         unsigned int    bytes_read;
2596         unsigned int    total_read;
2597         vm_offset_t     dev_buffer;
2598         vm_offset_t     buf_ptr;
2599         unsigned int    records_read;
2600         struct vs_async *vsa;
2601
2602         device_t        device;
2603         vm_map_copy_t   device_data = NULL;
2604         default_pager_thread_t *dpt = NULL;
2605
2606         device = dev_port_lookup(ps->ps_device);
2607         clustered_reads[atop_32(size)]++;
2608
2609         dev_offset = (ps->ps_offset +
2610                       (offset >> (vm_page_shift - ps->ps_record_shift)));
2611         bytes_wanted = size;
2612         total_read = 0;
2613         *bufferp = (vm_offset_t)NULL;
2614
2615         do {
2616                 vsa = VS_ALLOC_ASYNC();
2617                 if (vsa) {
2618                         vsa->vsa_vs = NULL;
2619                         vsa->vsa_addr = 0;
2620                         vsa->vsa_offset = 0;
2621                         vsa->vsa_size = 0;
2622                         vsa->vsa_ps = NULL;
2623                 }
2624                 ip_lock(vsa->reply_port);
2625                 vsa->reply_port->ip_sorights++;
2626                 ip_reference(vsa->reply_port);
2627                 ip_unlock(vsa->reply_port);
2628                 kr = ds_device_read_common(device,
2629                                  vsa->reply_port,
2630                                  (mach_msg_type_name_t)
2631                                         MACH_MSG_TYPE_MOVE_SEND_ONCE,
2632                                  (dev_mode_t) 0,
2633                                  dev_offset,
2634                                  bytes_wanted,
2635                                  (IO_READ | IO_CALL),
2636                                  (io_buf_ptr_t *) &dev_buffer,
2637                                  (mach_msg_type_number_t *) &bytes_read);
2638                 if(kr == MIG_NO_REPLY) {
2639                         assert_wait(&vsa, THREAD_UNINT);
2640                         thread_block(THREAD_CONTINUE_NULL);
2641
2642                         dev_buffer = vsa->vsa_addr;
2643                         bytes_read = (unsigned int)vsa->vsa_size;
2644                         kr = vsa->vsa_error;
2645                 }
2646                 VS_FREE_ASYNC(vsa);
2647                 if (kr != KERN_SUCCESS || bytes_read == 0) {
2648                         break;
2649                 }
2650                 total_read += bytes_read;
2651
2652                 /*
2653                  * If we got the entire range, use the returned dev_buffer.
2654                  */
2655                 if (bytes_read == size) {
2656                         *bufferp = (vm_offset_t)dev_buffer;
2657                         break;
2658                 }
2659
2660 #if 1
2661                 dprintf(("read only %d bytes out of %d\n",
2662                          bytes_read, bytes_wanted));
2663 #endif
2664                 if(dpt == NULL) {
2665                         dpt = get_read_buffer();
2666                         buf_ptr = dpt->dpt_buffer;
2667                         *bufferp = (vm_offset_t)buf_ptr;
2668                 }
2669                 /*
2670                  * Otherwise, copy the data into the provided buffer (*bufferp)
2671                  * and append the rest of the range as it comes in.
2672                  */
2673                 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2674                 buf_ptr += bytes_read;
2675                 bytes_wanted -= bytes_read;
2676                 records_read = (bytes_read >>
2677                                 (vm_page_shift - ps->ps_record_shift));
2678                 dev_offset += records_read;
2679                 DP_DEBUG(DEBUG_VS_INTERNAL,
2680                          ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2681                           dev_buffer, bytes_read));
2682                 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2683                     != KERN_SUCCESS)
2684                         Panic("dealloc buf");
2685         } while (bytes_wanted);
2686
2687         *residualp = size - total_read;
2688         if((dev_buffer != *bufferp) && (total_read != 0)) {
2689                 vm_offset_t temp_buffer;
2690                 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2691                 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2692                 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2693                         VM_MAP_COPYIN_OPT_SRC_DESTROY |
2694                         VM_MAP_COPYIN_OPT_STEAL_PAGES |
2695                         VM_MAP_COPYIN_OPT_PMAP_ENTER,
2696                         (vm_map_copy_t *)&device_data, FALSE))
2697                                 panic("ps_read_device: cannot copyin locally provided buffer\n");
2698         }
2699         else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2700                 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2701                         VM_MAP_COPYIN_OPT_SRC_DESTROY |
2702                         VM_MAP_COPYIN_OPT_STEAL_PAGES |
2703                         VM_MAP_COPYIN_OPT_PMAP_ENTER,
2704                         (vm_map_copy_t *)&device_data, FALSE))
2705                                 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2706         }
2707         else {
2708                 device_data = NULL;
2709         }
2710         *bufferp = (vm_offset_t)device_data;
2711
2712         if(dpt != NULL) {
2713                 /* Free the receive buffer */
2714                 dpt->checked_out = 0;
2715                 thread_wakeup(&dpt_array);
2716         }
2717         return KERN_SUCCESS;
2718 }
2719
2720 kern_return_t
2721 ps_write_device(
2722         paging_segment_t        ps,
2723         dp_offset_t             offset,
2724         vm_offset_t             addr,
2725         unsigned int            size,
2726         struct vs_async         *vsa)
2727 {
2728         recnum_t        dev_offset;
2729         io_buf_len_t    bytes_to_write, bytes_written;
2730         recnum_t        records_written;
2731         kern_return_t   kr;
2732         MACH_PORT_FACE  reply_port;
2733
2734
2735
2736         clustered_writes[atop_32(size)]++;
2737
2738         dev_offset = (ps->ps_offset +
2739                       (offset >> (vm_page_shift - ps->ps_record_shift)));
2740         bytes_to_write = size;
2741
2742         if (vsa) {
2743                 /*
2744                  * Asynchronous write.
2745                  */
2746                 reply_port = vsa->reply_port;
2747                 ip_lock(reply_port);
2748                 reply_port->ip_sorights++;
2749                 ip_reference(reply_port);
2750                 ip_unlock(reply_port);
2751                 {
2752                 device_t        device;
2753                 device = dev_port_lookup(ps->ps_device);
2754
2755                 vsa->vsa_addr = addr;
2756                 kr=ds_device_write_common(device,
2757                         reply_port,
2758                         (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2759                         (dev_mode_t) 0,
2760                         dev_offset,
2761                         (io_buf_ptr_t)  addr,
2762                         size,
2763                         (IO_WRITE | IO_CALL),
2764                         &bytes_written);
2765                 }
2766                 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2767                         if (verbose)
2768                                 dprintf(("%s0x%x, addr=0x%x,"
2769                                          "size=0x%x,offset=0x%x\n",
2770                                          "device_write_request returned ",
2771                                          kr, addr, size, offset));
2772                         BS_STAT(ps->ps_bs,
2773                                 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2774                         /* do the completion notification to free resources */
2775                         device_write_reply(reply_port, kr, 0);
2776                         return PAGER_ERROR;
2777                 }
2778         } else do {
2779                 /*
2780                  * Synchronous write.
2781                  */
2782                 {
2783                 device_t        device;
2784                 device = dev_port_lookup(ps->ps_device);
2785                 kr=ds_device_write_common(device,
2786                         IP_NULL, 0,
2787                         (dev_mode_t) 0,
2788                         dev_offset,
2789                         (io_buf_ptr_t)  addr,
2790                         size,
2791                         (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2792                         &bytes_written);
2793                 }
2794                 if (kr != KERN_SUCCESS) {
2795                         dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2796                                  "device_write returned ",
2797                                  kr, addr, size, offset));
2798                         BS_STAT(ps->ps_bs,
2799                                 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2800                         return PAGER_ERROR;
2801                 }
2802                 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2803                         Panic("fragmented write");
2804                 records_written = (bytes_written >>
2805                                    (vm_page_shift - ps->ps_record_shift));
2806                 dev_offset += records_written;
2807 #if 1
2808                 if (bytes_written != bytes_to_write) {
2809                         dprintf(("wrote only %d bytes out of %d\n",
2810                                  bytes_written, bytes_to_write));
2811                 }
2812 #endif
2813                 bytes_to_write -= bytes_written;
2814                 addr += bytes_written;
2815         } while (bytes_to_write > 0);
2816
2817         return PAGER_SUCCESS;
2818 }
2819
2820
2821 #else /* !DEVICE_PAGING */
2822
2823 kern_return_t
2824 ps_read_device(
2825         __unused paging_segment_t       ps,
2826         __unused dp_offset_t            offset,
2827         __unused vm_offset_t            *bufferp,
2828         __unused unsigned int           size,
2829         __unused unsigned int           *residualp,
2830         __unused int                            flags)
2831 {
2832   panic("ps_read_device not supported");
2833   return KERN_FAILURE;
2834 }
2835
2836 kern_return_t
2837 ps_write_device(
2838         __unused paging_segment_t       ps,
2839         __unused dp_offset_t            offset,
2840         __unused vm_offset_t            addr,
2841         __unused unsigned int           size,
2842         __unused struct vs_async        *vsa)
2843 {
2844   panic("ps_write_device not supported");
2845   return KERN_FAILURE;
2846 }
2847
2848 #endif /* DEVICE_PAGING */
2849 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t);      /* forward */
2850
2851 void
2852 pvs_object_data_provided(
2853         __unused vstruct_t              vs,
2854         __unused upl_t                  upl,
2855         __unused upl_offset_t   offset,
2856         upl_size_t                              size)
2857 {
2858
2859         DP_DEBUG(DEBUG_VS_INTERNAL,
2860                  ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2861                   upl, offset, size));
2862
2863         ASSERT(size > 0);
2864         GSTAT(global_stats.gs_pages_in += atop_32(size));
2865
2866 /* check upl iosync flag instead of using RECLAIM_SWAP*/
2867 #if     RECLAIM_SWAP
2868         if (size != upl->size) {
2869                 upl_abort(upl, UPL_ABORT_ERROR);
2870                 upl_deallocate(upl);
2871         } else {
2872                 ps_clunmap(vs, offset, size);
2873                 upl_commit(upl, NULL, 0);
2874                 upl_deallocate(upl);
2875         }
2876 #endif  /* RECLAIM_SWAP */
2877
2878 }
2879
2880 static memory_object_offset_t   last_start;
2881 static vm_size_t                last_length;
2882
2883 /*
2884  * A "cnt" of 0 means that the caller just wants to check if the page at
2885  * offset "vs_offset" exists in the backing store.  That page hasn't been
2886  * prepared, so no need to release it.
2887  *
2888  * A "cnt" of -1 means that the caller wants to bring back from the backing
2889  * store all existing pages in the cluster containing "vs_offset".
2890  */
2891 kern_return_t
2892 pvs_cluster_read(
2893         vstruct_t       vs,
2894         dp_offset_t     vs_offset,
2895         dp_size_t       cnt,
2896         void            *fault_info)
2897 {
2898         kern_return_t           error = KERN_SUCCESS;
2899         unsigned int            size;
2900         unsigned int            residual;
2901         unsigned int            request_flags;
2902         int                     io_flags = 0;
2903         int                     seg_index;
2904         int                     pages_in_cl;
2905         int                     cl_size;
2906         int                     cl_mask;
2907         int                     cl_index;
2908         unsigned int            xfer_size;
2909         dp_offset_t             orig_vs_offset;
2910         dp_offset_t       ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2911         paging_segment_t        psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2912         struct clmap            clmap;
2913         upl_t                   upl;
2914         unsigned int            page_list_count;
2915         memory_object_offset_t  cluster_start;
2916         vm_size_t               cluster_length;
2917         uint32_t                io_streaming;
2918         int                     i;
2919         boolean_t               io_sync = FALSE;
2920
2921         pages_in_cl = 1 << vs->vs_clshift;
2922         cl_size = pages_in_cl * vm_page_size;
2923         cl_mask = cl_size - 1;
2924
2925         request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2926
2927         if (cnt == (dp_size_t) -1) {
2928                 /*
2929                  * We've been called from ps_vstruct_reclaim() to move all
2930                  * the object's swapped pages back to VM pages.
2931                  * This can put memory pressure on the system, so we do want
2932                  * to wait for free pages, to avoid getting in the way of the
2933                  * vm_pageout_scan() thread.
2934                  * Let's not use UPL_NOBLOCK in this case.
2935                  */
2936                 vs_offset &= ~cl_mask;
2937                 i = pages_in_cl;
2938         } else {
2939                 i = 1;
2940
2941                 /*
2942                  * if the I/O cluster size == PAGE_SIZE, we don't want to set
2943                  * the UPL_NOBLOCK since we may be trying to recover from a
2944                  * previous partial pagein I/O that occurred because we were low
2945                  * on memory and bailed early in order to honor the UPL_NOBLOCK...
2946                  * since we're only asking for a single page, we can block w/o fear
2947                  * of tying up pages while waiting for more to become available
2948                  */
2949                 if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE)
2950                         request_flags |= UPL_NOBLOCK;
2951         }
2952
2953 again:
2954         cl_index = (vs_offset & cl_mask) / vm_page_size;
2955
2956         if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2957             !CLMAP_ISSET(clmap, cl_index)) {
2958                 /*
2959                  * the needed page doesn't exist in the backing store...
2960                  * we don't want to try to do any I/O, just abort the
2961                  * page and let the fault handler provide a zero-fill
2962                  */
2963                 if (cnt == 0) {
2964                         /*
2965                          * The caller was just poking at us to see if
2966                          * the page has been paged out.  No need to
2967                          * mess with the page at all.
2968                          * Just let the caller know we don't have that page.
2969                          */
2970                         return KERN_FAILURE;
2971                 }
2972                 if (cnt == (dp_size_t) -1) {
2973                         i--;
2974                         if (i == 0) {
2975                                 /* no more pages in this cluster */
2976                                 return KERN_FAILURE;
2977                         }
2978                         /* try the next page in this cluster */
2979                         vs_offset += vm_page_size;
2980                         goto again;
2981                 }
2982
2983                 page_list_count = 0;
2984
2985                 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2986                                                 PAGE_SIZE, PAGE_SIZE,
2987                                                 &upl, NULL, &page_list_count,
2988                                                 request_flags  | UPL_SET_INTERNAL);
2989                 upl_range_needed(upl, 0, 1);
2990
2991                 if (clmap.cl_error)
2992                         upl_abort(upl, UPL_ABORT_ERROR);
2993                 else
2994                         upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2995                 upl_deallocate(upl);
2996
2997                 return KERN_SUCCESS;
2998         }
2999
3000         if (cnt == 0) {
3001                 /*
3002                  * The caller was just poking at us to see if
3003                  * the page has been paged out.  No need to
3004                  * mess with the page at all.
3005                  * Just let the caller know we do have that page.
3006                  */
3007                 return KERN_SUCCESS;
3008         }
3009
3010         if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) {
3011                 io_sync = TRUE;
3012         } else {
3013 #if RECLAIM_SWAP
3014                 io_sync = TRUE;
3015 #endif  /* RECLAIM_SWAP */
3016         }
3017
3018         if( io_sync == TRUE ) {
3019
3020                 io_flags |= UPL_IOSYNC | UPL_NOCOMMIT;
3021 #if USE_PRECIOUS
3022                 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
3023 #else   /* USE_PRECIOUS */
3024                 request_flags |= UPL_REQUEST_SET_DIRTY;
3025 #endif  /* USE_PRECIOUS */
3026         }
3027
3028         assert(dp_encryption_inited);
3029         if (dp_encryption) {
3030                 /*
3031                  * ENCRYPTED SWAP:
3032                  * request that the UPL be prepared for
3033                  * decryption.
3034                  */
3035                 request_flags |= UPL_ENCRYPT;
3036                 io_flags |= UPL_PAGING_ENCRYPTED;
3037         }
3038         orig_vs_offset = vs_offset;
3039
3040         assert(cnt != 0);
3041         cnt = VM_SUPER_CLUSTER;
3042         cluster_start = (memory_object_offset_t) vs_offset;
3043         cluster_length = (vm_size_t) cnt;
3044         io_streaming = 0;
3045
3046         /*
3047          * determine how big a speculative I/O we should try for...
3048          */
3049         if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
3050                 assert(vs_offset >= (dp_offset_t) cluster_start &&
3051                        vs_offset < (dp_offset_t) (cluster_start + cluster_length));
3052                 vs_offset = (dp_offset_t) cluster_start;
3053                 cnt = (dp_size_t) cluster_length;
3054         } else {
3055                 cluster_length = PAGE_SIZE;
3056                 cnt = PAGE_SIZE;
3057         }
3058
3059         if (io_streaming)
3060                 io_flags |= UPL_IOSTREAMING;
3061
3062         last_start = cluster_start;
3063         last_length = cluster_length;
3064
3065         /*
3066          * This loop will be executed multiple times until the entire
3067          * range has been looked at or we issue an I/O... if the request spans cluster
3068          * boundaries, the clusters will be checked for logical continuity,
3069          * if contiguous the I/O request will span multiple clusters...
3070          * at most only 1 I/O will be issued... it will encompass the original offset
3071          */
3072         while (cnt && error == KERN_SUCCESS) {
3073                 int     ps_info_valid;
3074
3075                 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
3076                         size = VM_SUPER_CLUSTER;
3077                         size -= vs_offset & cl_mask;
3078                 } else if (cnt > VM_SUPER_CLUSTER)
3079                         size = VM_SUPER_CLUSTER;
3080                 else
3081                         size = cnt;
3082
3083                 cnt -= size;
3084
3085                 ps_info_valid = 0;
3086                 seg_index     = 0;
3087
3088                 while (size > 0 && error == KERN_SUCCESS) {
3089                         unsigned int  abort_size;
3090                         int           failed_size;
3091                         int           beg_pseg;
3092                         int           beg_indx;
3093                         dp_offset_t   cur_offset;
3094
3095                         if ( !ps_info_valid) {
3096                                 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3097                                 psp[seg_index]       = CLMAP_PS(clmap);
3098                                 ps_info_valid = 1;
3099                         }
3100                         /*
3101                          * skip over unallocated physical segments
3102                          */
3103                         if (ps_offset[seg_index] == (dp_offset_t) -1) {
3104                                 abort_size = cl_size - (vs_offset & cl_mask);
3105                                 abort_size = MIN(abort_size, size);
3106
3107                                 size      -= abort_size;
3108                                 vs_offset += abort_size;
3109
3110                                 seg_index++;
3111                                 ps_info_valid = 0;
3112
3113                                 continue;
3114                         }
3115                         cl_index = (vs_offset & cl_mask) / vm_page_size;
3116
3117                         for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
3118                                 /*
3119                                  * skip over unallocated pages
3120                                  */
3121                                 if (CLMAP_ISSET(clmap, cl_index))
3122                                         break;
3123                                 abort_size += vm_page_size;
3124                         }
3125                         if (abort_size) {
3126                                 size      -= abort_size;
3127                                 vs_offset += abort_size;
3128
3129                                 if (cl_index == pages_in_cl) {
3130                                         /*
3131                                          * if we're at the end of this physical cluster
3132                                          * then bump to the next one and continue looking
3133                                          */
3134                                         seg_index++;
3135                                         ps_info_valid = 0;
3136
3137                                         continue;
3138                                 }
3139                                 if (size == 0)
3140                                         break;
3141                         }
3142                         /*
3143                          * remember the starting point of the first allocated page
3144                          * for the I/O we're about to issue
3145                          */
3146                         beg_pseg   = seg_index;
3147                         beg_indx   = cl_index;
3148                         cur_offset = vs_offset;
3149
3150                         /*
3151                          * calculate the size of the I/O that we can do...
3152                          * this may span multiple physical segments if
3153                          * they are contiguous
3154                          */
3155                         for (xfer_size = 0; xfer_size < size; ) {
3156
3157                                 while (cl_index < pages_in_cl && xfer_size < size) {
3158                                         /*
3159                                          * accumulate allocated pages within
3160                                          * a physical segment
3161                                          */
3162                                         if (CLMAP_ISSET(clmap, cl_index)) {
3163                                                 xfer_size  += vm_page_size;
3164                                                 cur_offset += vm_page_size;
3165                                                 cl_index++;
3166
3167                                                 BS_STAT(psp[seg_index]->ps_bs,
3168                                                         psp[seg_index]->ps_bs->bs_pages_in++);
3169                                         } else
3170                                                 break;
3171                                 }
3172                                 if (cl_index < pages_in_cl || xfer_size >= size) {
3173                                         /*
3174                                          * we've hit an unallocated page or
3175                                          * the end of this request... see if
3176                                          * it's time to fire the I/O
3177                                          */
3178                                         break;
3179                                 }
3180                                 /*
3181                                  * we've hit the end of the current physical
3182                                  * segment and there's more to do, so try
3183                                  * moving to the next one
3184                                  */
3185                                 seg_index++;
3186
3187                                 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3188                                 psp[seg_index] = CLMAP_PS(clmap);
3189                                 ps_info_valid = 1;
3190
3191                                 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
3192                                         /*
3193                                          * if the physical segment we're about
3194                                          * to step into is not contiguous to
3195                                          * the one we're currently in, or it's
3196                                          * in a different paging file, or
3197                                          * it hasn't been allocated....
3198                                          * we stop this run and go check
3199                                          * to see if it's time to fire the I/O
3200                                          */
3201                                         break;
3202                                 }
3203                                 /*
3204                                  * start with first page of the next physical
3205                                  * segment
3206                                  */
3207                                 cl_index = 0;
3208                         }
3209                         if (xfer_size == 0) {
3210                                 /*
3211                                  * no I/O to generate for this segment
3212                                  */
3213                                 continue;
3214                         }
3215                         if (cur_offset <= orig_vs_offset) {
3216                                 /*
3217                                  * we've hit a hole in our speculative cluster
3218                                  * before the offset that we're really after...
3219                                  * don't issue the I/O since it doesn't encompass
3220                                  * the original offset and we're looking to only
3221                                  * pull in the speculative pages if they can be
3222                                  * made part of a single I/O
3223                                  */
3224                                 size      -= xfer_size;
3225                                 vs_offset += xfer_size;
3226
3227                                 continue;
3228                         }
3229                         /*
3230                          * we have a contiguous range of allocated pages
3231                          * to read from that encompasses the original offset
3232                          */
3233                         page_list_count = 0;
3234                         memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3235                                                         xfer_size, xfer_size,
3236                                                         &upl, NULL, &page_list_count,
3237                                                         request_flags | UPL_SET_INTERNAL);
3238
3239                         error = ps_read_file(psp[beg_pseg],
3240                                              upl, (upl_offset_t) 0,
3241                                              ps_offset[beg_pseg] + (beg_indx * vm_page_size),
3242                                              xfer_size, &residual, io_flags);
3243
3244                         failed_size = 0;
3245
3246                         /*
3247                          * Adjust counts and send response to VM.  Optimize
3248                          * for the common case, i.e. no error and/or partial
3249                          * data. If there was an error, then we need to error
3250                          * the entire range, even if some data was successfully
3251                          * read. If there was a partial read we may supply some
3252                          * data and may error some as well.  In all cases the
3253                          * VM must receive some notification for every page
3254                          * in the range.
3255                          */
3256                         if ((error == KERN_SUCCESS) && (residual == 0)) {
3257                                 /*
3258                                  * Got everything we asked for, supply the data
3259                                  * to the VM.  Note that as a side effect of
3260                                  * supplying the data, the buffer holding the
3261                                  * supplied data is deallocated from the pager's
3262                                  *  address space.
3263                                  */
3264                                 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
3265                         } else {
3266                                 failed_size = xfer_size;
3267
3268                                 if (error == KERN_SUCCESS) {
3269                                         if (residual == xfer_size) {
3270                                                 /*
3271                                                  * If a read operation returns no error
3272                                                  * and no data moved, we turn it into
3273                                                  * an error, assuming we're reading at
3274                                                  * or beyond EOF.
3275                                                  * Fall through and error the entire range.
3276                                                  */
3277                                                 error = KERN_FAILURE;
3278                                         } else {
3279                                                 /*
3280                                                  * Otherwise, we have a partial read. If
3281                                                  * the part read is an integral number
3282                                                  * of pages supply it. Otherwise round
3283                                                  * it up to a page boundary, zero fill
3284                                                  * the unread part, and supply it.
3285                                                  * Fall through and error the remainder
3286                                                  * of the range, if any.
3287                                                  */
3288                                                 int fill;
3289                                                 unsigned int lsize;
3290
3291                                                 fill = residual & ~vm_page_size;
3292                                                 lsize = (xfer_size - residual) + fill;
3293
3294                                                 pvs_object_data_provided(vs, upl, vs_offset, lsize);
3295
3296                                                 if (lsize < xfer_size) {
3297                                                         failed_size = xfer_size - lsize;
3298                                                         error = KERN_FAILURE;
3299                                                 }
3300                                         }
3301                                 }
3302                         }
3303                         if (error != KERN_SUCCESS) {
3304                                 /*
3305                                  * There was an error in some part of the range, tell
3306                                  * the VM. Note that error is explicitly checked again
3307                                  * since it can be modified above.
3308                                  */
3309                                 BS_STAT(psp[beg_pseg]->ps_bs,
3310                                         psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
3311                         }
3312                         /*
3313                          * we've issued a single I/O that encompassed the original offset
3314                          * at this point we either met our speculative request length or
3315                          * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3316                          * not present or not physically contiguous to the previous one), so
3317                          * we're done issuing I/O at this point
3318                          */
3319                         return (error);
3320                 }
3321         }
3322         return error;
3323 }
3324
3325 int vs_do_async_write = 1;	/* file-scope int with external linkage, initialized to 1; presumably a switch for asynchronous pageout writes -- TODO confirm at its use sites */
3326
3327 kern_return_t
3328 vs_cluster_write(
3329         vstruct_t       vs,
3330         upl_t           internal_upl,
3331         upl_offset_t    offset,
3332         upl_size_t      cnt,
3333         boolean_t       dp_internal,
3334         int             flags)
3335 {
3336         upl_size_t      transfer_size;
3337         int             error = 0;
3338         struct clmap    clmap;
3339
3340         dp_offset_t     actual_offset;  /* Offset within paging segment */
3341         paging_segment_t ps;
3342         dp_offset_t     mobj_base_addr;
3343         dp_offset_t     mobj_target_addr;
3344
3345         upl_t           upl;
3346         upl_page_info_t *pl;
3347         int             page_index;
3348         unsigned int    page_max_index;
3349         int             list_size;
3350         int             pages_in_cl;
3351         unsigned int    cl_size;
3352         int             base_index;
3353         unsigned int    seg_size;
3354         unsigned int    upl_offset_in_object;
3355         boolean_t       minimal_clustering = FALSE;
3356         boolean_t       found_dirty;
3357
3358         if (!dp_encryption_inited) {
3359                 /*
3360                  * ENCRYPTED SWAP:
3361                  * Once we've started using swap, we
3362                  * can't change our mind on whether
3363                  * it needs to be encrypted or
3364                  * not.
3365                  */
3366                 dp_encryption_inited = TRUE;
3367         }
3368         if (dp_encryption) {
3369                 /*
3370                  * ENCRYPTED SWAP:
3371                  * the UPL will need to be encrypted...
3372                  */
3373                 flags |= UPL_PAGING_ENCRYPTED;
3374         }
3375
3376         pages_in_cl = 1 << vs->vs_clshift;
3377         cl_size = pages_in_cl * vm_page_size;
3378
3379 #if CONFIG_FREEZE
3380         minimal_clustering = TRUE;
3381 #else
3382         if (dp_isssd == TRUE)
3383                 minimal_clustering = TRUE;
3384 #endif
3385         if (!dp_internal) {
3386                 unsigned int page_list_count;
3387                 int          request_flags;
3388                 unsigned int super_size;
3389                 int          first_dirty;
3390                 int          num_dirty;
3391                 int          num_of_pages;
3392                 int          seg_index;
3393                 upl_offset_t  upl_offset;
3394                 upl_offset_t  upl_offset_aligned;
3395                 dp_offset_t  seg_offset;
3396                 dp_offset_t  ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3397                 paging_segment_t   psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3398
3399
3400                 if (bs_low)
3401                         super_size = cl_size;
3402                 else
3403                         super_size = VM_SUPER_CLUSTER;
3404
3405                 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3406                                 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3407                                 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3408
3409                 if (dp_encryption) {
3410                         /*
3411                          * ENCRYPTED SWAP:
3412                          * request that the UPL be prepared for
3413                          * encryption.
3414                          */
3415                         request_flags |= UPL_ENCRYPT;
3416                         flags |= UPL_PAGING_ENCRYPTED;
3417                 }
3418
3419                 page_list_count = 0;
3420                 memory_object_super_upl_request(vs->vs_control,
3421                                 (memory_object_offset_t)offset,
3422                                 cnt, super_size,
3423                                 &upl, NULL, &page_list_count,
3424                                 request_flags | UPL_FOR_PAGEOUT);
3425
3426                 /*
3427                  * The default pager does not handle objects larger than
3428                  * 4GB, so it does not deal with offset that don't fit in
3429                  * 32-bit.  Cast down upl->offset now and make sure we
3430                  * did not lose any valuable bits.
3431                  */
3432                 upl_offset_in_object = (unsigned int) upl->offset;
3433                 assert(upl->offset == upl_offset_in_object);
3434
3435                 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3436
3437                 seg_size = cl_size - (upl_offset_in_object % cl_size);
3438                 upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
3439                 page_index = 0;
3440                 page_max_index = upl->size / PAGE_SIZE;
3441                 found_dirty = TRUE;
3442
3443                 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
3444
3445                         unsigned int    seg_pgcnt;
3446
3447                         seg_pgcnt = seg_size / PAGE_SIZE;
3448
3449                         if (minimal_clustering == TRUE) {
3450                                 unsigned int    non_dirty;
3451
3452                                 non_dirty = 0;
3453                                 found_dirty = FALSE;
3454
3455                                 for (; non_dirty < seg_pgcnt; non_dirty++) {
3456                                         if ((page_index + non_dirty) >= page_max_index)
3457                                                 break;
3458
3459                                         if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
3460                                             UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
3461                                                 found_dirty = TRUE;
3462                                                 break;
3463                                         }
3464                                 }
3465                         }
3466                         if (found_dirty == TRUE) {
3467                                 ps_offset[seg_index] =
3468                                         ps_clmap(vs,
3469                                                  upl_offset_aligned,
3470                                                  &clmap, CL_ALLOC,
3471                                                  cl_size, 0);
3472
3473                                 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3474                                         upl_abort(upl, 0);
3475                                         upl_deallocate(upl);
3476
3477                                         return KERN_FAILURE;
3478                                 }
3479                                 psp[seg_index] = CLMAP_PS(clmap);
3480                         }
3481                         if (transfer_size > seg_size) {
3482                                 page_index += seg_pgcnt;
3483                                 transfer_size -= seg_size;
3484                                 upl_offset_aligned += cl_size;
3485                                 seg_size = cl_size;
3486                                 seg_index++;
3487                         } else
3488                                 transfer_size = 0;
3489                 }
3490                 /*
3491                  * Ignore any non-present pages at the end of the
3492                  * UPL.
3493                  */
3494                 for (page_index = upl->size / vm_page_size; page_index > 0;)  {
3495                         if (UPL_PAGE_PRESENT(pl, --page_index)) {
3496                                 page_index++;
3497                                 break;
3498                         }
3499                 }
3500                 if (page_index == 0) {
3501                         /*
3502                          * no pages in the UPL
3503                          * abort and return
3504                          */
3505                         upl_abort(upl, 0);
3506                         upl_deallocate(upl);
3507
3508                         return KERN_SUCCESS;
3509                 }
3510                 num_of_pages = page_index;
3511
3512                 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
3513
3514                 for (page_index = 0; page_index < num_of_pages; ) {
3515                         /*
3516                          * skip over non-dirty pages
3517                          */
3518                         for ( ; page_index < num_of_pages; page_index++) {
3519                                 if (UPL_DIRTY_PAGE(pl, page_index)
3520                                         || UPL_PRECIOUS_PAGE(pl, page_index))
3521                                         /*
3522                                          * this is a page we need to write
3523                                          * go see if we can buddy it up with
3524                                          * others that are contiguous to it
3525                                          */
3526                                         break;
3527                                 /*
3528                                  * if the page is not-dirty, but present we
3529                                  * need to commit it...  This is an unusual
3530                                  * case since we only asked for dirty pages
3531                                  */
3532                                 if (UPL_PAGE_PRESENT(pl, page_index)) {
3533                                         boolean_t empty = FALSE;
3534                                         upl_commit_range(upl,
3535                                                  page_index * vm_page_size,
3536                                                  vm_page_size,
3537                                                  UPL_COMMIT_NOTIFY_EMPTY,
3538                                                  pl,
3539                                                  page_list_count,
3540                                                  &empty);
3541                                         if (empty) {
3542                                                 assert(page_index ==
3543                                                        num_of_pages - 1);
3544                                                 upl_deallocate(upl);
3545                                         }
3546                                 }
3547                         }
3548                         if (page_index == num_of_pages)
3549                                 /*
3550                                  * no more pages to look at, we're out of here
3551                                  */
3552                                 break;
3553
3554                         /*
3555                          * gather up contiguous dirty pages... we have at
3556                          * least 1, otherwise we would have bailed above
3557                          * make sure that each physical segment that we step
3558                          * into is contiguous to the one we're currently in
3559                          * if it's not, we have to stop and write what we have
3560                          */
3561                         for (first_dirty = page_index;
3562                                         page_index < num_of_pages; ) {
3563                                 if ( !UPL_DIRTY_PAGE(pl, page_index)
3564                                         && !UPL_PRECIOUS_PAGE(pl, page_index))
3565                                         break;
3566                                 page_index++;
3567                                 /*
3568                                  * if we just looked at the last page in the UPL
3569                                  * we don't need to check for physical segment
3570                                  * continuity
3571                                  */
3572                                 if (page_index < num_of_pages) {
3573                                         int cur_seg;
3574                                         int nxt_seg;
3575
3576                                         cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3577                                         nxt_seg = (base_index + page_index)/pages_in_cl;
3578
3579                                         if (cur_seg != nxt_seg) {
3580                                                 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3581                                                 /*
3582                                                  * if the segment we're about
3583                                                  * to step into is not
3584                                                  * contiguous to the one we're
3585                                                  * currently in, or it's in a
3586                                                  * different paging file....
3587                                                  * we stop here and generate
3588                                                  * the I/O
3589                                                  */
3590                                                         break;
3591                                         }
3592                                 }
3593                         }
3594                         num_dirty = page_index - first_dirty;
3595
3596                         if (num_dirty) {
3597                                 upl_offset = first_dirty * vm_page_size;
3598                                 transfer_size = num_dirty * vm_page_size;
3599
3600                                 while (transfer_size) {
3601
3602                                         if ((seg_size = cl_size -
3603                                                 ((upl_offset_in_object +
3604                                                   upl_offset) % cl_size))
3605                                                         > transfer_size)
3606                                                 seg_size = transfer_size;
3607
3608                                         ps_vs_write_complete(
3609                                                 vs,
3610                                                 (upl_offset_in_object +
3611                                                  upl_offset),
3612                                                 seg_size, error);
3613
3614                                         transfer_size -= seg_size;
3615                                         upl_offset += seg_size;
3616                                 }
3617                                 upl_offset = first_dirty * vm_page_size;
3618                                 transfer_size = num_dirty * vm_page_size;
3619
3620                                 seg_index  = (base_index + first_dirty) / pages_in_cl;
3621                                 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
3622
3623                                 error = ps_write_file(psp[seg_index],
3624                                                 upl, upl_offset,
3625                                                 ps_offset[seg_index]
3626                                                                 + seg_offset,
3627                                                 transfer_size, flags);
3628                         }
3629                 }
3630
3631         } else {
3632                 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
3633                 list_size = cnt;
3634
3635                 page_index = 0;
3636                 /* The caller provides a mapped_data which is derived  */
3637                 /* from a temporary object.  The targeted pages are    */
3638                 /* guaranteed to be set at offset 0 in the mapped_data */
3639                 /* The actual offset however must still be derived     */
3640                 /* from the offset in the vs in question               */
3641                 mobj_base_addr = offset;
3642                 mobj_target_addr = mobj_base_addr;
3643
3644                 for (transfer_size = list_size; transfer_size != 0;) {
3645                         actual_offset = ps_clmap(vs, mobj_target_addr,
3646                                 &clmap, CL_ALLOC,
3647                                 transfer_size < cl_size ?
3648                                         transfer_size : cl_size, 0);
3649                         if(actual_offset == (dp_offset_t) -1) {
3650                                 error = 1;
3651                                 break;
3652                         }
3653                         cnt = MIN(transfer_size,
3654                                   (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
3655                         ps = CLMAP_PS(clmap);
3656                         /* Assume that the caller has given us contiguous */
3657                         /* pages */
3658                         if(cnt) {
3659                                 ps_vs_write_complete(vs, mobj_target_addr,
3660                                                                 cnt, error);
3661                                 error = ps_write_file(ps, internal_upl,
3662                                                 0, actual_offset,
3663                                                 cnt, flags);
3664                                 if (error)
3665                                         break;
3666                            }
3667                         if (error)
3668                                 break;
3669                         actual_offset += cnt;
3670                         mobj_target_addr += cnt;
3671                         transfer_size -= cnt;
3672                         cnt = 0;
3673
3674                         if (error)
3675                                 break;
3676                 }
3677         }
3678         if(error)
3679                 return KERN_FAILURE;
3680         else
3681                 return KERN_SUCCESS;
3682 }
3683
3684 vm_size_t
3685 ps_vstruct_allocated_size(
3686         vstruct_t       vs)
3687 {
3688         int             num_pages;
3689         struct vs_map   *vsmap;
3690         unsigned int    i, j, k;
3691
             /*
              * Count the pages that the cluster maps of "vs" record as
              * allocated and return that count converted with ptoa_32().
              *
              * A page is counted when its bit is set in VSM_BMAP() of a
              * cluster entry that is neither clear (VSM_ISCLR) nor in error
              * (VSM_ISERR).  Unallocated indirect blocks are skipped, and a
              * direct-mapped vs without a map yields 0.
              */

3692         num_pages = 0;
3693         if (vs->vs_indirect) {
3694                 /* loop on indirect maps */
3695                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3696                         vsmap = vs->vs_imap[i];
3697                         if (vsmap == NULL)
3698                                 continue;
3699                         /* loop on clusters in this indirect map */
3700                         for (j = 0; j < CLMAP_ENTRIES; j++) {
3701                                 if (VSM_ISCLR(vsmap[j]) ||
3702                                     VSM_ISERR(vsmap[j]))
3703                                         continue;
3704                                 /* loop on pages in this cluster */
3705                                 for (k = 0; k < VSCLSIZE(vs); k++) {
3706                                         if ((VSM_BMAP(vsmap[j])) & (1 << k))
3707                                                 num_pages++;
3708                                 }
3709                         }
3710                 }
3711         } else {
3712                 vsmap = vs->vs_dmap;
3713                 if (vsmap == NULL)
3714                         return 0;
3715                 /* loop on clusters in the direct map */
                     /*
                      * Stop at vs->vs_size as well as CLMAP_ENTRIES: that is
                      * the bound ps_vstruct_transfer_from_segment() uses when
                      * it walks vs_dmap.
                      * NOTE(review): presumably vs_dmap holds only vs_size
                      * entries, so walking to CLMAP_ENTRIES could read past
                      * it -- confirm against the allocation site.
                      */
3716                 for (j = 0; j < CLMAP_ENTRIES && j < vs->vs_size; j++) {
3717                         if (VSM_ISCLR(vsmap[j]) ||
3718                             VSM_ISERR(vsmap[j]))
3719                                 continue;
3720                         /* loop on pages in this cluster */
3721                         for (k = 0; k < VSCLSIZE(vs); k++) {
3722                                 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3723                                         num_pages++;
3724                         }
3725                 }
3726         }
3727
3728         return ptoa_32(num_pages);
3729 }
3730
3731 unsigned int
3732 ps_vstruct_allocated_pages(
3733         vstruct_t               vs,
3734         default_pager_page_t    *pages,
3735         unsigned int            pages_size)
3736 {
3737         unsigned int    num_pages;
3738         struct vs_map   *vsmap;
3739         dp_offset_t     offset;
3740         unsigned int    i, j, k;
3741
             /*
              * Report the byte offset of every page that the cluster maps
              * of "vs" record as allocated.
              *
              *   vs         - vstruct whose maps are walked
              *   pages      - array receiving one dpp_offset per allocated
              *                page, in increasing offset order
              *   pages_size - capacity of "pages", in entries (it is compared
              *                against the running page count)
              *
              * Returns the total number of allocated pages found.  That
              * total may exceed pages_size; in that case only the first
              * pages_size offsets were stored.  A direct-mapped vs without
              * a map yields 0.
              */

3742         num_pages = 0;
3743         offset = 0;
3744         if (vs->vs_indirect) {
3745                 /* loop on indirect maps */
3746                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3747                         vsmap = vs->vs_imap[i];
3748                         if (vsmap == NULL) {
                                     /* skip the whole range this block would map */
3749                                 offset += (vm_page_size * CLMAP_ENTRIES *
3750                                            VSCLSIZE(vs));
3751                                 continue;
3752                         }
3753                         /* loop on clusters in this indirect map */
3754                         for (j = 0; j < CLMAP_ENTRIES; j++) {
3755                                 if (VSM_ISCLR(vsmap[j]) ||
3756                                     VSM_ISERR(vsmap[j])) {
3757                                         offset += vm_page_size * VSCLSIZE(vs);
3758                                         continue;
3759                                 }
3760                                 /* loop on pages in this cluster */
3761                                 for (k = 0; k < VSCLSIZE(vs); k++) {
3762                                         if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3763                                                 num_pages++;
                                                     /*
                                                      * num_pages already counts
                                                      * this page, so "<=" lets
                                                      * the pages_size'th page
                                                      * take the last slot.
                                                      */
3764                                                 if (num_pages <= pages_size)
3765                                                         pages++->dpp_offset =
3766                                                                 offset;
3767                                         }
3768                                         offset += vm_page_size;
3769                                 }
3770                         }
3771                 }
3772         } else {
3773                 vsmap = vs->vs_dmap;
3774                 if (vsmap == NULL)
3775                         return 0;
3776                 /* loop on clusters in the direct map */
                     /*
                      * Stop at vs->vs_size as well as CLMAP_ENTRIES: that is
                      * the bound ps_vstruct_transfer_from_segment() uses when
                      * it walks vs_dmap.
                      * NOTE(review): presumably vs_dmap holds only vs_size
                      * entries, so walking to CLMAP_ENTRIES could read past
                      * it -- confirm against the allocation site.
                      */
3777                 for (j = 0; j < CLMAP_ENTRIES && j < vs->vs_size; j++) {
3778                         if (VSM_ISCLR(vsmap[j]) ||
3779                             VSM_ISERR(vsmap[j])) {
3780                                 offset += vm_page_size * VSCLSIZE(vs);
3781                                 continue;
3782                         }
3783                         /* loop on pages in this cluster */
3784                         for (k = 0; k < VSCLSIZE(vs); k++) {
3785                                 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3786                                         num_pages++;
                                             /* see above: "<=" fills the last slot too */
3787                                         if (num_pages <= pages_size)
3788                                                 pages++->dpp_offset = offset;
3789                                 }
3790                                 offset += vm_page_size;
3791                         }
3792                 }
3793         }
3794
3795         return num_pages;
3796 }
3797
3798
3799 kern_return_t
3800 ps_vstruct_transfer_from_segment(
3801         vstruct_t        vs,
3802         paging_segment_t segment,
3803         upl_t            upl)
3804 {
3805         struct vs_map   *vsmap;
3806 //      struct vs_map   old_vsmap;
3807 //      struct vs_map   new_vsmap;
3808         unsigned int    i, j;
3809
             /*
              * Walk the cluster maps of "vs" and hand every cluster that is
              * set, not in error, and whose VSM_PS() is "segment" to
              * vs_cluster_transfer(), together with "upl".
              *
              * Returns KERN_SUCCESS once the whole map has been walked (or
              * when a direct-mapped vs has no map at all).  Returns
              * KERN_FAILURE if a cluster transfer fails or, on the
              * indirect-map path only, if backing_store_abort_compaction or
              * backing_store_stop_compaction is found set after a transfer.
              *
              * On every return path vs_xfer_pending has been reset to FALSE
              * and vs_finish_write() has been called.
              */

3810         VS_LOCK(vs);    /* block all work on this vstruct */
3811                         /* can't allow the normal multiple write */
3812                         /* semantic because writes may conflict */
3813         vs->vs_xfer_pending = TRUE;
3814         vs_wait_for_sync_writers(vs);
3815         vs_start_write(vs);
3816         vs_wait_for_readers(vs);
3817         /* we will unlock the vs to allow other writes while transferring */
3818         /* and will be guaranteed of the persistence of the vs struct     */
3819         /* because the caller of  ps_vstruct_transfer_from_segment bumped */
3820         /* vs_async_pending */
3821         /* OK we now have guaranteed no other parties are accessing this */
3822         /* vs.  Now that we are also supporting simple lock versions of  */
3823         /* vs_lock we cannot hold onto VS_LOCK as we may block below.    */
3824         /* our purpose in holding it before was the multiple write case */
3825         /* we now use the boolean xfer_pending to do that.  We can use  */
3826         /* a boolean instead of a count because we have guaranteed single */
3827         /* file access to this code in its caller */
3828         VS_UNLOCK(vs);
             /*
              * Restart point: the walk begins again from the first entry
              * whenever vs_indirect is seen to have flipped after write
              * access was dropped and re-taken between two clusters.
              */
3829 vs_changed:
3830         if (vs->vs_indirect) {
3831                 unsigned int    vsmap_size;
3832                 int             clmap_off;
3833                 /* loop on indirect maps */
3834                 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3835                         vsmap = vs->vs_imap[i];
3836                         if (vsmap == NULL)
3837                                 continue;
3838                         /* loop on clusters in this indirect map */
                             /* byte offset in the vs of indirect block i's first cluster */
3839                         clmap_off = (vm_page_size * CLMAP_ENTRIES *
3840                                            VSCLSIZE(vs) * i);
                             /* the last block is scanned only as far as vs_size reaches */
3841                         if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3842                                 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3843                         else
3844                                 vsmap_size = CLMAP_ENTRIES;
3845                         for (j = 0; j < vsmap_size; j++) {
3846                                 if (VSM_ISCLR(vsmap[j]) ||
3847                                     VSM_ISERR(vsmap[j]) ||
3848                                     (VSM_PS(vsmap[j]) != segment))
3849                                         continue;
                                     /* transfer one whole cluster, addressed by byte offset */
3850                                 if(vs_cluster_transfer(vs,
3851                                         (vm_page_size * (j << vs->vs_clshift))
3852                                         + clmap_off,
3853                                         vm_page_size << vs->vs_clshift,
3854                                         upl)
3855                                                 != KERN_SUCCESS) {
3856                                    VS_LOCK(vs);
3857                                    vs->vs_xfer_pending = FALSE;
3858                                    VS_UNLOCK(vs);
3859                                    vs_finish_write(vs);
3860                                    return KERN_FAILURE;
3861                                 }
3862                                 /* allow other readers/writers during transfer*/
3863                                 VS_LOCK(vs);
3864                                 vs->vs_xfer_pending = FALSE;
3865                                 VS_UNLOCK(vs);
3866                                 vs_finish_write(vs);
3867
                                     /*
                                      * Write access was just released above,
                                      * so an abort can return directly.  Only
                                      * the abort flag is reset here; the stop
                                      * flag is left as found.
                                      */
3868                                 if (backing_store_abort_compaction || backing_store_stop_compaction) {
3869                                         backing_store_abort_compaction = FALSE;
3870                                         dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n"));
3871                                         return KERN_FAILURE;
3872                                 }
3873                                 vnode_pager_throttle();
3874
                                     /* re-take exclusive access before the next cluster */
3875                                 VS_LOCK(vs);
3876                                 vs->vs_xfer_pending = TRUE;
3877                                 vs_wait_for_sync_writers(vs);
3878                                 vs_start_write(vs);
3879                                 vs_wait_for_readers(vs);
3880                                 VS_UNLOCK(vs);
                                     /* map is no longer indirect: start over on the other path */
3881                                 if (!(vs->vs_indirect)) {
3882                                         goto vs_changed;
3883                                 }
3884                         }
3885                 }
3886         } else {
3887                 vsmap = vs->vs_dmap;
3888                 if (vsmap == NULL) {
                             /* nothing mapped at all: release access and report success */
3889                         VS_LOCK(vs);
3890                         vs->vs_xfer_pending = FALSE;
3891                         VS_UNLOCK(vs);
3892                         vs_finish_write(vs);
3893                         return KERN_SUCCESS;
3894                 }
3895                 /* loop on clusters in the direct map */
                     /*
                      * NOTE(review): unlike the indirect path, this loop
                      * neither checks the compaction abort/stop flags nor
                      * calls vnode_pager_throttle() between clusters --
                      * confirm that the difference is intended.
                      */
3896                 for (j = 0; j < vs->vs_size; j++) {
3897                         if (VSM_ISCLR(vsmap[j]) ||
3898                             VSM_ISERR(vsmap[j]) ||
3899                             (VSM_PS(vsmap[j]) != segment))
3900                                 continue;
3901                         if(vs_cluster_transfer(vs,
3902                                 vm_page_size * (j << vs->vs_clshift),
3903                                 vm_page_size << vs->vs_clshift,
3904                                 upl) != KERN_SUCCESS) {
3905                            VS_LOCK(vs);
3906                            vs->vs_xfer_pending = FALSE;
3907                            VS_UNLOCK(vs);
3908                            vs_finish_write(vs);
3909                            return KERN_FAILURE;
3910                         }
3911                         /* allow other readers/writers during transfer*/
3912                         VS_LOCK(vs);
3913                         vs->vs_xfer_pending = FALSE;
3914                         VS_UNLOCK(vs);
3915                         vs_finish_write(vs);
3916                         VS_LOCK(vs);
3917                         vs->vs_xfer_pending = TRUE;
3918                         vs_wait_for_sync_writers(vs);
3919                         vs_start_write(vs);
3920                         vs_wait_for_readers(vs);
3921                         VS_UNLOCK(vs);
                             /* map became indirect meanwhile: start over on the other path */
3922                         if (vs->vs_indirect) {
3923                                 goto vs_changed;
3924                         }
3925                 }
3926         }
3927
             /* whole map walked: drop the exclusive access taken above */
3928         VS_LOCK(vs);
3929         vs->vs_xfer_pending = FALSE;
3930         VS_UNLOCK(vs);
3931         vs_finish_write(vs);
3932         return KERN_SUCCESS;
3933 }
3934
3935
3936
3937 vs_map_t
3938 vs_get_map_entry(
3939         vstruct_t       vs,
3940         dp_offset_t     offset)
3941 {
3942         /*
3943          * Return the map entry for the cluster containing "offset", or
3944          * NULL when the indirect block that would hold it is unallocated.
3945          */
3946         dp_offset_t     cl_num = atop_32(offset) >> vs->vs_clshift;
3947         struct vs_map   *base;
3948
3949         if (!vs->vs_indirect)
3950                 base = vs->vs_dmap;
3951         else {
3952                 base = vs->vs_imap[cl_num / CLMAP_ENTRIES];
3953                 if (base == (vs_map_t) NULL)
3954                         return base;
3955         }
3956         return base + (cl_num % CLMAP_ENTRIES);
3957 }
3958
3959 kern_return_t
3960 vs_cluster_transfer(
3961         vstruct_t       vs,
3962         dp_offset_t     offset,
3963         dp_size_t       cnt,
3964         upl_t           upl)
3965 {
3966         dp_offset_t             actual_offset;
3967         paging_segment_t        ps;
3968         struct clmap            clmap;
3969         kern_return_t           error = KERN_SUCCESS;
3970         unsigned int            size, size_wanted;
3971         int                     i;
3972         unsigned int            residual = 0;
3973         unsigned int            unavail_size;
3974 //      default_pager_thread_t  *dpt;
3975 //      boolean_t               dealloc;
3976         struct  vs_map          *vsmap_ptr = NULL;
3977         struct  vs_map          read_vsmap;
3978         struct  vs_map          original_read_vsmap;
3979         struct  vs_map          write_vsmap;
3980 //      upl_t                           sync_upl;
3981 //      vm_offset_t                     ioaddr;
3982
             /*
              *   vs     - vstruct whose cluster map entries are rewritten
              *   offset - byte offset within vs at which the transfer starts
              *   cnt    - number of bytes to transfer
              *   upl    - handed to ps_read_file() and vs_cluster_write() as
              *            the I/O buffer (always used from upl offset 0)
              *
              * Returns KERN_SUCCESS, or KERN_FAILURE when a read comes back
              * short or empty or when vs_cluster_write() fails.  An error
              * returned by ps_read_file() itself is passed back unchanged.
              *
              * Bookkeeping: read_vsmap is the map entry as it was right
              * after the read, original_read_vsmap is the entry as first
              * seen for the current cluster, and write_vsmap accumulates
              * the entry describing the newly written backing store.
              */

3983         /* vs_cluster_transfer reads in the pages of a cluster and
3984          * then writes these pages back to new backing store.  The
3985          * segment the pages are being read from is assumed to have
3986          * been taken off-line and is no longer considered for new
3987          * space requests.
3988          */
3989
3990         /*
3991          * This loop will be executed once per cluster referenced.
3992          * Typically this means once, since it's unlikely that the
3993          * VM system will ask for anything spanning cluster boundaries.
3994          *
3995          * If there are holes in a cluster (in a paging segment), we stop
3996          * reading at the hole, then loop again, hoping to
3997          * find valid pages later in the cluster.  This continues until
3998          * the entire range has been examined, and read, if present.  The
3999          * pages are written as they are read.  If a failure occurs after
4000          * some pages are written the unmap call at the bottom of the loop
4001          * recovers the backing store and the old backing store remains
4002          * in effect.
4003          */
4004
4005         VSM_CLR(write_vsmap);
4006         VSM_CLR(original_read_vsmap);
4007         /* grab the actual object's pages to sync with I/O */
4008         while (cnt && (error == KERN_SUCCESS)) {
                     /*
                      * NOTE(review): vs_get_map_entry() returns NULL when the
                      * indirect block for this offset is unallocated, and
                      * vsmap_ptr is dereferenced below without a check --
                      * presumably callers only pass offsets whose map block
                      * exists; confirm.
                      */
4009                 vsmap_ptr = vs_get_map_entry(vs, offset);
4010                 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
4011
4012                 if (actual_offset == (dp_offset_t) -1) {
4013
4014                         /*
4015                          * Nothing left to write in this cluster at least
4016                          * set write cluster information for any previous
4017                          * write, clear for next cluster, if there is one
4018                          */
4019                         unsigned int local_size, clmask, clsize;
4020
                             /* bytes from offset to the end of its cluster, capped at cnt */
4021                         clsize = vm_page_size << vs->vs_clshift;
4022                         clmask = clsize - 1;
4023                         local_size = clsize - (offset & clmask);
4024                         ASSERT(local_size);
4025                         local_size = MIN(local_size, cnt);
4026
4027                         /* This cluster has no data in it beyond what may */
4028                         /* have been found on a previous iteration through */
4029                         /* the loop "write_vsmap" */
4030                         *vsmap_ptr = write_vsmap;
4031                         VSM_CLR(write_vsmap);
4032                         VSM_CLR(original_read_vsmap);
4033
4034                         cnt -= local_size;
4035                         offset += local_size;
4036                         continue;
4037                 }
4038
4039                 /*
4040                  * Count up contiguous available or unavailable
4041                  * pages.
4042                  */
4043                 ps = CLMAP_PS(clmap);
4044                 ASSERT(ps);
4045                 size = 0;
4046                 unavail_size = 0;
                     /*
                      * Measure the leading run only: the scan stops as soon as
                      * a set page follows unset ones or the other way round,
                      * so exactly one of size / unavail_size ends up non-zero.
                      */
4047                 for (i = 0;
4048                      (size < cnt) && (unavail_size < cnt) &&
4049                      (i < CLMAP_NPGS(clmap)); i++) {
4050                         if (CLMAP_ISSET(clmap, i)) {
4051                                 if (unavail_size != 0)
4052                                         break;
4053                                 size += vm_page_size;
4054                                 BS_STAT(ps->ps_bs,
4055                                         ps->ps_bs->bs_pages_in++);
4056                         } else {
4057                                 if (size != 0)
4058                                         break;
4059                                 unavail_size += vm_page_size;
4060                         }
4061                 }
4062
4063                 if (size == 0) {
                             /* leading run is a hole: unmap it and step over it */
4064                         ASSERT(unavail_size);
4065                         ps_clunmap(vs, offset, unavail_size);
4066                         cnt -= unavail_size;
4067                         offset += unavail_size;
4068                         if((offset & ((vm_page_size << vs->vs_clshift) - 1))
4069                                 == 0) {
4070                                 /* There is no more to transfer in this
4071                                    cluster
4072                                 */
4073                                 *vsmap_ptr = write_vsmap;
4074                                 VSM_CLR(write_vsmap);
4075                                 VSM_CLR(original_read_vsmap);
4076                         }
4077                         continue;
4078                 }
4079
                     /* remember the entry as first seen for this cluster, for rollback */
4080                 if(VSM_ISCLR(original_read_vsmap))
4081                         original_read_vsmap = *vsmap_ptr;
4082
4083                 if(ps->ps_segtype == PS_PARTITION) {
4084                         panic("swap partition not supported\n");
4085                         /*NOTREACHED*/
4086                         error = KERN_FAILURE;
4087                         residual = size;
4088 /*
4089                         NEED TO ISSUE WITH SYNC & NO COMMIT
4090                         error = ps_read_device(ps, actual_offset, &buffer,
4091                                        size, &residual, flags);
4092 */
4093                 } else {
4094                         /* NEED TO ISSUE WITH SYNC & NO COMMIT */
4095                         error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
4096                                         size, &residual,
4097                                         (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0)));
4098                 }
4099
                     /* snapshot of the entry as it stands after the read */
4100                 read_vsmap = *vsmap_ptr;
4101
4102
4103                 /*
4104                  * Adjust counts and put data in new BS.  Optimize for the
4105                  * common case, i.e. no error and/or partial data.
4106                  * If there was an error, then we need to error the entire
4107                  * range, even if some data was successfully read.
4108                  *
4109                  */
4110                 if ((error == KERN_SUCCESS) && (residual == 0)) {
4111
4112                         /*
4113                          * Got everything we asked for, supply the data to
4114                          * the new BS.  Note that as a side effect of supplying
4115                          * the data, the buffer holding the supplied data is
4116                          * deallocated from the pager's address space unless
4117                          * the write is unsuccessful.
4118                          */
4119
4120                         /* note buffer will be cleaned up in all cases by */
4121                         /* internal_cluster_write or if an error on write */
4122                         /* the vm_map_copy_page_discard call              */
                             /*
                              * Install the accumulated write-side entry before
                              * calling vs_cluster_write(), so that the write
                              * sees it in the map instead of the read-side one.
                              */
4123                         *vsmap_ptr = write_vsmap;
4124
4125                         if(vs_cluster_write(vs, upl, offset,
4126                                         size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
4127                                 error = KERN_FAILURE;
4128                                 if(!(VSM_ISCLR(*vsmap_ptr))) {
4129                                         /* unmap the new backing store object */
4130                                         ps_clunmap(vs, offset, size);
4131                                 }
4132                                 /* original vsmap */
4133                                 *vsmap_ptr = original_read_vsmap;
4134                                 VSM_CLR(write_vsmap);
4135                         } else {
4136                                if((offset + size) &
4137                                         ((vm_page_size << vs->vs_clshift)
4138                                         - 1)) {
4139                                         /* There is more to transfer in this
4140                                            cluster
4141                                         */
                                             /*
                                              * Keep the new entry in write_vsmap,
                                              * put the read-side entry back and
                                              * unmap the range just copied from it.
                                              */
4142                                         write_vsmap = *vsmap_ptr;
4143                                         *vsmap_ptr = read_vsmap;
4144                                         ps_clunmap(vs, offset, size);
4145                                 } else {
4146                                         /* discard the old backing object */
                                             /*
                                              * Same swap as above, but the cluster
                                              * is finished: the new entry is left
                                              * installed and the state is reset.
                                              */
4147                                         write_vsmap = *vsmap_ptr;
4148                                         *vsmap_ptr = read_vsmap;
4149                                         ps_clunmap(vs, offset, size);
4150                                         *vsmap_ptr = write_vsmap;
4151                                         VSM_CLR(write_vsmap);
4152                                         VSM_CLR(original_read_vsmap);
4153                                 }
4154                         }
4155                 } else {
                             /*
                              * size_wanted is assigned but not read again in
                              * this function.
                              * NOTE(review): the two branches below (nothing
                              * read / partial read) execute the same
                              * statements.  Their "continue" re-tests the loop
                              * condition, which fails because error is now set.
                              */
4156                         size_wanted = size;
4157                         if (error == KERN_SUCCESS) {
4158                                 if (residual == size) {
4159                                         /*
4160                                          * If a read operation returns no error
4161                                          * and no data moved, we turn it into
4162                                          * an error, assuming we're reading at
4163                                          * or beyond EOF.
4164                                          * Fall through and error the entire
4165                                          * range.
4166                                          */
4167                                         error = KERN_FAILURE;
4168                                         *vsmap_ptr = write_vsmap;
4169                                         if(!(VSM_ISCLR(*vsmap_ptr))) {
4170                                         /* unmap the new backing store object */
4171                                         ps_clunmap(vs, offset, size);
4172                                         }
4173                                         *vsmap_ptr = original_read_vsmap;
4174                                         VSM_CLR(write_vsmap);
4175                                         continue;
4176                                 } else {
4177                                         /*
4178                                          * Otherwise, we have partial read.
4179                                          * This is also considered an error
4180                                          * for the purposes of cluster transfer
4181                                          */
4182                                         error = KERN_FAILURE;
4183                                         *vsmap_ptr = write_vsmap;
4184                                         if(!(VSM_ISCLR(*vsmap_ptr))) {
4185                                         /* unmap the new backing store object */
4186                                         ps_clunmap(vs, offset, size);
4187                                         }
4188                                         *vsmap_ptr = original_read_vsmap;
4189                                         VSM_CLR(write_vsmap);
4190                                         continue;
4191                                 }
4192                         }
                             /*
                              * NOTE(review): when ps_read_file() itself returned
                              * an error, nothing above touches *vsmap_ptr or
                              * write_vsmap; the post-loop code still installs
                              * write_vsmap if it is non-clear -- confirm that
                              * is the intended outcome.
                              */
4193
4194                 }
4195                 cnt -= size;
4196                 offset += size;
4197
4198         } /* END while (cnt && (error == 0)) */
             /* a cluster was left partly transferred: publish what was written */
4199         if(!VSM_ISCLR(write_vsmap))
4200                 *vsmap_ptr = write_vsmap;
4201
4202         return error;
4203 }
4204
4205 kern_return_t
4206 default_pager_add_file(
4207         MACH_PORT_FACE  backing_store,
4208         vnode_ptr_t     vp,
4209         int             record_size,
4210         vm_size_t       size)
4211 {
4212         backing_store_t         bs;
4213         paging_segment_t        ps;
4214         int                     i;
4215         unsigned int            j;
4216         int                     error;
4217
4218         if ((bs = backing_store_lookup(backing_store))
4219             == BACKING_STORE_NULL)
4220                 return KERN_INVALID_ARGUMENT;
4221
4222         PSL_LOCK();
4223         for (i = 0; i <= paging_segment_max; i++) {
4224                 ps = paging_segments[i];
4225                 if (ps == PAGING_SEGMENT_NULL)
4226                         continue;
4227                 if (ps->ps_segtype != PS_FILE)
4228                         continue;
4229
4230                 /*
4231                  * Check for overlap on same device.
4232                  */
4233                 if (ps->ps_vnode == (struct vnode *)vp) {
4234                         PSL_UNLOCK();
4235                         BS_UNLOCK(bs);
4236                         return KERN_INVALID_ARGUMENT;
4237                 }
4238         }
4239         PSL_UNLOCK();
4240
4241         /*
4242          * Set up the paging segment
4243          */
4244         ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
4245         if (ps == PAGING_SEGMENT_NULL) {
4246                 BS_UNLOCK(bs);
4247                 return KERN_RESOURCE_SHORTAGE;
4248         }
4249
4250         ps->ps_segtype = PS_FILE;
4251         ps->ps_vnode = (struct vnode *)vp;
4252         ps->ps_offset = 0;
4253         ps->ps_record_shift = local_log2(vm_page_size / record_size);
4254         assert((dp_size_t) size == size);
4255         ps->ps_recnum = (dp_size_t) size;
4256         ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
4257
4258         ps->ps_pgcount = ps->ps_pgnum;
4259         ps->ps_clshift = local_log2(bs->bs_clsize);
4260         ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
4261         ps->ps_special_clusters = 0;
4262         ps->ps_hint = 0;
4263
4264         PS_LOCK_INIT(ps);
4265         ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
4266         if (!ps->ps_bmap) {
4267                 PS_LOCK_DESTROY(ps);
4268                 kfree(ps, sizeof *ps);
4269                 BS_UNLOCK(bs);
4270                 return KERN_RESOURCE_SHORTAGE;
4271         }
4272         for (j = 0; j < ps->ps_ncls; j++) {
4273                 clrbit(ps->ps_bmap, j);
4274         }
4275
4276         if(paging_segment_count == 0) {
4277                 ps->ps_state = PS_EMERGENCY_SEGMENT;
4278                 if(use_emergency_swap_file_first) {
4279                         ps->ps_state |= PS_CAN_USE;
4280                 }
4281                 emergency_segment_backing_store = backing_store;
4282         } else {
4283                 ps->ps_state = PS_CAN_USE;
4284         }
4285
4286         ps->ps_bs = bs;
4287
4288         if ((error = ps_enter(ps)) != 0) {
4289                 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
4290                 PS_LOCK_DESTROY(ps);
4291                 kfree(ps, sizeof *ps);
4292                 BS_UNLOCK(bs);
4293                 return KERN_RESOURCE_SHORTAGE;
4294         }
4295
4296         bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
4297         bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
4298         PSL_LOCK();
4299         if(IS_PS_OK_TO_USE(ps)) {
4300                 dp_pages_free += ps->ps_pgcount;
4301         } else {
4302                 dp_pages_reserve += ps->ps_pgcount;
4303         }
4304         PSL_UNLOCK();
4305
4306         BS_UNLOCK(bs);
4307
4308         bs_more_space(ps->ps_clcount);
4309
4310         /*
4311          * If the paging segment being activated is not the emergency
4312          * segment and we notice that the emergency segment is being
4313          * used then we help recover it. If all goes well, the
4314          * emergency segment will be back to its original state of
4315          * online but not activated (till it's needed the next time).
4316          */
4317 #if CONFIG_FREEZE
4318         if (!memorystatus_freeze_enabled)
4319 #endif
4320         {
4321                 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4322                 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4323                         if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4324                                 dprintf(("Failed to recover emergency paging segment\n"));
4325                         } else {
4326                                 dprintf(("Recovered emergency paging segment\n"));
4327                         }
4328                 }
4329         }
4330
4331         DP_DEBUG(DEBUG_BS_INTERNAL,
4332                  ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
4333                   device, offset, (dp_size_t) size, record_size,
4334                   ps->ps_record_shift, ps->ps_pgnum));
4335
4336         return KERN_SUCCESS;
4337 }
4338
4339
4340
4341 kern_return_t
4342 ps_read_file(
4343         paging_segment_t        ps,
4344         upl_t                   upl,
4345         upl_offset_t            upl_offset,
4346         dp_offset_t             offset,
4347         upl_size_t              size,
4348         unsigned int            *residualp,
4349         int                     flags)
4350 {
4351         vm_object_offset_t      f_offset;
4352         int                     error = 0;
4353         int                     result;
4354
4355         assert(dp_encryption_inited);
4356
4357         clustered_reads[atop_32(size)]++;
4358
4359         f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4360
4361         /*
4362          * for transfer case we need to pass uploffset and flags
4363          */
4364         assert((upl_size_t) size == size);
4365         error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
4366
4367         /* The vnode_pagein semantic is somewhat at odds with the existing   */
4368         /* device_read semantic.  Partial reads are not experienced at this  */
4369         /* level.  It is up to the bit map code and cluster read code to     */
4370         /* check that requested data locations are actually backed, and the  */
4371         /* pagein code to either read all of the requested data or return an */
4372         /* error. */
4373
4374         if (error)
4375                 result = KERN_FAILURE;
4376         else {
4377                 *residualp = 0;
4378                 result = KERN_SUCCESS;
4379         }
4380         return result;
4381 }
4382
4383 kern_return_t
4384 ps_write_file(
4385         paging_segment_t        ps,
4386         upl_t                   upl,
4387         upl_offset_t            upl_offset,
4388         dp_offset_t             offset,
4389         unsigned int            size,
4390         int                     flags)
4391 {
4392         vm_object_offset_t      f_offset;
4393         kern_return_t           result;
4394
4395         assert(dp_encryption_inited);
4396
4397         clustered_writes[atop_32(size)]++;
4398         f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4399
4400         if (flags & UPL_PAGING_ENCRYPTED) {
4401                 /*
4402                  * ENCRYPTED SWAP:
4403                  * encrypt all the pages that we're going
4404                  * to pageout.
4405                  */
4406                 upl_encrypt(upl, upl_offset, size);
4407         }
4408         assert((upl_size_t) size == size);
4409         if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
4410                 result = KERN_FAILURE;
4411         else
4412                 result = KERN_SUCCESS;
4413
4414         return result;
4415 }
4416
/*
 * Reset a trim accumulator to "nothing pending": no vnode, zero offset,
 * zero length.  Compiles to nothing unless CONFIG_EMBEDDED is set.
 */
static inline void
ps_vnode_trim_init(struct ps_vnode_trim_data *data)
{
#if !CONFIG_EMBEDDED
#pragma unused(data)
#else
        data->length = 0;
        data->offset = 0;
        data->vp = NULL;
#endif
}
4427
/*
 * If the accumulator holds a range (data->vp is set), pass its
 * (vp, offset, length) to vnode_trim() and reset the accumulator.
 * Does nothing when no range is pending.  Compiles to nothing unless
 * CONFIG_EMBEDDED is set.
 */
static inline void
ps_vnode_trim_now(struct ps_vnode_trim_data *data)
{
#if !CONFIG_EMBEDDED
#pragma unused(data)
#else
        if (data->vp == NULL)
                return;

        vnode_trim(data->vp, data->offset, data->length);
        ps_vnode_trim_init(data);
#endif
}
4441
4442 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
4443 {
4444 #if CONFIG_EMBEDDED
4445         struct vnode *vp = VSM_PS(*map)->ps_vnode;
4446         dp_offset_t offset = ptoa_32(VSM_CLOFF(*map)) << shift;
4447
4448         if ((vp != data->vp) || (offset) != (data->offset + data->length)) {
4449                 ps_vnode_trim_now(data);
4450                 data->vp = vp;
4451                 data->offset = offset;
4452                 data->length = 0;
4453         }
4454         data->length += (length);
4455 #else
4456 #pragma unused(data, map, shift, length)
4457 #endif
4458 }
4459
4460 kern_return_t
4461 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
4462         int             hi_wat,
4463         int             lo_wat,
4464         int             flags,
4465         MACH_PORT_FACE  trigger_port)
4466 {
4467         MACH_PORT_FACE release = IPC_PORT_NULL;
4468         kern_return_t kr;
4469         clock_sec_t now;
4470         clock_nsec_t nanoseconds_dummy;
4471         static clock_sec_t error_notify = 0;
4472
4473         PSL_LOCK();
4474         if (flags == SWAP_ENCRYPT_ON) {
4475                 /* ENCRYPTED SWAP: turn encryption on */
4476                 release = trigger_port;
4477                 if (!dp_encryption_inited) {
4478                         dp_encryption_inited = TRUE;
4479                         dp_encryption = TRUE;
4480                         kr = KERN_SUCCESS;
4481                 } else {
4482                         kr = KERN_FAILURE;
4483                 }
4484         } else if (flags == SWAP_ENCRYPT_OFF) {
4485                 /* ENCRYPTED SWAP: turn encryption off */
4486                 release = trigger_port;
4487                 if (!dp_encryption_inited) {
4488                         dp_encryption_inited = TRUE;
4489                         dp_encryption = FALSE;
4490                         kr = KERN_SUCCESS;
4491                 } else {
4492                         kr = KERN_FAILURE;
4493                 }
4494         } else if (flags == HI_WAT_ALERT) {
4495                 release = min_pages_trigger_port;
4496 #if CONFIG_FREEZE
4497                 /* High and low water signals aren't applicable when freeze is */
4498                 /* enabled, so release the trigger ports here and return       */
4499                 /* KERN_FAILURE.                                               */
4500                 if (memorystatus_freeze_enabled) {
4501                         if (IP_VALID( trigger_port )){
4502                                 ipc_port_release_send( trigger_port );
4503                         }
4504                         min_pages_trigger_port = IPC_PORT_NULL;
4505                         kr = KERN_FAILURE;
4506                 }
4507                 else
4508 #endif
4509                 {
4510                         min_pages_trigger_port = trigger_port;
4511                         minimum_pages_remaining = hi_wat/vm_page_size;
4512                         bs_low = FALSE;
4513                         kr = KERN_SUCCESS;
4514                 }
4515         } else if (flags ==  LO_WAT_ALERT) {
4516                 release = max_pages_trigger_port;
4517 #if CONFIG_FREEZE
4518                 if (memorystatus_freeze_enabled) {
4519                         if (IP_VALID( trigger_port )){
4520                                 ipc_port_release_send( trigger_port );
4521                         }
4522                         max_pages_trigger_port = IPC_PORT_NULL;
4523                         kr = KERN_FAILURE;
4524                 }
4525                 else
4526 #endif
4527                 {
4528                         max_pages_trigger_port = trigger_port;
4529                         maximum_pages_free = lo_wat/vm_page_size;
4530                         kr = KERN_SUCCESS;
4531                 }
4532         } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4533                 use_emergency_swap_file_first = TRUE;
4534                 release = trigger_port;
4535                 kr = KERN_SUCCESS;
4536         } else if (flags == SWAP_FILE_CREATION_ERROR) {
4537                 release = trigger_port;
4538                 kr = KERN_SUCCESS;
4539                 if( paging_segment_count == 1) {
4540                         use_emergency_swap_file_first = TRUE;
4541                 }
4542                 no_paging_space_action();
4543                 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4544                 if (now > error_notify + 5) {
4545                         dprintf(("Swap File Error.\n"));
4546                         error_notify = now;
4547                 }
4548         } else {
4549                 release = trigger_port;
4550                 kr =  KERN_INVALID_ARGUMENT;
4551         }
4552         PSL_UNLOCK();
4553
4554         if (IP_VALID(release))
4555                 ipc_port_release_send(release);
4556
4557         return kr;
4558 }
4559
4560 /*
4561  * Monitor the amount of available backing store vs. the amount of
4562  * required backing store, notify a listener (if present) when
4563  * backing store may safely be removed.
4564  *
4565  * We attempt to avoid the situation where backing store is
4566  * discarded en masse, as this can lead to thrashing as the
4567  * backing store is compacted.
4568  */
4569
4570 #define PF_INTERVAL     3       /* time between free level checks */
4571 #define PF_LATENCY      10      /* number of intervals before release */
4572
4573 static int dp_pages_free_low_count = 0;
4574 thread_call_t default_pager_backing_store_monitor_callout;
4575
4576 void
4577 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4578                                                                         __unused thread_call_param_t p2)
4579 {
4580 //      unsigned long long      average;
4581         ipc_port_t              trigger;
4582         uint64_t                deadline;
4583
4584         /*
4585          * We determine whether it will be safe to release some
4586          * backing store by watching the free page level.  If
4587          * it remains below the maximum_pages_free threshold for
4588          * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
4589          * then we deem it safe.
4590          *
4591          * Note that this establishes a maximum rate at which backing
4592          * store will be released, as each notification (currently)
4593          * only results in a single backing store object being
4594          * released.
4595          */
4596         if (dp_pages_free > maximum_pages_free) {
4597                 dp_pages_free_low_count++;
4598         } else {
4599                 dp_pages_free_low_count = 0;
4600         }
4601
4602         /* decide whether to send notification */
4603         trigger = IP_NULL;
4604         if (max_pages_trigger_port &&
4605             (backing_store_release_trigger_disable == 0) &&
4606             (dp_pages_free_low_count > PF_LATENCY)) {
4607                 trigger = max_pages_trigger_port;
4608                 max_pages_trigger_port = NULL;
4609         }
4610
4611         /* send notification */
4612         if (trigger != IP_NULL) {
4613                 VSL_LOCK();
4614                 if(backing_store_release_trigger_disable != 0) {
4615                         assert_wait((event_t)
4616                                     &backing_store_release_trigger_disable,
4617                                     THREAD_UNINT);
4618                         VSL_UNLOCK();
4619                         thread_block(THREAD_CONTINUE_NULL);
4620                 } else {
4621                         VSL_UNLOCK();
4622                 }
4623                 dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));
4624
4625                 default_pager_space_alert(trigger, LO_WAT_ALERT);
4626                 ipc_port_release_send(trigger);
4627                 dp_pages_free_low_count = 0;
4628         }
4629
4630         clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4631         thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4632 }
4633
4634 #if CONFIG_FREEZE
4635 unsigned int default_pager_swap_pages_free() {
4636         return dp_pages_free;
4637 }
4638 #endif