1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 /*
58 * Default Pager.
59 * Paging File Management.
60 */
61
62 #include <mach/host_priv.h>
63 #include <mach/memory_object_control.h>
64 #include <mach/memory_object_server.h>
65 #include <mach/upl.h>
66 #include <default_pager/default_pager_internal.h>
67 #include <default_pager/default_pager_alerts.h>
68 #include <default_pager/default_pager_object_server.h>
69
70 #include <ipc/ipc_types.h>
71 #include <ipc/ipc_port.h>
72 #include <ipc/ipc_space.h>
73
74 #include <kern/kern_types.h>
75 #include <kern/host.h>
76 #include <kern/queue.h>
77 #include <kern/counters.h>
78 #include <kern/sched_prim.h>
79
80 #include <vm/vm_kern.h>
81 #include <vm/vm_pageout.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_protos.h>
85
86
87 /* todo - need large internal object support */
88
89 /*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, then this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
99
100 #define ALLOC_STRIDE (1024 * 1024 * 1024)
101 int physical_transfer_cluster_count = 0;
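
/*
 * Rough arithmetic sketch, assuming 4K pages (vm_page_shift == 12) and
 * the default cluster shift of 2: ps_select_segment() below moves on to
 * the next segment after ALLOC_STRIDE >> (ps_clshift + vm_page_shift)
 * clusters, i.e. (1 GB) >> 14 == 65536 clusters of 16 KB each, so a full
 * gigabyte is written to one swap file before the stride advances.
 */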
102
103 #define VM_SUPER_CLUSTER 0x40000
104 #define VM_SUPER_PAGES 64
105
106 /*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
110 #define VSTRUCT_DEF_CLSHIFT 2
111 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
112 int default_pager_clsize = 0;
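
/*
 * Worked example: with the default VSTRUCT_DEF_CLSHIFT of 2, each cluster
 * covers 1 << 2 == 4 pages, so bs_get_global_clsize() will set
 * default_pager_clsize to 4 unless a different size is requested;
 * assuming 4K pages, that is 16 KB of backing store per cluster.
 */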
113
114 /* statistics */
115 unsigned int clustered_writes[VM_SUPER_PAGES+1];
116 unsigned int clustered_reads[VM_SUPER_PAGES+1];
117
118 /*
119 * Globals used for asynchronous paging operations:
120 * vs_async_list: head of list of to-be-completed I/O ops
121 * async_num_queued: number of pages completed, but not yet
122 * processed by async thread.
123 * async_requests_out: number of pages of requests not completed.
124 */
125
126 #if 0
127 struct vs_async *vs_async_list;
128 int async_num_queued;
129 int async_requests_out;
130 #endif
131
132
133 #define VS_ASYNC_REUSE 1
134 struct vs_async *vs_async_free_list;
135
136 lck_mtx_t default_pager_async_lock; /* Protects globals above */
137
138
139 int vs_alloc_async_failed = 0; /* statistics */
140 int vs_alloc_async_count = 0; /* statistics */
141 struct vs_async *vs_alloc_async(void); /* forward */
142 void vs_free_async(struct vs_async *vsa); /* forward */
143
144
145 #define VS_ALLOC_ASYNC() vs_alloc_async()
146 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
147
148 #define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock)
149 #define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock)
150 #define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
151 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
152 /*
153 * Paging Space Hysteresis triggers and the target notification port
154 *
155 */
156 unsigned int dp_pages_free_drift_count = 0;
157 unsigned int dp_pages_free_drifted_max = 0;
158 unsigned int minimum_pages_remaining = 0;
159 unsigned int maximum_pages_free = 0;
160 ipc_port_t min_pages_trigger_port = NULL;
161 ipc_port_t max_pages_trigger_port = NULL;
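
/*
 * Low-space notification pattern used by the allocation paths below
 * (ps_select_segment / ps_allocate_cluster), in outline:
 *
 *	if (min_pages_trigger_port &&
 *	    (dp_pages_free < minimum_pages_remaining)) {
 *		trigger = min_pages_trigger_port;
 *		min_pages_trigger_port = NULL;
 *		bs_low = TRUE;
 *	}
 *	...
 *	if (trigger != IP_NULL) {
 *		default_pager_space_alert(trigger, HI_WAT_ALERT);
 *		ipc_port_release_send(trigger);
 *	}
 *
 * The registered port is consumed by the alert, so it must be
 * re-registered before another HI_WAT_ALERT can be sent.
 */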
162
163 boolean_t use_emergency_swap_file_first = FALSE;
164 boolean_t bs_low = FALSE;
165 int backing_store_release_trigger_disable = 0;
166 boolean_t backing_store_stop_compaction = FALSE;
167
168
169 /* Have we decided if swap needs to be encrypted yet ? */
170 boolean_t dp_encryption_inited = FALSE;
171 /* Should we encrypt swap ? */
172 boolean_t dp_encryption = FALSE;
173
174
175 /*
176 * Object sizes are rounded up to the next power of 2,
177 * unless they are bigger than a given maximum size.
178 */
179 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
180
181 /*
182 * List of all backing store and segments.
183 */
184 MACH_PORT_FACE emergency_segment_backing_store;
185 struct backing_store_list_head backing_store_list;
186 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
187 lck_mtx_t paging_segments_lock;
188 int paging_segment_max = 0;
189 int paging_segment_count = 0;
190 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
191
192
193 /*
194 * Total pages free in system
195 * This differs from clusters committed/avail, which is a measure of the
196 * over-commitment of paging segments to backing store (an idea which is
197 * likely to be deprecated).
198 */
199 unsigned int dp_pages_free = 0;
200 unsigned int dp_pages_reserve = 0;
201 unsigned int cluster_transfer_minimum = 100;
202
203 /* forward declarations */
204 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */
205 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */
206 default_pager_thread_t *get_read_buffer( void );
207 kern_return_t ps_vstruct_transfer_from_segment(
208 vstruct_t vs,
209 paging_segment_t segment,
210 upl_t upl);
211 kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
212 kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
213 kern_return_t vs_cluster_transfer(
214 vstruct_t vs,
215 dp_offset_t offset,
216 dp_size_t cnt,
217 upl_t upl);
218 vs_map_t vs_get_map_entry(
219 vstruct_t vs,
220 dp_offset_t offset);
221
222 kern_return_t
223 default_pager_backing_store_delete_internal( MACH_PORT_FACE );
224
225 default_pager_thread_t *
226 get_read_buffer( void )
227 {
228 int i;
229
230 DPT_LOCK(dpt_lock);
231 while(TRUE) {
232 for (i=0; i<default_pager_internal_count; i++) {
233 if(dpt_array[i]->checked_out == FALSE) {
234 dpt_array[i]->checked_out = TRUE;
235 DPT_UNLOCK(dpt_lock);
236 return dpt_array[i];
237 }
238 }
239 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
240 }
241 }
242
243 void
244 bs_initialize(void)
245 {
246 int i;
247
248 /*
249 * List of all backing store.
250 */
251 BSL_LOCK_INIT();
252 queue_init(&backing_store_list.bsl_queue);
253 PSL_LOCK_INIT();
254
255 VS_ASYNC_LOCK_INIT();
256 #if VS_ASYNC_REUSE
257 vs_async_free_list = NULL;
258 #endif /* VS_ASYNC_REUSE */
259
260 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
261 clustered_writes[i] = 0;
262 clustered_reads[i] = 0;
263 }
264
265 }
266
267 /*
268 * When things do not quite work out...
269 */
270 void bs_no_paging_space(boolean_t); /* forward */
271
272 void
273 bs_no_paging_space(
274 boolean_t out_of_memory)
275 {
276
277 if (out_of_memory)
278 dprintf(("*** OUT OF MEMORY ***\n"));
279 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
280 }
281
282 void bs_more_space(int); /* forward */
283 void bs_commit(int); /* forward */
284
285 boolean_t user_warned = FALSE;
286 unsigned int clusters_committed = 0;
287 unsigned int clusters_available = 0;
288 unsigned int clusters_committed_peak = 0;
289
290 void
291 bs_more_space(
292 int nclusters)
293 {
294 BSL_LOCK();
295 /*
296 * Account for new paging space.
297 */
298 clusters_available += nclusters;
299
300 if (clusters_available >= clusters_committed) {
301 if (verbose && user_warned) {
302 printf("%s%s - %d excess clusters now.\n",
303 my_name,
304 "paging space is OK now",
305 clusters_available - clusters_committed);
306 user_warned = FALSE;
307 clusters_committed_peak = 0;
308 }
309 } else {
310 if (verbose && user_warned) {
311 printf("%s%s - still short of %d clusters.\n",
312 my_name,
313 "WARNING: paging space over-committed",
314 clusters_committed - clusters_available);
315 clusters_committed_peak -= nclusters;
316 }
317 }
318 BSL_UNLOCK();
319
320 return;
321 }
322
323 void
324 bs_commit(
325 int nclusters)
326 {
327 BSL_LOCK();
328 clusters_committed += nclusters;
329 if (clusters_committed > clusters_available) {
330 if (verbose && !user_warned) {
331 user_warned = TRUE;
332 printf("%s%s - short of %d clusters.\n",
333 my_name,
334 "WARNING: paging space over-committed",
335 clusters_committed - clusters_available);
336 }
337 if (clusters_committed > clusters_committed_peak) {
338 clusters_committed_peak = clusters_committed;
339 }
340 } else {
341 if (verbose && user_warned) {
342 printf("%s%s - was short of up to %d clusters.\n",
343 my_name,
344 "paging space is OK now",
345 clusters_committed_peak - clusters_available);
346 user_warned = FALSE;
347 clusters_committed_peak = 0;
348 }
349 }
350 BSL_UNLOCK();
351
352 return;
353 }
354
355 int default_pager_info_verbose = 1;
356
357 void
358 bs_global_info(
359 uint64_t *totalp,
360 uint64_t *freep)
361 {
362 uint64_t pages_total, pages_free;
363 paging_segment_t ps;
364 int i;
365
366 PSL_LOCK();
367 pages_total = pages_free = 0;
368 for (i = 0; i <= paging_segment_max; i++) {
369 ps = paging_segments[i];
370 if (ps == PAGING_SEGMENT_NULL)
371 continue;
372
373 /*
374 * no need to lock: by the time this data
375 * gets back to any remote requestor it
376 * will be obsolete anyways
377 */
378 pages_total += ps->ps_pgnum;
379 pages_free += ps->ps_clcount << ps->ps_clshift;
380 DP_DEBUG(DEBUG_BS_INTERNAL,
381 ("segment #%d: %d total, %d free\n",
382 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
383 }
384 *totalp = pages_total;
385 *freep = pages_free;
386 if (verbose && user_warned && default_pager_info_verbose) {
387 if (clusters_available < clusters_committed) {
388 printf("%s %d clusters committed, %d available.\n",
389 my_name,
390 clusters_committed,
391 clusters_available);
392 }
393 }
394 PSL_UNLOCK();
395 }
396
397 backing_store_t backing_store_alloc(void); /* forward */
398
399 backing_store_t
400 backing_store_alloc(void)
401 {
402 backing_store_t bs;
403
404 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
405 if (bs == BACKING_STORE_NULL)
406 panic("backing_store_alloc: no memory");
407
408 BS_LOCK_INIT(bs);
409 bs->bs_port = MACH_PORT_NULL;
410 bs->bs_priority = 0;
411 bs->bs_clsize = 0;
412 bs->bs_pages_total = 0;
413 bs->bs_pages_in = 0;
414 bs->bs_pages_in_fail = 0;
415 bs->bs_pages_out = 0;
416 bs->bs_pages_out_fail = 0;
417
418 return bs;
419 }
420
421 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
422
423 /* In both the component space and external versions of this pager, */
424 /* backing_store_lookup will be called from tasks in the application space */
425 backing_store_t
426 backing_store_lookup(
427 MACH_PORT_FACE port)
428 {
429 backing_store_t bs;
430
431 /*
432 The port is currently backed with a vs structure in the alias field.
433 We could create an ISBS alias and a port_is_bs call, but frankly
434 I see no reason for the test; the bs->port == port check below
435 will work properly on junk entries.
436
437 if ((port == MACH_PORT_NULL) || port_is_vs(port))
438 */
439 if ((port == MACH_PORT_NULL))
440 return BACKING_STORE_NULL;
441
442 BSL_LOCK();
443 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
444 bs_links) {
445 BS_LOCK(bs);
446 if (bs->bs_port == port) {
447 BSL_UNLOCK();
448 /* Success, return it locked. */
449 return bs;
450 }
451 BS_UNLOCK(bs);
452 }
453 BSL_UNLOCK();
454 return BACKING_STORE_NULL;
455 }
456
457 void backing_store_add(backing_store_t); /* forward */
458
459 void
460 backing_store_add(
461 __unused backing_store_t bs)
462 {
463 // MACH_PORT_FACE port = bs->bs_port;
464 // MACH_PORT_FACE pset = default_pager_default_set;
465 kern_return_t kr = KERN_SUCCESS;
466
467 if (kr != KERN_SUCCESS)
468 panic("backing_store_add: add to set");
469
470 }
471
472 /*
473 * Set up default page shift, but only if not already
474 * set and argument is within range.
475 */
476 boolean_t
477 bs_set_default_clsize(unsigned int npages)
478 {
479 switch(npages){
480 case 1:
481 case 2:
482 case 4:
483 case 8:
484 if (default_pager_clsize == 0) /* if not yet set */
485 vstruct_def_clshift = local_log2(npages);
486 return(TRUE);
487 }
488 return(FALSE);
489 }
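
/*
 * Usage sketch (illustrative only, not compiled): only power-of-two
 * values up to 8 pages are accepted, and the shift only sticks while
 * default_pager_clsize is still 0.
 */
#if 0
	boolean_t ok;

	ok = bs_set_default_clsize(8);	/* TRUE: vstruct_def_clshift becomes 3,
					   provided default_pager_clsize was 0 */
	ok = bs_set_default_clsize(3);	/* FALSE: 3 is not 1, 2, 4 or 8 */
#endif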
490
491 int bs_get_global_clsize(int clsize); /* forward */
492
493 int
494 bs_get_global_clsize(
495 int clsize)
496 {
497 int i;
498 memory_object_default_t dmm;
499 kern_return_t kr;
500
501 /*
502 * Only allow setting of cluster size once. If called
503 * with no cluster size (default), we use the compiled-in default
504 * for the duration. The same cluster size is used for all
505 * paging segments.
506 */
507 if (default_pager_clsize == 0) {
508 /*
509 * Keep cluster size in bit shift because it's quicker
510 * arithmetic, and easier to keep at a power of 2.
511 */
512 if (clsize != NO_CLSIZE) {
513 for (i = 0; (1 << i) < clsize; i++);
514 if (i > MAX_CLUSTER_SHIFT)
515 i = MAX_CLUSTER_SHIFT;
516 vstruct_def_clshift = i;
517 }
518 default_pager_clsize = (1 << vstruct_def_clshift);
519
520 /*
521 * Let the user know the new (and definitive) cluster size.
522 */
523 if (verbose)
524 printf("%scluster size = %d page%s\n",
525 my_name, default_pager_clsize,
526 (default_pager_clsize == 1) ? "" : "s");
527
528 /*
529 * Let the kernel know too, in case it hasn't used the
530 * default value provided in main() yet.
531 */
532 dmm = default_pager_object;
533 clsize = default_pager_clsize * vm_page_size; /* in bytes */
534 kr = host_default_memory_manager(host_priv_self(),
535 &dmm,
536 clsize);
537 memory_object_default_deallocate(dmm);
538
539 if (kr != KERN_SUCCESS) {
540 panic("bs_get_global_cl_size:host_default_memory_manager");
541 }
542 if (dmm != default_pager_object) {
543 panic("bs_get_global_cl_size:there is another default pager");
544 }
545 }
546 ASSERT(default_pager_clsize > 0 &&
547 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
548
549 return default_pager_clsize;
550 }
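
/*
 * Worked example of the rounding above: a request of clsize = 5 pages
 * leaves the "(1 << i) < clsize" loop at i = 3, so the cluster size is
 * rounded up to 8 pages (and capped at MAX_CLUSTER_SHIFT); a request of
 * NO_CLSIZE keeps the compiled-in default of 1 << vstruct_def_clshift.
 */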
551
552 kern_return_t
553 default_pager_backing_store_create(
554 memory_object_default_t pager,
555 int priority,
556 int clsize, /* in bytes */
557 MACH_PORT_FACE *backing_store)
558 {
559 backing_store_t bs;
560 MACH_PORT_FACE port;
561 // kern_return_t kr;
562 struct vstruct_alias *alias_struct;
563
564 if (pager != default_pager_object)
565 return KERN_INVALID_ARGUMENT;
566
567 bs = backing_store_alloc();
568 port = ipc_port_alloc_kernel();
569 ipc_port_make_send(port);
570 assert (port != IP_NULL);
571
572 DP_DEBUG(DEBUG_BS_EXTERNAL,
573 ("priority=%d clsize=%d bs_port=0x%x\n",
574 priority, clsize, (int) backing_store));
575
576 alias_struct = (struct vstruct_alias *)
577 kalloc(sizeof (struct vstruct_alias));
578 if(alias_struct != NULL) {
579 alias_struct->vs = (struct vstruct *)bs;
580 alias_struct->name = &default_pager_ops;
581 port->alias = (uintptr_t) alias_struct;
582 }
583 else {
584 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
585 kfree(bs, sizeof (struct backing_store));
586 return KERN_RESOURCE_SHORTAGE;
587 }
588
589 bs->bs_port = port;
590 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
591 priority = BS_MAXPRI;
592 else if (priority == BS_NOPRI)
593 priority = BS_MAXPRI;
594 else
595 priority = BS_MINPRI;
596 bs->bs_priority = priority;
597
598 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
599
600 BSL_LOCK();
601 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
602 bs_links);
603 BSL_UNLOCK();
604
605 backing_store_add(bs);
606
607 *backing_store = port;
608 return KERN_SUCCESS;
609 }
610
611 kern_return_t
612 default_pager_backing_store_info(
613 MACH_PORT_FACE backing_store,
614 backing_store_flavor_t flavour,
615 backing_store_info_t info,
616 mach_msg_type_number_t *size)
617 {
618 backing_store_t bs;
619 backing_store_basic_info_t basic;
620 int i;
621 paging_segment_t ps;
622
623 if (flavour != BACKING_STORE_BASIC_INFO ||
624 *size < BACKING_STORE_BASIC_INFO_COUNT)
625 return KERN_INVALID_ARGUMENT;
626
627 basic = (backing_store_basic_info_t)info;
628 *size = BACKING_STORE_BASIC_INFO_COUNT;
629
630 VSTATS_LOCK(&global_stats.gs_lock);
631 basic->pageout_calls = global_stats.gs_pageout_calls;
632 basic->pagein_calls = global_stats.gs_pagein_calls;
633 basic->pages_in = global_stats.gs_pages_in;
634 basic->pages_out = global_stats.gs_pages_out;
635 basic->pages_unavail = global_stats.gs_pages_unavail;
636 basic->pages_init = global_stats.gs_pages_init;
637 basic->pages_init_writes= global_stats.gs_pages_init_writes;
638 VSTATS_UNLOCK(&global_stats.gs_lock);
639
640 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
641 return KERN_INVALID_ARGUMENT;
642
643 basic->bs_pages_total = bs->bs_pages_total;
644 PSL_LOCK();
645 bs->bs_pages_free = 0;
646 for (i = 0; i <= paging_segment_max; i++) {
647 ps = paging_segments[i];
648 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
649 PS_LOCK(ps);
650 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
651 PS_UNLOCK(ps);
652 }
653 }
654 PSL_UNLOCK();
655 basic->bs_pages_free = bs->bs_pages_free;
656 basic->bs_pages_in = bs->bs_pages_in;
657 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
658 basic->bs_pages_out = bs->bs_pages_out;
659 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
660
661 basic->bs_priority = bs->bs_priority;
662 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
663
664 BS_UNLOCK(bs);
665
666 return KERN_SUCCESS;
667 }
668
669 int ps_delete(paging_segment_t); /* forward */
670 boolean_t current_thread_aborted(void);
671
672 int
673 ps_delete(
674 paging_segment_t ps)
675 {
676 vstruct_t vs;
677 kern_return_t error = KERN_SUCCESS;
678 int vs_count;
679
680 VSL_LOCK(); /* get the lock on the list of vs's */
681
682 /* The lock relationship and sequence are fairly complicated. */
683 /* this code looks at a live list, locking and unlocking the list */
684 /* as it traverses it. It depends on the locking behavior of */
685 /* default_pager_no_senders. no_senders always locks the vstruct */
686 /* targeted for removal before locking the vstruct list. However */
687 /* it will remove that member of the list without locking its */
688 /* neighbors. We can be sure when we hold a lock on a vstruct */
689 /* it cannot be removed from the list but we must hold the list */
690 /* lock to be sure that its pointers to its neighbors are valid. */
691 /* Also, we can hold off destruction of a vstruct when the list */
692 /* lock and the vs locks are not being held by bumping the */
693 /* vs_async_pending count. */
694
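/*
 * The hold/release protocol described above, in outline (this is the
 * pattern repeated in the loop below):
 *
 *	VS_LOCK(vs);
 *	vs_async_wait(vs);		wait for pending async writes
 *	vs->vs_async_pending += 1;	pin vs against destruction
 *	VS_UNLOCK(vs);
 *	... work on vs with no locks held ...
 *	VS_LOCK(vs);
 *	vs->vs_async_pending -= 1;
 *	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
 *		vs->vs_waiting_async = FALSE;
 *		VS_UNLOCK(vs);
 *		thread_wakeup(&vs->vs_async_pending);
 *	} else
 *		VS_UNLOCK(vs);
 */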
695
696 while(backing_store_release_trigger_disable != 0) {
697 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
698 }
699
700 /* we will choose instead to hold a send right */
701 vs_count = vstruct_list.vsl_count;
702 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
703 if(vs == (vstruct_t)&vstruct_list) {
704 VSL_UNLOCK();
705 return KERN_SUCCESS;
706 }
707 VS_LOCK(vs);
708 vs_async_wait(vs); /* wait for any pending async writes */
709 if ((vs_count != 0) && (vs != NULL))
710 vs->vs_async_pending += 1; /* hold parties calling */
711 /* vs_async_wait */
712 VS_UNLOCK(vs);
713 VSL_UNLOCK();
714 while((vs_count != 0) && (vs != NULL)) {
715 /* We take the count of AMO's before beginning the */
716 /* transfer of the target segment. */
717 /* We are guaranteed that the target segment cannot get */
718 /* more users. We also know that queue entries are */
719 /* made at the back of the list. If some of the entries */
720 /* we would check disappear while we are traversing the */
721 /* list then we will either check new entries which */
722 /* do not have any backing store in the target segment */
723 /* or re-check old entries. This might not be optimal */
724 /* but it will always be correct. The alternative is to */
725 /* take a snapshot of the list. */
726 vstruct_t next_vs;
727
728 if(dp_pages_free < cluster_transfer_minimum)
729 error = KERN_FAILURE;
730 else {
731 vm_object_t transfer_object;
732 unsigned int count;
733 upl_t upl;
734
735 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
736 count = 0;
737 error = vm_object_upl_request(transfer_object,
738 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
739 &upl, NULL, &count,
740 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_LITE | UPL_SET_INTERNAL);
741
742 if(error == KERN_SUCCESS) {
743 error = ps_vstruct_transfer_from_segment(
744 vs, ps, upl);
745 upl_commit(upl, NULL, 0);
746 upl_deallocate(upl);
747 } else {
748 error = KERN_FAILURE;
749 }
750 vm_object_deallocate(transfer_object);
751 }
752 if(error || current_thread_aborted() || backing_store_stop_compaction) {
753 VS_LOCK(vs);
754 vs->vs_async_pending -= 1; /* release vs_async_wait */
755 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
756 vs->vs_waiting_async = FALSE;
757 VS_UNLOCK(vs);
758 thread_wakeup(&vs->vs_async_pending);
759 } else {
760 VS_UNLOCK(vs);
761 }
762 return KERN_FAILURE;
763 }
764
765 VSL_LOCK();
766
767 while(backing_store_release_trigger_disable != 0) {
768 VSL_SLEEP(&backing_store_release_trigger_disable,
769 THREAD_UNINT);
770 }
771
772 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
773 if((next_vs != (vstruct_t)&vstruct_list) &&
774 (vs != next_vs) && (vs_count != 1)) {
775 VS_LOCK(next_vs);
776 vs_async_wait(next_vs); /* wait for any */
777 /* pending async writes */
778 next_vs->vs_async_pending += 1; /* hold parties */
779 /* calling vs_async_wait */
780 VS_UNLOCK(next_vs);
781 }
782 VSL_UNLOCK();
783 VS_LOCK(vs);
784 vs->vs_async_pending -= 1;
785 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
786 vs->vs_waiting_async = FALSE;
787 VS_UNLOCK(vs);
788 thread_wakeup(&vs->vs_async_pending);
789 } else {
790 VS_UNLOCK(vs);
791 }
792 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
793 vs = NULL;
794 else
795 vs = next_vs;
796 vs_count--;
797 }
798 return KERN_SUCCESS;
799 }
800
801
802 kern_return_t
803 default_pager_backing_store_delete_internal(
804 MACH_PORT_FACE backing_store)
805 {
806 backing_store_t bs;
807 int i;
808 paging_segment_t ps;
809 int error;
810 int interim_pages_removed = 0;
811 boolean_t dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
812
813 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
814 return KERN_INVALID_ARGUMENT;
815
816 restart:
817 PSL_LOCK();
818 error = KERN_SUCCESS;
819 for (i = 0; i <= paging_segment_max; i++) {
820 ps = paging_segments[i];
821 if (ps != PAGING_SEGMENT_NULL &&
822 ps->ps_bs == bs &&
823 ! IS_PS_GOING_AWAY(ps)) {
824 PS_LOCK(ps);
825
826 if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
827 /*
828 * Someone is already busy reclaiming this paging segment.
829 * If it's the emergency segment we are looking at, then check
830 * that someone has not already recovered it and set the right
831 * state, i.e. online but not activated.
832 */
833 PS_UNLOCK(ps);
834 continue;
835 }
836
837 /* disable access to this segment */
838 ps->ps_state &= ~PS_CAN_USE;
839 ps->ps_state |= PS_GOING_AWAY;
840 PS_UNLOCK(ps);
841 /*
842 * The "ps" segment is "off-line" now,
843 * we can try and delete it...
844 */
845 if(dp_pages_free < (cluster_transfer_minimum
846 + ps->ps_pgcount)) {
847 error = KERN_FAILURE;
848 PSL_UNLOCK();
849 }
850 else {
851 /* remove all pages associated with the */
852 /* segment from the list of free pages */
853 /* when transfer is through, all target */
854 /* segment pages will appear to be free */
855
856 dp_pages_free -= ps->ps_pgcount;
857 interim_pages_removed += ps->ps_pgcount;
858 PSL_UNLOCK();
859 error = ps_delete(ps);
860 }
861 if (error != KERN_SUCCESS) {
862 /*
863 * We couldn't delete the segment,
864 * probably because there's not enough
865 * virtual memory left.
866 * Re-enable all the segments.
867 */
868 PSL_LOCK();
869 break;
870 }
871 goto restart;
872 }
873 }
874
875 if (error != KERN_SUCCESS) {
876 for (i = 0; i <= paging_segment_max; i++) {
877 ps = paging_segments[i];
878 if (ps != PAGING_SEGMENT_NULL &&
879 ps->ps_bs == bs &&
880 IS_PS_GOING_AWAY(ps)) {
881 PS_LOCK(ps);
882
883 if( !IS_PS_GOING_AWAY(ps)) {
884 PS_UNLOCK(ps);
885 continue;
886 }
887 /* Handle the special clusters that came in while we let go of the lock */
888 if( ps->ps_special_clusters) {
889 dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
890 ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
891 ps->ps_clcount += ps->ps_special_clusters;
892 if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
893 ps_select_array[ps->ps_bs->bs_priority] = 0;
894 }
895 ps->ps_special_clusters = 0;
896 }
897 /* re-enable access to this segment */
898 ps->ps_state &= ~PS_GOING_AWAY;
899 ps->ps_state |= PS_CAN_USE;
900 PS_UNLOCK(ps);
901 }
902 }
903 dp_pages_free += interim_pages_removed;
904 PSL_UNLOCK();
905 BS_UNLOCK(bs);
906 return error;
907 }
908
909 for (i = 0; i <= paging_segment_max; i++) {
910 ps = paging_segments[i];
911 if (ps != PAGING_SEGMENT_NULL &&
912 ps->ps_bs == bs) {
913 if(IS_PS_GOING_AWAY(ps)) {
914 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
915 PS_LOCK(ps);
916 ps->ps_state &= ~PS_GOING_AWAY;
917 ps->ps_special_clusters = 0;
918 ps->ps_pgcount = ps->ps_pgnum;
919 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
920 PS_UNLOCK(ps);
921 dp_pages_reserve += interim_pages_removed;
922 } else {
923 paging_segments[i] = PAGING_SEGMENT_NULL;
924 paging_segment_count--;
925 PS_LOCK(ps);
926 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
927 kfree(ps, sizeof *ps);
928 }
929 }
930 }
931 }
932
933 /* Scan the entire ps array separately to make certain we find the */
934 /* proper paging_segment_max */
935 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
936 if(paging_segments[i] != PAGING_SEGMENT_NULL)
937 paging_segment_max = i;
938 }
939
940 PSL_UNLOCK();
941
942 if( dealing_with_emergency_segment ) {
943 BS_UNLOCK(bs);
944 return KERN_SUCCESS;
945 }
946
947 /*
948 * All the segments have been deleted.
949 * We can remove the backing store.
950 */
951
952 /*
953 * Disable lookups of this backing store.
954 */
955 if((void *)bs->bs_port->alias != NULL)
956 kfree((void *) bs->bs_port->alias,
957 sizeof (struct vstruct_alias));
958 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
959 bs->bs_port = MACH_PORT_NULL;
960 BS_UNLOCK(bs);
961
962 /*
963 * Remove backing store from backing_store list.
964 */
965 BSL_LOCK();
966 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
967 bs_links);
968 BSL_UNLOCK();
969
970 /*
971 * Free the backing store structure.
972 */
973 kfree(bs, sizeof *bs);
974
975 return KERN_SUCCESS;
976 }
977
978 kern_return_t
979 default_pager_backing_store_delete(
980 MACH_PORT_FACE backing_store)
981 {
982 if( backing_store != emergency_segment_backing_store ) {
983 default_pager_backing_store_delete_internal(emergency_segment_backing_store);
984 }
985 return(default_pager_backing_store_delete_internal(backing_store));
986 }
987
988 int ps_enter(paging_segment_t); /* forward */
989
990 int
991 ps_enter(
992 paging_segment_t ps)
993 {
994 int i;
995
996 PSL_LOCK();
997
998 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
999 if (paging_segments[i] == PAGING_SEGMENT_NULL)
1000 break;
1001 }
1002
1003 if (i < MAX_NUM_PAGING_SEGMENTS) {
1004 paging_segments[i] = ps;
1005 if (i > paging_segment_max)
1006 paging_segment_max = i;
1007 paging_segment_count++;
1008 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1009 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1010 ps_select_array[ps->ps_bs->bs_priority] = 0;
1011 i = 0;
1012 } else {
1013 PSL_UNLOCK();
1014 return KERN_RESOURCE_SHORTAGE;
1015 }
1016
1017 PSL_UNLOCK();
1018 return i;
1019 }
1020
1021 #ifdef DEVICE_PAGING
1022 kern_return_t
1023 default_pager_add_segment(
1024 MACH_PORT_FACE backing_store,
1025 MACH_PORT_FACE device,
1026 recnum_t offset,
1027 recnum_t count,
1028 int record_size)
1029 {
1030 backing_store_t bs;
1031 paging_segment_t ps;
1032 int i;
1033 int error;
1034
1035 if ((bs = backing_store_lookup(backing_store))
1036 == BACKING_STORE_NULL)
1037 return KERN_INVALID_ARGUMENT;
1038
1039 PSL_LOCK();
1040 for (i = 0; i <= paging_segment_max; i++) {
1041 ps = paging_segments[i];
1042 if (ps == PAGING_SEGMENT_NULL)
1043 continue;
1044
1045 /*
1046 * Check for overlap on same device.
1047 */
1048 if (!(ps->ps_device != device
1049 || offset >= ps->ps_offset + ps->ps_recnum
1050 || offset + count <= ps->ps_offset)) {
1051 PSL_UNLOCK();
1052 BS_UNLOCK(bs);
1053 return KERN_INVALID_ARGUMENT;
1054 }
1055 }
1056 PSL_UNLOCK();
1057
1058 /*
1059 * Set up the paging segment
1060 */
1061 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1062 if (ps == PAGING_SEGMENT_NULL) {
1063 BS_UNLOCK(bs);
1064 return KERN_RESOURCE_SHORTAGE;
1065 }
1066
1067 ps->ps_segtype = PS_PARTITION;
1068 ps->ps_device = device;
1069 ps->ps_offset = offset;
1070 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1071 ps->ps_recnum = count;
1072 ps->ps_pgnum = count >> ps->ps_record_shift;
1073
1074 ps->ps_pgcount = ps->ps_pgnum;
1075 ps->ps_clshift = local_log2(bs->bs_clsize);
1076 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1077 ps->ps_hint = 0;
1078
1079 PS_LOCK_INIT(ps);
1080 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1081 if (!ps->ps_bmap) {
1082 kfree(ps, sizeof *ps);
1083 BS_UNLOCK(bs);
1084 return KERN_RESOURCE_SHORTAGE;
1085 }
1086 for (i = 0; i < ps->ps_ncls; i++) {
1087 clrbit(ps->ps_bmap, i);
1088 }
1089
1090 if(paging_segment_count == 0) {
1091 ps->ps_state = PS_EMERGENCY_SEGMENT;
1092 if(use_emergency_swap_file_first) {
1093 ps->ps_state |= PS_CAN_USE;
1094 }
1095 } else {
1096 ps->ps_state = PS_CAN_USE;
1097 }
1098
1099 ps->ps_bs = bs;
1100
1101 if ((error = ps_enter(ps)) != 0) {
1102 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1103 kfree(ps, sizeof *ps);
1104 BS_UNLOCK(bs);
1105 return KERN_RESOURCE_SHORTAGE;
1106 }
1107
1108 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1109 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1110 BS_UNLOCK(bs);
1111
1112 PSL_LOCK();
1113 if(IS_PS_OK_TO_USE(ps)) {
1114 dp_pages_free += ps->ps_pgcount;
1115 } else {
1116 dp_pages_reserve += ps->ps_pgcount;
1117 }
1118 PSL_UNLOCK();
1119
1120 bs_more_space(ps->ps_clcount);
1121
1122 DP_DEBUG(DEBUG_BS_INTERNAL,
1123 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1124 device, offset, count, record_size,
1125 ps->ps_record_shift, ps->ps_pgnum));
1126
1127 return KERN_SUCCESS;
1128 }
1129
1130 boolean_t
1131 bs_add_device(
1132 char *dev_name,
1133 MACH_PORT_FACE master)
1134 {
1135 security_token_t null_security_token = {
1136 { 0, 0 }
1137 };
1138 MACH_PORT_FACE device;
1139 int info[DEV_GET_SIZE_COUNT];
1140 mach_msg_type_number_t info_count;
1141 MACH_PORT_FACE bs = MACH_PORT_NULL;
1142 unsigned int rec_size;
1143 recnum_t count;
1144 int clsize;
1145 MACH_PORT_FACE reply_port;
1146
1147 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1148 null_security_token, dev_name, &device))
1149 return FALSE;
1150
1151 info_count = DEV_GET_SIZE_COUNT;
1152 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1153 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1154 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1155 clsize = bs_get_global_clsize(0);
1156 if (!default_pager_backing_store_create(
1157 default_pager_object,
1158 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1159 (clsize * vm_page_size),
1160 &bs)) {
1161 if (!default_pager_add_segment(bs, device,
1162 0, count, rec_size)) {
1163 return TRUE;
1164 }
1165 ipc_port_release_receive(bs);
1166 }
1167 }
1168
1169 ipc_port_release_send(device);
1170 return FALSE;
1171 }
1172 #endif /* DEVICE_PAGING */
1173
1174 #if VS_ASYNC_REUSE
1175
1176 struct vs_async *
1177 vs_alloc_async(void)
1178 {
1179 struct vs_async *vsa;
1180 MACH_PORT_FACE reply_port;
1181 // kern_return_t kr;
1182
1183 VS_ASYNC_LOCK();
1184 if (vs_async_free_list == NULL) {
1185 VS_ASYNC_UNLOCK();
1186 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1187 if (vsa != NULL) {
1188 /*
1189 * Try allocating a reply port named after the
1190 * address of the vs_async structure.
1191 */
1192 struct vstruct_alias *alias_struct;
1193
1194 reply_port = ipc_port_alloc_kernel();
1195 alias_struct = (struct vstruct_alias *)
1196 kalloc(sizeof (struct vstruct_alias));
1197 if(alias_struct != NULL) {
1198 alias_struct->vs = (struct vstruct *)vsa;
1199 alias_struct->name = &default_pager_ops;
1200 reply_port->alias = (uintptr_t) alias_struct;
1201 vsa->reply_port = reply_port;
1202 vs_alloc_async_count++;
1203 }
1204 else {
1205 vs_alloc_async_failed++;
1206 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1207 (reply_port));
1208 kfree(vsa, sizeof (struct vs_async));
1209 vsa = NULL;
1210 }
1211 }
1212 } else {
1213 vsa = vs_async_free_list;
1214 vs_async_free_list = vs_async_free_list->vsa_next;
1215 VS_ASYNC_UNLOCK();
1216 }
1217
1218 return vsa;
1219 }
1220
1221 void
1222 vs_free_async(
1223 struct vs_async *vsa)
1224 {
1225 VS_ASYNC_LOCK();
1226 vsa->vsa_next = vs_async_free_list;
1227 vs_async_free_list = vsa;
1228 VS_ASYNC_UNLOCK();
1229 }
1230
1231 #else /* VS_ASYNC_REUSE */
1232
1233 struct vs_async *
1234 vs_alloc_async(void)
1235 {
1236 struct vs_async *vsa;
1237 MACH_PORT_FACE reply_port;
1238 struct vstruct_alias *alias_struct;
1239
1240 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1241 if (vsa != NULL) {
1242 /*
1243 * Try allocating a reply port named after the
1244 * address of the vs_async structure.
1245 */
1246 reply_port = ipc_port_alloc_kernel();
1247 alias_struct = (struct vstruct_alias *)
1248 kalloc(sizeof (struct vstruct_alias));
1249 if(alias_struct != NULL) {
1250 alias_struct->vs = (struct vstruct *)vsa;
1251 alias_struct->name = &default_pager_ops;
1252 reply_port->alias = (uintptr_t) alias_struct;
1253 vsa->reply_port = reply_port;
1254 vs_alloc_async_count++;
1255 }
1256 else {
1257 vs_alloc_async_failed++;
1258 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1259 (reply_port));
1260 kfree(vsa, sizeof (struct vs_async));
1261 vsa = NULL;
1262 }
1263 }
1264
1265 return vsa;
1266 }
1267
1268 void
1269 vs_free_async(
1270 struct vs_async *vsa)
1271 {
1272 MACH_PORT_FACE reply_port;
1273 kern_return_t kr;
1274
1275 reply_port = vsa->reply_port;
1276 kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1277 kfree(vsa, sizeof (struct vs_async));
1278 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1279 #if 0
1280 VS_ASYNC_LOCK();
1281 vs_alloc_async_count--;
1282 VS_ASYNC_UNLOCK();
1283 #endif
1284 }
1285
1286 #endif /* VS_ASYNC_REUSE */
1287
1288 zone_t vstruct_zone;
1289
1290 vstruct_t
1291 ps_vstruct_create(
1292 dp_size_t size)
1293 {
1294 vstruct_t vs;
1295 unsigned int i;
1296
1297 vs = (vstruct_t) zalloc(vstruct_zone);
1298 if (vs == VSTRUCT_NULL) {
1299 return VSTRUCT_NULL;
1300 }
1301
1302 VS_LOCK_INIT(vs);
1303
1304 /*
1305 * The following fields will be provided later.
1306 */
1307 vs->vs_pager_ops = NULL;
1308 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1309 vs->vs_references = 1;
1310 vs->vs_seqno = 0;
1311
1312 vs->vs_waiting_seqno = FALSE;
1313 vs->vs_waiting_read = FALSE;
1314 vs->vs_waiting_write = FALSE;
1315 vs->vs_waiting_async = FALSE;
1316
1317 vs->vs_readers = 0;
1318 vs->vs_writers = 0;
1319
1320 vs->vs_errors = 0;
1321
1322 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1323 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1324 vs->vs_async_pending = 0;
1325
1326 /*
1327 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1328 * depending on the size of the memory object.
1329 */
1330 if (INDIRECT_CLMAP(vs->vs_size)) {
1331 vs->vs_imap = (struct vs_map **)
1332 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1333 vs->vs_indirect = TRUE;
1334 } else {
1335 vs->vs_dmap = (struct vs_map *)
1336 kalloc(CLMAP_SIZE(vs->vs_size));
1337 vs->vs_indirect = FALSE;
1338 }
1339 vs->vs_xfer_pending = FALSE;
1340 DP_DEBUG(DEBUG_VS_INTERNAL,
1341 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1342
1343 /*
1344 * Check to see that we got the space.
1345 */
1346 if (!vs->vs_dmap) {
1347 kfree(vs, sizeof *vs);
1348 return VSTRUCT_NULL;
1349 }
1350
1351 /*
1352 * Zero the indirect pointers, or clear the direct pointers.
1353 */
1354 if (vs->vs_indirect)
1355 memset(vs->vs_imap, 0,
1356 INDIRECT_CLMAP_SIZE(vs->vs_size));
1357 else
1358 for (i = 0; i < vs->vs_size; i++)
1359 VSM_CLR(vs->vs_dmap[i]);
1360
1361 VS_MAP_LOCK_INIT(vs);
1362
1363 bs_commit(vs->vs_size);
1364
1365 return vs;
1366 }
1367
1368 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1369
1370 paging_segment_t
1371 ps_select_segment(
1372 unsigned int shift,
1373 int *psindex)
1374 {
1375 paging_segment_t ps;
1376 int i;
1377 int j;
1378
1379 /*
1380 * Optimize case where there's only one segment.
1381 * paging_segment_max will index the one and only segment.
1382 */
1383
1384 PSL_LOCK();
1385 if (paging_segment_count == 1) {
1386 paging_segment_t lps = PAGING_SEGMENT_NULL; /* used to avoid extra PS_UNLOCK */
1387 ipc_port_t trigger = IP_NULL;
1388
1389 ps = paging_segments[paging_segment_max];
1390 *psindex = paging_segment_max;
1391 PS_LOCK(ps);
1392 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1393 panic("Emergency paging segment missing\n");
1394 }
1395 ASSERT(ps->ps_clshift >= shift);
1396 if(IS_PS_OK_TO_USE(ps)) {
1397 if (ps->ps_clcount) {
1398 ps->ps_clcount--;
1399 dp_pages_free -= 1 << ps->ps_clshift;
1400 ps->ps_pgcount -= 1 << ps->ps_clshift;
1401 if(min_pages_trigger_port &&
1402 (dp_pages_free < minimum_pages_remaining)) {
1403 trigger = min_pages_trigger_port;
1404 min_pages_trigger_port = NULL;
1405 bs_low = TRUE;
1406 }
1407 lps = ps;
1408 }
1409 }
1410 PS_UNLOCK(ps);
1411
1412 if( lps == PAGING_SEGMENT_NULL ) {
1413 if(dp_pages_free) {
1414 dp_pages_free_drift_count++;
1415 if(dp_pages_free > dp_pages_free_drifted_max) {
1416 dp_pages_free_drifted_max = dp_pages_free;
1417 }
1418 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1419 }
1420 dp_pages_free = 0;
1421 }
1422
1423 PSL_UNLOCK();
1424
1425 if (trigger != IP_NULL) {
1426 default_pager_space_alert(trigger, HI_WAT_ALERT);
1427 ipc_port_release_send(trigger);
1428 }
1429 return lps;
1430 }
1431
1432 if (paging_segment_count == 0) {
1433 if(dp_pages_free) {
1434 dp_pages_free_drift_count++;
1435 if(dp_pages_free > dp_pages_free_drifted_max) {
1436 dp_pages_free_drifted_max = dp_pages_free;
1437 }
1438 dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1439 }
1440 dp_pages_free = 0;
1441 PSL_UNLOCK();
1442 return PAGING_SEGMENT_NULL;
1443 }
1444
1445 for (i = BS_MAXPRI;
1446 i >= BS_MINPRI; i--) {
1447 int start_index;
1448
1449 if ((ps_select_array[i] == BS_NOPRI) ||
1450 (ps_select_array[i] == BS_FULLPRI))
1451 continue;
1452 start_index = ps_select_array[i];
1453
1454 if(!(paging_segments[start_index])) {
1455 j = start_index+1;
1456 physical_transfer_cluster_count = 0;
1457 }
1458 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1459 (((paging_segments[start_index])->ps_clshift)
1460 + vm_page_shift))) {
1461 physical_transfer_cluster_count = 0;
1462 j = start_index + 1;
1463 } else {
1464 physical_transfer_cluster_count+=1;
1465 j = start_index;
1466 if(start_index == 0)
1467 start_index = paging_segment_max;
1468 else
1469 start_index = start_index - 1;
1470 }
1471
1472 while (1) {
1473 if (j > paging_segment_max)
1474 j = 0;
1475 if ((ps = paging_segments[j]) &&
1476 (ps->ps_bs->bs_priority == i)) {
1477 /*
1478 * Force the ps cluster size to be
1479 * >= that of the vstruct.
1480 */
1481 PS_LOCK(ps);
1482 if (IS_PS_OK_TO_USE(ps)) {
1483 if ((ps->ps_clcount) &&
1484 (ps->ps_clshift >= shift)) {
1485 ipc_port_t trigger = IP_NULL;
1486
1487 ps->ps_clcount--;
1488 dp_pages_free -= 1 << ps->ps_clshift;
1489 ps->ps_pgcount -= 1 << ps->ps_clshift;
1490 if(min_pages_trigger_port &&
1491 (dp_pages_free <
1492 minimum_pages_remaining)) {
1493 trigger = min_pages_trigger_port;
1494 min_pages_trigger_port = NULL;
1495 }
1496 PS_UNLOCK(ps);
1497 /*
1498 * found one, quit looking.
1499 */
1500 ps_select_array[i] = j;
1501 PSL_UNLOCK();
1502
1503 if (trigger != IP_NULL) {
1504 default_pager_space_alert(
1505 trigger,
1506 HI_WAT_ALERT);
1507 ipc_port_release_send(trigger);
1508 }
1509 *psindex = j;
1510 return ps;
1511 }
1512 }
1513 PS_UNLOCK(ps);
1514 }
1515 if (j == start_index) {
1516 /*
1517 * none at this priority -- mark it full
1518 */
1519 ps_select_array[i] = BS_FULLPRI;
1520 break;
1521 }
1522 j++;
1523 }
1524 }
1525
1526 if(dp_pages_free) {
1527 dp_pages_free_drift_count++;
1528 if(dp_pages_free > dp_pages_free_drifted_max) {
1529 dp_pages_free_drifted_max = dp_pages_free;
1530 }
1531 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1532 }
1533 dp_pages_free = 0;
1534 PSL_UNLOCK();
1535 return PAGING_SEGMENT_NULL;
1536 }
1537
1538 dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1539
1540 dp_offset_t
1541 ps_allocate_cluster(
1542 vstruct_t vs,
1543 int *psindex,
1544 paging_segment_t use_ps)
1545 {
1546 unsigned int byte_num;
1547 int bit_num = 0;
1548 paging_segment_t ps;
1549 dp_offset_t cluster;
1550 ipc_port_t trigger = IP_NULL;
1551
1552 /*
1553 * Find best paging segment.
1554 * ps_select_segment will decrement cluster count on ps.
1555 * Must pass cluster shift to find the most appropriate segment.
1556 */
1557 /* NOTE: The addition of paging segment delete capability threatened
1558 * to seriously complicate the treatment of paging segments in this
1559 * module and the ones that call it (notably ps_clmap), because of the
1560 * difficulty in assuring that the paging segment would continue to
1561 * exist between being unlocked and locked. This was
1562 * avoided because all calls to this module are based either in
1563 * dp_memory_object calls, which rely on the vs lock, or in
1564 * the transfer function, which is part of the segment delete path.
1565 * The transfer function which is part of paging segment delete is
1566 * protected from multiple callers by the backing store lock.
1567 * The paging segment delete function treats mappings to a paging
1568 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1569 * while data is transferred to the remaining segments. This is in
1570 * line with the view that incomplete or in-transition mappings between
1571 * data, a vstruct, and backing store are protected by the vs lock.
1572 * This and the ordering of the paging segment "going_away" bit setting
1573 * protects us.
1574 */
1575 retry:
1576 if (use_ps != PAGING_SEGMENT_NULL) {
1577 ps = use_ps;
1578 PSL_LOCK();
1579 PS_LOCK(ps);
1580
1581 ASSERT(ps->ps_clcount != 0);
1582
1583 ps->ps_clcount--;
1584 dp_pages_free -= 1 << ps->ps_clshift;
1585 ps->ps_pgcount -= 1 << ps->ps_clshift;
1586 if(min_pages_trigger_port &&
1587 (dp_pages_free < minimum_pages_remaining)) {
1588 trigger = min_pages_trigger_port;
1589 min_pages_trigger_port = NULL;
1590 }
1591 PSL_UNLOCK();
1592 PS_UNLOCK(ps);
1593 if (trigger != IP_NULL) {
1594 default_pager_space_alert(trigger, HI_WAT_ALERT);
1595 ipc_port_release_send(trigger);
1596 }
1597
1598 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1599 PAGING_SEGMENT_NULL) {
1600 static clock_sec_t lastnotify = 0;
1601 clock_sec_t now;
1602 clock_nsec_t nanoseconds_dummy;
1603
1604 /*
1605 * Don't immediately jump to the emergency segment. Give the
1606 * dynamic pager a chance to create its first normal swap file.
1607 * Unless, of course, the very first normal swap file can't be
1608 * created due to some problem that we didn't expect, i.e.
1609 * use_emergency_swap_file_first was never set to true initially;
1610 * it then gets set in the swap file creation error handling.
1611 */
1612 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1613
1614 ps = paging_segments[EMERGENCY_PSEG_INDEX];
1615 if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1616 PSL_LOCK();
1617 PS_LOCK(ps);
1618
1619 if(IS_PS_GOING_AWAY(ps)) {
1620 /* Someone de-activated the emergency paging segment*/
1621 PS_UNLOCK(ps);
1622 PSL_UNLOCK();
1623
1624 } else if(dp_pages_free) {
1625 /*
1626 * Someone has already activated the emergency paging segment
1627 * OR
1628 * Between us having received a NULL segment from ps_select_segment
1629 * and reaching here, a new normal segment could have been added.
1630 * E.g. we get NULL segment and another thread just added the
1631 * new swap file. Hence check to see if we have more dp_pages_free
1632 * before activating the emergency segment.
1633 */
1634 PS_UNLOCK(ps);
1635 PSL_UNLOCK();
1636 goto retry;
1637
1638 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1639 /*
1640 * PS_CAN_USE is only reset from the emergency segment when it's
1641 * been successfully recovered. So it's legal to have an emergency
1642 * segment that has PS_CAN_USE but no clusters because it's recovery
1643 * failed.
1644 */
1645 backing_store_t bs = ps->ps_bs;
1646 ps->ps_state |= PS_CAN_USE;
1647 if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1648 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1649 ps_select_array[bs->bs_priority] = 0;
1650 }
1651 dp_pages_free += ps->ps_pgcount;
1652 dp_pages_reserve -= ps->ps_pgcount;
1653 PS_UNLOCK(ps);
1654 PSL_UNLOCK();
1655 dprintf(("Switching ON Emergency paging segment\n"));
1656 goto retry;
1657 }
1658
1659 PS_UNLOCK(ps);
1660 PSL_UNLOCK();
1661 }
1662 }
1663
1664 /*
1665 * Emit a notification of the low-paging resource condition
1666 * but don't issue it more than once every five seconds. This
1667 * prevents us from overflowing logs with thousands of
1668 * repetitions of the message.
1669 */
1670 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1671 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1672 /* With an activated emergency paging segment we still
1673 * didn't get any clusters. This could mean that the
1674 * emergency paging segment is exhausted.
1675 */
1676 dprintf(("System is out of paging space.\n"));
1677 lastnotify = now;
1678 }
1679
1680 PSL_LOCK();
1681
1682 if(min_pages_trigger_port) {
1683 trigger = min_pages_trigger_port;
1684 min_pages_trigger_port = NULL;
1685 bs_low = TRUE;
1686 }
1687 PSL_UNLOCK();
1688 if (trigger != IP_NULL) {
1689 default_pager_space_alert(trigger, HI_WAT_ALERT);
1690 ipc_port_release_send(trigger);
1691 }
1692 return (dp_offset_t) -1;
1693 }
1694
1695 /*
1696 * Look for an available cluster. At the end of the loop,
1697 * byte_num is the byte offset and bit_num is the bit offset of the
1698 * first zero bit in the paging segment bitmap.
1699 */
1700 PS_LOCK(ps);
1701 byte_num = ps->ps_hint;
1702 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1703 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1704 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1705 if (isclr((ps->ps_bmap + byte_num), bit_num))
1706 break;
1707 }
1708 ASSERT(bit_num != NBBY);
1709 break;
1710 }
1711 }
1712 ps->ps_hint = byte_num;
1713 cluster = (byte_num*NBBY) + bit_num;
1714
1715 /* Space was reserved, so this must be true */
1716 ASSERT(cluster < ps->ps_ncls);
1717
1718 setbit(ps->ps_bmap, cluster);
1719 PS_UNLOCK(ps);
1720
1721 return cluster;
1722 }
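
/*
 * Worked example of the bitmap search above: if the first byte of
 * ps_bmap with a clear bit is byte_num = 3 and the first clear bit in it
 * is bit_num = 5, the allocated cluster is 3 * NBBY + 5 == 29 (NBBY being
 * the number of bits per byte, 8). A byte equal to BYTEMASK has all of
 * its clusters in use and is skipped.
 */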
1723
1724 void ps_deallocate_cluster(paging_segment_t, dp_offset_t); /* forward */
1725
1726 void
1727 ps_deallocate_cluster(
1728 paging_segment_t ps,
1729 dp_offset_t cluster)
1730 {
1731
1732 if (cluster >= ps->ps_ncls)
1733 panic("ps_deallocate_cluster: Invalid cluster number");
1734
1735 /*
1736 * Lock the paging segment, clear the cluster's bit in the bitmap and
1737 * increment the number of free clusters.
1738 */
1739 PSL_LOCK();
1740 PS_LOCK(ps);
1741 clrbit(ps->ps_bmap, cluster);
1742 if( IS_PS_OK_TO_USE(ps)) {
1743 ++ps->ps_clcount;
1744 ps->ps_pgcount += 1 << ps->ps_clshift;
1745 dp_pages_free += 1 << ps->ps_clshift;
1746 } else {
1747 ps->ps_special_clusters += 1;
1748 }
1749
1750 /*
1751 * Move the hint down to the freed cluster if it is
1752 * less than the current hint.
1753 */
1754 if ((cluster/NBBY) < ps->ps_hint) {
1755 ps->ps_hint = (cluster/NBBY);
1756 }
1757
1758
1759 /*
1760 * If we're freeing space on a full priority, reset the array.
1761 */
1762 if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1763 ps_select_array[ps->ps_bs->bs_priority] = 0;
1764 PS_UNLOCK(ps);
1765 PSL_UNLOCK();
1766
1767 return;
1768 }
1769
1770 void ps_dealloc_vsmap(struct vs_map *, dp_size_t); /* forward */
1771
1772 void
1773 ps_dealloc_vsmap(
1774 struct vs_map *vsmap,
1775 dp_size_t size)
1776 {
1777 unsigned int i;
1778 for (i = 0; i < size; i++)
1779 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1780 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1781 VSM_CLOFF(vsmap[i]));
1782 }
1783
1784 void
1785 ps_vstruct_dealloc(
1786 vstruct_t vs)
1787 {
1788 unsigned int i;
1789 // spl_t s;
1790
1791 VS_MAP_LOCK(vs);
1792
1793 /*
1794 * If this is an indirect structure, then we walk through the valid
1795 * (non-zero) indirect pointers and deallocate the clusters
1796 * associated with each used map entry (via ps_dealloc_vsmap).
1797 * When all of the clusters in an indirect block have been
1798 * freed, we deallocate the block. When all of the indirect
1799 * blocks have been deallocated we deallocate the memory
1800 * holding the indirect pointers.
1801 */
1802 if (vs->vs_indirect) {
1803 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1804 if (vs->vs_imap[i] != NULL) {
1805 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1806 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1807 }
1808 }
1809 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1810 } else {
1811 /*
1812 * Direct map. Free used clusters, then memory.
1813 */
1814 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1815 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1816 }
1817 VS_MAP_UNLOCK(vs);
1818
1819 bs_commit(- vs->vs_size);
1820
1821 zfree(vstruct_zone, vs);
1822 }
1823
1824 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1825
1826 int ps_map_extend(
1827 vstruct_t vs,
1828 unsigned int new_size)
1829 {
1830 struct vs_map **new_imap;
1831 struct vs_map *new_dmap = NULL;
1832 int newdsize;
1833 int i;
1834 void *old_map = NULL;
1835 int old_map_size = 0;
1836
1837 if (vs->vs_size >= new_size) {
1838 /*
1839 * Someone has already done the work.
1840 */
1841 return 0;
1842 }
1843
1844 /*
1845 * If the new size extends into the indirect range, then we have one
1846 * of two cases: we are going from indirect to indirect, or we are
1847 * going from direct to indirect. If we are going from indirect to
1848 * indirect, then it is possible that the new size will fit in the old
1849 * indirect map. If this is the case, then just reset the size of the
1850 * vstruct map and we are done. If the new size will not
1851 * fit into the old indirect map, then we have to allocate a new
1852 * indirect map and copy the old map pointers into this new map.
1853 *
1854 * If we are going from direct to indirect, then we have to allocate a
1855 * new indirect map and copy the old direct pages into the first
1856 * indirect page of the new map.
1857 * NOTE: allocating memory here is dangerous, as we're in the
1858 * pageout path.
1859 */
1860 if (INDIRECT_CLMAP(new_size)) {
1861 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1862
1863 /*
1864 * Get a new indirect map and zero it.
1865 */
1866 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1867 if (vs->vs_indirect &&
1868 (new_map_size == old_map_size)) {
1869 bs_commit(new_size - vs->vs_size);
1870 vs->vs_size = new_size;
1871 return 0;
1872 }
1873
1874 new_imap = (struct vs_map **)kalloc(new_map_size);
1875 if (new_imap == NULL) {
1876 return -1;
1877 }
1878 memset(new_imap, 0, new_map_size);
1879
1880 if (vs->vs_indirect) {
1881 /* Copy old entries into new map */
1882 memcpy(new_imap, vs->vs_imap, old_map_size);
1883 /* Arrange to free the old map */
1884 old_map = (void *) vs->vs_imap;
1885 newdsize = 0;
1886 } else { /* Old map was a direct map */
1887 /* Allocate an indirect page */
1888 if ((new_imap[0] = (struct vs_map *)
1889 kalloc(CLMAP_THRESHOLD)) == NULL) {
1890 kfree(new_imap, new_map_size);
1891 return -1;
1892 }
1893 new_dmap = new_imap[0];
1894 newdsize = CLMAP_ENTRIES;
1895 }
1896 } else {
1897 new_imap = NULL;
1898 newdsize = new_size;
1899 /*
1900 * If the new map is a direct map, then the old map must
1901 * also have been a direct map. All we have to do is
1902 * to allocate a new direct map, copy the old entries
1903 * into it and free the old map.
1904 */
1905 if ((new_dmap = (struct vs_map *)
1906 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1907 return -1;
1908 }
1909 }
1910 if (newdsize) {
1911
1912 /* Free the old map */
1913 old_map = (void *) vs->vs_dmap;
1914 old_map_size = CLMAP_SIZE(vs->vs_size);
1915
1916 /* Copy info from the old map into the new map */
1917 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1918
1919 /* Initialize the rest of the new map */
1920 for (i = vs->vs_size; i < newdsize; i++)
1921 VSM_CLR(new_dmap[i]);
1922 }
1923 if (new_imap) {
1924 vs->vs_imap = new_imap;
1925 vs->vs_indirect = TRUE;
1926 } else
1927 vs->vs_dmap = new_dmap;
1928 bs_commit(new_size - vs->vs_size);
1929 vs->vs_size = new_size;
1930 if (old_map)
1931 kfree(old_map, old_map_size);
1932 return 0;
1933 }
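
/*
 * Caller's sketch (cf. ps_clmap() below): the map is grown lazily when a
 * write targets a cluster index beyond the current map size.
 */
#if 0
	if (cluster >= vs->vs_size) {
		if (ps_map_extend(vs, cluster + 1)) {
			/* out of memory: the caller fails the operation */
			return (dp_offset_t) -1;
		}
	}
#endif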
1934
1935 dp_offset_t
1936 ps_clmap(
1937 vstruct_t vs,
1938 dp_offset_t offset,
1939 struct clmap *clmap,
1940 int flag,
1941 dp_size_t size,
1942 int error)
1943 {
1944 dp_offset_t cluster; /* The cluster of offset. */
1945 dp_offset_t newcl; /* The new cluster allocated. */
1946 dp_offset_t newoff;
1947 unsigned int i;
1948 struct vs_map *vsmap;
1949
1950 VS_MAP_LOCK(vs);
1951
1952 ASSERT(vs->vs_dmap);
1953 cluster = atop_32(offset) >> vs->vs_clshift;
1954
1955 /*
1956 * Initialize cluster error value
1957 */
1958 clmap->cl_error = 0;
1959
1960 /*
1961 * If the object has grown, extend the page map.
1962 */
1963 if (cluster >= vs->vs_size) {
1964 if (flag == CL_FIND) {
1965 /* Do not allocate if just doing a lookup */
1966 VS_MAP_UNLOCK(vs);
1967 return (dp_offset_t) -1;
1968 }
1969 if (ps_map_extend(vs, cluster + 1)) {
1970 VS_MAP_UNLOCK(vs);
1971 return (dp_offset_t) -1;
1972 }
1973 }
1974
1975 /*
1976 * Look for the desired cluster. If the map is indirect, then we
1977 * have a two level lookup. First find the indirect block, then
1978 * find the actual cluster. If the indirect block has not yet
1979 * been allocated, then do so. If the cluster has not yet been
1980 * allocated, then do so.
1981 *
1982 * If any of the allocations fail, then return an error.
1983 * Don't allocate if just doing a lookup.
1984 */
1985 if (vs->vs_indirect) {
1986 long ind_block = cluster/CLMAP_ENTRIES;
1987
1988 /* Is the indirect block allocated? */
1989 vsmap = vs->vs_imap[ind_block];
1990 if (vsmap == NULL) {
1991 if (flag == CL_FIND) {
1992 VS_MAP_UNLOCK(vs);
1993 return (dp_offset_t) -1;
1994 }
1995
1996 /* Allocate the indirect block */
1997 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1998 if (vsmap == NULL) {
1999 VS_MAP_UNLOCK(vs);
2000 return (dp_offset_t) -1;
2001 }
2002 /* Initialize the cluster offsets */
2003 for (i = 0; i < CLMAP_ENTRIES; i++)
2004 VSM_CLR(vsmap[i]);
2005 vs->vs_imap[ind_block] = vsmap;
2006 }
2007 } else
2008 vsmap = vs->vs_dmap;
2009
2010 ASSERT(vsmap);
2011 vsmap += cluster%CLMAP_ENTRIES;
2012
2013 /*
2014 * At this point, vsmap points to the struct vs_map desired.
2015 *
2016 * Look in the map for the cluster, if there was an error on a
2017 * previous write, flag it and return. If it is not yet
2018 * allocated, then allocate it, if we're writing; if we're
2019 * doing a lookup and the cluster's not allocated, return error.
2020 */
2021 if (VSM_ISERR(*vsmap)) {
2022 clmap->cl_error = VSM_GETERR(*vsmap);
2023 VS_MAP_UNLOCK(vs);
2024 return (dp_offset_t) -1;
2025 } else if (VSM_ISCLR(*vsmap)) {
2026 int psindex;
2027
2028 if (flag == CL_FIND) {
2029 /*
2030 * If there's an error and the entry is clear, then
2031 * we've run out of swap space. Record the error
2032 * here and return.
2033 */
2034 if (error) {
2035 VSM_SETERR(*vsmap, error);
2036 }
2037 VS_MAP_UNLOCK(vs);
2038 return (dp_offset_t) -1;
2039 } else {
2040 /*
2041 * Attempt to allocate a cluster from the paging segment
2042 */
2043 newcl = ps_allocate_cluster(vs, &psindex,
2044 PAGING_SEGMENT_NULL);
2045 if (newcl == (dp_offset_t) -1) {
2046 VS_MAP_UNLOCK(vs);
2047 return (dp_offset_t) -1;
2048 }
2049 VSM_CLR(*vsmap);
2050 VSM_SETCLOFF(*vsmap, newcl);
2051 VSM_SETPS(*vsmap, psindex);
2052 }
2053 } else
2054 newcl = VSM_CLOFF(*vsmap);
2055
2056 /*
2057 * Fill in pertinent fields of the clmap
2058 */
2059 clmap->cl_ps = VSM_PS(*vsmap);
2060 clmap->cl_numpages = VSCLSIZE(vs);
2061 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2062
2063 /*
2064 * Byte offset in paging segment is byte offset to cluster plus
2065 * byte offset within cluster. It looks ugly, but should be
2066 * relatively quick.
2067 */
2068 ASSERT(trunc_page(offset) == offset);
2069 newcl = ptoa_32(newcl) << vs->vs_clshift;
2070 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2071 if (flag == CL_ALLOC) {
2072 /*
2073 * set bits in the allocation bitmap according to which
2074 * pages were requested. size is in bytes.
2075 */
2076 i = atop_32(newoff);
2077 while ((size > 0) && (i < VSCLSIZE(vs))) {
2078 VSM_SETALLOC(*vsmap, i);
2079 i++;
2080 size -= vm_page_size;
2081 }
2082 }
2083 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2084 if (newoff) {
2085 /*
2086 * Offset is not cluster aligned, so number of pages
2087 * and bitmaps must be adjusted
2088 */
2089 clmap->cl_numpages -= atop_32(newoff);
2090 CLMAP_SHIFT(clmap, vs);
2091 CLMAP_SHIFTALLOC(clmap, vs);
2092 }
2093
2094 /*
2095 *
2096 * The setting of valid bits and handling of write errors
2097 * must be done here, while we hold the lock on the map.
2098 * It logically should be done in ps_vs_write_complete().
2099 * The size and error information has been passed from
2100 * ps_vs_write_complete(). If the size parameter is non-zero,
2101 * then there is work to be done. If error is also non-zero,
2102 * then the error number is recorded in the cluster and the
2103 * entire cluster is in error.
2104 */
2105 if (size && flag == CL_FIND) {
2106 dp_offset_t off = (dp_offset_t) 0;
2107
2108 if (!error) {
2109 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2110 i++) {
2111 VSM_SETPG(*vsmap, i);
2112 size -= vm_page_size;
2113 }
2114 ASSERT(i <= VSCLSIZE(vs));
2115 } else {
2116 BS_STAT(clmap->cl_ps->ps_bs,
2117 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
2118 atop_32(size));
2119 off = VSM_CLOFF(*vsmap);
2120 VSM_SETERR(*vsmap, error);
2121 }
2122 /*
2123 * Deallocate cluster if error, and no valid pages
2124 * already present.
2125 */
2126 if (off != (dp_offset_t) 0)
2127 ps_deallocate_cluster(clmap->cl_ps, off);
2128 VS_MAP_UNLOCK(vs);
2129 return (dp_offset_t) 0;
2130 } else
2131 VS_MAP_UNLOCK(vs);
2132
2133 DP_DEBUG(DEBUG_VS_INTERNAL,
2134 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2135 newcl+newoff, (int) vs, (int) vsmap, flag));
2136 DP_DEBUG(DEBUG_VS_INTERNAL,
2137 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2138 (int) clmap->cl_ps, clmap->cl_numpages,
2139 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
2140
2141 return (newcl + newoff);
2142 }
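/*
 * Worked example for ps_clmap() (illustrative only; assumes 4 KB pages,
 * i.e. vm_page_shift == 12, and vs_clshift == 2, i.e. 16 KB clusters):
 *
 *	offset  = 0x6000
 *	cluster = atop_32(0x6000) >> 2           = 6 >> 2 = 1
 *	newoff  = 0x6000 & ((1 << (12+2)) - 1)   = 0x2000
 *
 * The page therefore lives in cluster 1 of the vstruct, 0x2000 bytes into
 * that cluster's backing-store allocation, and in the normal lookup or
 * allocation case the routine returns the cluster's byte offset within
 * the paging segment plus 0x2000.
 */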
2143
2144 void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */
2145
2146 void
2147 ps_clunmap(
2148 vstruct_t vs,
2149 dp_offset_t offset,
2150 dp_size_t length)
2151 {
2152 dp_offset_t cluster; /* The cluster number of offset */
2153 struct vs_map *vsmap;
2154
2155 VS_MAP_LOCK(vs);
2156
2157 /*
2158 * Loop through all clusters in this range, freeing paging segment
2159 * clusters and map entries as encountered.
2160 */
2161 while (length > 0) {
2162 dp_offset_t newoff;
2163 unsigned int i;
2164
2165 cluster = atop_32(offset) >> vs->vs_clshift;
2166 if (vs->vs_indirect) /* indirect map */
2167 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2168 else
2169 vsmap = vs->vs_dmap;
2170 if (vsmap == NULL) {
2171 VS_MAP_UNLOCK(vs);
2172 return;
2173 }
2174 vsmap += cluster%CLMAP_ENTRIES;
2175 if (VSM_ISCLR(*vsmap)) {
2176 length -= vm_page_size;
2177 offset += vm_page_size;
2178 continue;
2179 }
2180 /*
2181 * We've got a valid mapping. Clear it and deallocate
2182 * paging segment cluster pages.
2183 * Optimize for clearing an entire cluster.
2184 */
2185 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2186 /*
2187 * Not cluster aligned.
2188 */
2189 ASSERT(trunc_page(newoff) == newoff);
2190 i = atop_32(newoff);
2191 } else
2192 i = 0;
2193 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2194 VSM_CLRPG(*vsmap, i);
2195 VSM_CLRALLOC(*vsmap, i);
2196 length -= vm_page_size;
2197 offset += vm_page_size;
2198 i++;
2199 }
2200
2201 /*
2202 * If map entry is empty, clear and deallocate cluster.
2203 */
2204 if (!VSM_ALLOC(*vsmap)) {
2205 ps_deallocate_cluster(VSM_PS(*vsmap),
2206 VSM_CLOFF(*vsmap));
2207 VSM_CLR(*vsmap);
2208 }
2209 }
2210
2211 VS_MAP_UNLOCK(vs);
2212 }
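/*
 * Worked example for ps_clunmap() (illustrative only; same 4 KB page /
 * 16 KB cluster assumptions as above): a call with offset == 0x5000 and
 * length == 0x2000 lands in cluster 1 with an intra-cluster offset of
 * 0x1000, so pages 1 and 2 of that cluster have their "present" and
 * "allocated" bits cleared.  The backing-store cluster itself is returned
 * to the paging segment only when no allocated pages remain in the entry.
 */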
2213
2214 void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
2215
2216 void
2217 ps_vs_write_complete(
2218 vstruct_t vs,
2219 dp_offset_t offset,
2220 dp_size_t size,
2221 int error)
2222 {
2223 struct clmap clmap;
2224
2225 /*
2226 * Get the struct vsmap for this cluster.
2227 * Use CL_FIND (a lookup), even though it was written, because the
2228 * cluster MUST be present, unless there was an error
2229 * in the original ps_clmap (e.g. no space), in which
2230 * case, nothing happens.
2231 *
2232 * Must pass enough information to ps_clmap to allow it
2233 * to set the vs_map structure bitmap under lock.
2234 */
2235 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2236 }
2237
2238 void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */
2239
2240 void
2241 vs_cl_write_complete(
2242 vstruct_t vs,
2243 __unused paging_segment_t ps,
2244 dp_offset_t offset,
2245 __unused vm_offset_t addr,
2246 dp_size_t size,
2247 boolean_t async,
2248 int error)
2249 {
2250 // kern_return_t kr;
2251
2252 if (error) {
2253 /*
2254 * For internal objects, the error is recorded on a
2255 * per-cluster basis by ps_clmap() which is called
2256 * by ps_vs_write_complete() below.
2257 */
2258 dprintf(("write failed error = 0x%x\n", error));
2259 /* add upl_abort code here */
2260 } else
2261 GSTAT(global_stats.gs_pages_out += atop_32(size));
2262 /*
2263 * Notify the vstruct mapping code, so it can do its accounting.
2264 */
2265 ps_vs_write_complete(vs, offset, size, error);
2266
2267 if (async) {
2268 VS_LOCK(vs);
2269 ASSERT(vs->vs_async_pending > 0);
2270 vs->vs_async_pending -= size;
2271 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2272 vs->vs_waiting_async = FALSE;
2273 VS_UNLOCK(vs);
2274 thread_wakeup(&vs->vs_async_pending);
2275 } else {
2276 VS_UNLOCK(vs);
2277 }
2278 }
2279 }
2280
2281 #ifdef DEVICE_PAGING
2282 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2283
2284 kern_return_t
2285 device_write_reply(
2286 MACH_PORT_FACE reply_port,
2287 kern_return_t device_code,
2288 io_buf_len_t bytes_written)
2289 {
2290 struct vs_async *vsa;
2291
2292 vsa = (struct vs_async *)
2293 ((struct vstruct_alias *)(reply_port->alias))->vs;
2294
2295 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2296 device_code = KERN_FAILURE;
2297 }
2298
2299 vsa->vsa_error = device_code;
2300
2301
2302 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2303 if(vsa->vsa_flags & VSA_TRANSFER) {
2304 /* revisit when async disk segments redone */
2305 if(vsa->vsa_error) {
2306 /* need to consider error condition. re-write data or */
2307 /* throw it away here. */
2308 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2309 }
2310 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2311 vsa->vsa_size, vsa->vsa_error);
2312 } else {
2313 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2314 vsa->vsa_addr, vsa->vsa_size, TRUE,
2315 vsa->vsa_error);
2316 }
2317 VS_FREE_ASYNC(vsa);
2318
2319 return KERN_SUCCESS;
2320 }
2321
2322 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2323 kern_return_t
2324 device_write_reply_inband(
2325 MACH_PORT_FACE reply_port,
2326 kern_return_t return_code,
2327 io_buf_len_t bytes_written)
2328 {
2329 panic("device_write_reply_inband: illegal");
2330 return KERN_SUCCESS;
2331 }
2332
2333 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2334 kern_return_t
2335 device_read_reply(
2336 MACH_PORT_FACE reply_port,
2337 kern_return_t return_code,
2338 io_buf_ptr_t data,
2339 mach_msg_type_number_t dataCnt)
2340 {
2341 struct vs_async *vsa;
2342 vsa = (struct vs_async *)
2343 ((struct vstruct_alias *)(reply_port->alias))->vs;
2344 vsa->vsa_addr = (vm_offset_t)data;
2345 vsa->vsa_size = (vm_size_t)dataCnt;
2346 vsa->vsa_error = return_code;
2347 thread_wakeup(&vsa);
2348 return KERN_SUCCESS;
2349 }
2350
2351 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2352 kern_return_t
2353 device_read_reply_inband(
2354 MACH_PORT_FACE reply_port,
2355 kern_return_t return_code,
2356 io_buf_ptr_inband_t data,
2357 mach_msg_type_number_t dataCnt)
2358 {
2359 panic("device_read_reply_inband: illegal");
2360 return KERN_SUCCESS;
2361 }
2362
2363 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2364 kern_return_t
2365 device_read_reply_overwrite(
2366 MACH_PORT_FACE reply_port,
2367 kern_return_t return_code,
2368 io_buf_len_t bytes_read)
2369 {
2370 panic("device_read_reply_overwrite: illegal\n");
2371 return KERN_SUCCESS;
2372 }
2373
2374 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2375 kern_return_t
2376 device_open_reply(
2377 MACH_PORT_FACE reply_port,
2378 kern_return_t return_code,
2379 MACH_PORT_FACE device_port)
2380 {
2381 panic("device_open_reply: illegal\n");
2382 return KERN_SUCCESS;
2383 }
2384
2385 kern_return_t
2386 ps_read_device(
2387 paging_segment_t ps,
2388 dp_offset_t offset,
2389 vm_offset_t *bufferp,
2390 unsigned int size,
2391 unsigned int *residualp,
2392 int flags)
2393 {
2394 kern_return_t kr;
2395 recnum_t dev_offset;
2396 unsigned int bytes_wanted;
2397 unsigned int bytes_read;
2398 unsigned int total_read;
2399 vm_offset_t dev_buffer;
2400 vm_offset_t buf_ptr;
2401 unsigned int records_read;
2402 struct vs_async *vsa;
2403
2404 device_t device;
2405 vm_map_copy_t device_data = NULL;
2406 default_pager_thread_t *dpt = NULL;
2407
2408 device = dev_port_lookup(ps->ps_device);
2409 clustered_reads[atop_32(size)]++;
2410
2411 dev_offset = (ps->ps_offset +
2412 (offset >> (vm_page_shift - ps->ps_record_shift)));
2413 bytes_wanted = size;
2414 total_read = 0;
2415 *bufferp = (vm_offset_t)NULL;
2416
2417 do {
2418 vsa = VS_ALLOC_ASYNC();
2419 if (vsa) {
2420 vsa->vsa_vs = NULL;
2421 vsa->vsa_addr = 0;
2422 vsa->vsa_offset = 0;
2423 vsa->vsa_size = 0;
2424 vsa->vsa_ps = NULL;
2425 }
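		/*
		 * Note: the code below assumes VS_ALLOC_ASYNC() returned a
		 * valid vs_async; vsa->reply_port is dereferenced without
		 * re-checking vsa for NULL.
		 */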
2426 ip_lock(vsa->reply_port);
2427 vsa->reply_port->ip_sorights++;
2428 ip_reference(vsa->reply_port);
2429 ip_unlock(vsa->reply_port);
2430 kr = ds_device_read_common(device,
2431 vsa->reply_port,
2432 (mach_msg_type_name_t)
2433 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2434 (dev_mode_t) 0,
2435 dev_offset,
2436 bytes_wanted,
2437 (IO_READ | IO_CALL),
2438 (io_buf_ptr_t *) &dev_buffer,
2439 (mach_msg_type_number_t *) &bytes_read);
2440 if(kr == MIG_NO_REPLY) {
2441 assert_wait(&vsa, THREAD_UNINT);
2442 thread_block(THREAD_CONTINUE_NULL);
2443
2444 dev_buffer = vsa->vsa_addr;
2445 bytes_read = (unsigned int)vsa->vsa_size;
2446 kr = vsa->vsa_error;
2447 }
2448 VS_FREE_ASYNC(vsa);
2449 if (kr != KERN_SUCCESS || bytes_read == 0) {
2450 break;
2451 }
2452 total_read += bytes_read;
2453
2454 /*
2455 * If we got the entire range, use the returned dev_buffer.
2456 */
2457 if (bytes_read == size) {
2458 *bufferp = (vm_offset_t)dev_buffer;
2459 break;
2460 }
2461
2462 #if 1
2463 dprintf(("read only %d bytes out of %d\n",
2464 bytes_read, bytes_wanted));
2465 #endif
2466 if(dpt == NULL) {
2467 dpt = get_read_buffer();
2468 buf_ptr = dpt->dpt_buffer;
2469 *bufferp = (vm_offset_t)buf_ptr;
2470 }
2471 /*
2472 * Otherwise, copy the data into the provided buffer (*bufferp)
2473 * and append the rest of the range as it comes in.
2474 */
2475 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2476 buf_ptr += bytes_read;
2477 bytes_wanted -= bytes_read;
2478 records_read = (bytes_read >>
2479 (vm_page_shift - ps->ps_record_shift));
2480 dev_offset += records_read;
2481 DP_DEBUG(DEBUG_VS_INTERNAL,
2482 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2483 dev_buffer, bytes_read));
2484 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2485 != KERN_SUCCESS)
2486 Panic("dealloc buf");
2487 } while (bytes_wanted);
2488
2489 *residualp = size - total_read;
2490 if((dev_buffer != *bufferp) && (total_read != 0)) {
2491 vm_offset_t temp_buffer;
2492 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2493 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2494 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2495 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2496 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2497 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2498 (vm_map_copy_t *)&device_data, FALSE))
2499 panic("ps_read_device: cannot copyin locally provided buffer\n");
2500 }
2501 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2502 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2503 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2504 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2505 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2506 (vm_map_copy_t *)&device_data, FALSE))
2507 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2508 }
2509 else {
2510 device_data = NULL;
2511 }
2512 *bufferp = (vm_offset_t)device_data;
2513
2514 if(dpt != NULL) {
2515 /* Free the receive buffer */
2516 dpt->checked_out = 0;
2517 thread_wakeup(&dpt_array);
2518 }
2519 return KERN_SUCCESS;
2520 }
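/*
 * Worked example for the record-offset arithmetic above (illustrative
 * only; assumes 512-byte device records and 4 KB pages, so
 * ps_record_shift == 3):
 *
 *	dev_offset = ps_offset + (offset >> (12 - 3))
 *	           = ps_offset + offset / 512
 *
 * i.e. the byte offset within the paging segment is converted into a
 * device record number before being handed to ds_device_read_common().
 */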
2521
2522 kern_return_t
2523 ps_write_device(
2524 paging_segment_t ps,
2525 dp_offset_t offset,
2526 vm_offset_t addr,
2527 unsigned int size,
2528 struct vs_async *vsa)
2529 {
2530 recnum_t dev_offset;
2531 io_buf_len_t bytes_to_write, bytes_written;
2532 recnum_t records_written;
2533 kern_return_t kr;
2534 MACH_PORT_FACE reply_port;
2535
2536
2537
2538 clustered_writes[atop_32(size)]++;
2539
2540 dev_offset = (ps->ps_offset +
2541 (offset >> (vm_page_shift - ps->ps_record_shift)));
2542 bytes_to_write = size;
2543
2544 if (vsa) {
2545 /*
2546 * Asynchronous write.
2547 */
2548 reply_port = vsa->reply_port;
2549 ip_lock(reply_port);
2550 reply_port->ip_sorights++;
2551 ip_reference(reply_port);
2552 ip_unlock(reply_port);
2553 {
2554 device_t device;
2555 device = dev_port_lookup(ps->ps_device);
2556
2557 vsa->vsa_addr = addr;
2558 kr=ds_device_write_common(device,
2559 reply_port,
2560 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2561 (dev_mode_t) 0,
2562 dev_offset,
2563 (io_buf_ptr_t) addr,
2564 size,
2565 (IO_WRITE | IO_CALL),
2566 &bytes_written);
2567 }
2568 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2569 if (verbose)
2570 dprintf(("%s0x%x, addr=0x%x,"
2571 "size=0x%x,offset=0x%x\n",
2572 "device_write_request returned ",
2573 kr, addr, size, offset));
2574 BS_STAT(ps->ps_bs,
2575 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2576 /* do the completion notification to free resources */
2577 device_write_reply(reply_port, kr, 0);
2578 return PAGER_ERROR;
2579 }
2580 } else do {
2581 /*
2582 * Synchronous write.
2583 */
2584 {
2585 device_t device;
2586 device = dev_port_lookup(ps->ps_device);
2587 kr=ds_device_write_common(device,
2588 IP_NULL, 0,
2589 (dev_mode_t) 0,
2590 dev_offset,
2591 (io_buf_ptr_t) addr,
2592 size,
2593 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2594 &bytes_written);
2595 }
2596 if (kr != KERN_SUCCESS) {
2597 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2598 "device_write returned ",
2599 kr, addr, size, offset));
2600 BS_STAT(ps->ps_bs,
2601 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2602 return PAGER_ERROR;
2603 }
2604 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2605 Panic("fragmented write");
2606 records_written = (bytes_written >>
2607 (vm_page_shift - ps->ps_record_shift));
2608 dev_offset += records_written;
2609 #if 1
2610 if (bytes_written != bytes_to_write) {
2611 dprintf(("wrote only %d bytes out of %d\n",
2612 bytes_written, bytes_to_write));
2613 }
2614 #endif
2615 bytes_to_write -= bytes_written;
2616 addr += bytes_written;
2617 } while (bytes_to_write > 0);
2618
2619 return PAGER_SUCCESS;
2620 }
2621
2622
2623 #else /* !DEVICE_PAGING */
2624
2625 kern_return_t
2626 ps_read_device(
2627 __unused paging_segment_t ps,
2628 __unused dp_offset_t offset,
2629 __unused vm_offset_t *bufferp,
2630 __unused unsigned int size,
2631 __unused unsigned int *residualp,
2632 __unused int flags)
2633 {
2634 panic("ps_read_device not supported");
2635 return KERN_FAILURE;
2636 }
2637
2638 kern_return_t
2639 ps_write_device(
2640 __unused paging_segment_t ps,
2641 __unused dp_offset_t offset,
2642 __unused vm_offset_t addr,
2643 __unused unsigned int size,
2644 __unused struct vs_async *vsa)
2645 {
2646 panic("ps_write_device not supported");
2647 return KERN_FAILURE;
2648 }
2649
2650 #endif /* DEVICE_PAGING */
2651 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2652
2653 void
2654 pvs_object_data_provided(
2655 __unused vstruct_t vs,
2656 __unused upl_t upl,
2657 __unused upl_offset_t offset,
2658 upl_size_t size)
2659 {
2660
2661 DP_DEBUG(DEBUG_VS_INTERNAL,
2662 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2663 upl, offset, size));
2664
2665 ASSERT(size > 0);
2666 GSTAT(global_stats.gs_pages_in += atop_32(size));
2667
2668
2669 #if USE_PRECIOUS
2670 ps_clunmap(vs, offset, size);
2671 #endif /* USE_PRECIOUS */
2672
2673 }
2674
2675 static memory_object_offset_t last_start;
2676 static vm_size_t last_length;
2677
2678 kern_return_t
2679 pvs_cluster_read(
2680 vstruct_t vs,
2681 dp_offset_t vs_offset,
2682 dp_size_t cnt,
2683 void *fault_info)
2684 {
2685 kern_return_t error = KERN_SUCCESS;
2686 unsigned int size;
2687 unsigned int residual;
2688 unsigned int request_flags;
2689 int io_flags = 0;
2690 int seg_index;
2691 int pages_in_cl;
2692 int cl_size;
2693 int cl_mask;
2694 int cl_index;
2695 unsigned int xfer_size;
2696 dp_offset_t orig_vs_offset;
2697 dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2698 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2699 struct clmap clmap;
2700 upl_t upl;
2701 unsigned int page_list_count;
2702 memory_object_offset_t cluster_start;
2703 vm_size_t cluster_length;
2704 uint32_t io_streaming;
2705
2706 pages_in_cl = 1 << vs->vs_clshift;
2707 cl_size = pages_in_cl * vm_page_size;
2708 cl_mask = cl_size - 1;
2709
2710 #if USE_PRECIOUS
2711 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2712 #else
2713 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2714 #endif
2715 cl_index = (vs_offset & cl_mask) / vm_page_size;
2716
2717 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2718 !CLMAP_ISSET(clmap, cl_index)) {
2719 /*
2720 * the needed page doesn't exist in the backing store...
2721 * we don't want to try to do any I/O, just abort the
2722 * page and let the fault handler provide a zero-fill
2723 */
2724 if (cnt == 0) {
2725 /*
2726 * The caller was just poking at us to see if
2727 * the page has been paged out. No need to
2728 * mess with the page at all.
2729 * Just let the caller know we don't have that page.
2730 */
2731 return KERN_FAILURE;
2732 }
2733
2734 page_list_count = 0;
2735
2736 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2737 PAGE_SIZE, PAGE_SIZE,
2738 &upl, NULL, &page_list_count,
2739 request_flags);
2740
2741 if (clmap.cl_error)
2742 upl_abort(upl, UPL_ABORT_ERROR);
2743 else
2744 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2745 upl_deallocate(upl);
2746
2747 return KERN_SUCCESS;
2748 }
2749
2750 if (cnt == 0) {
2751 /*
2752 * The caller was just poking at us to see if
2753 * the page has been paged out. No need to
2754 * mess with the page at all.
2755 * Just let the caller know we do have that page.
2756 */
2757 return KERN_SUCCESS;
2758 }
2759
2760 assert(dp_encryption_inited);
2761 if (dp_encryption) {
2762 /*
2763 * ENCRYPTED SWAP:
2764 * request that the UPL be prepared for
2765 * decryption.
2766 */
2767 request_flags |= UPL_ENCRYPT;
2768 }
2769 orig_vs_offset = vs_offset;
2770
2771 assert(cnt != 0);
2772 cnt = VM_SUPER_CLUSTER;
2773 cluster_start = (memory_object_offset_t) vs_offset;
2774 cluster_length = (vm_size_t) cnt;
2775 io_streaming = 0;
2776
2777 /*
2778 * determine how big a speculative I/O we should try for...
2779 */
2780 if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
2781 assert(vs_offset >= (dp_offset_t) cluster_start &&
2782 vs_offset < (dp_offset_t) (cluster_start + cluster_length));
2783 vs_offset = (dp_offset_t) cluster_start;
2784 cnt = (dp_size_t) cluster_length;
2785 } else {
2786 cluster_length = PAGE_SIZE;
2787 cnt = PAGE_SIZE;
2788 }
2789
2790 if (io_streaming)
2791 io_flags |= UPL_IOSTREAMING;
2792
2793 last_start = cluster_start;
2794 last_length = cluster_length;
2795
2796 /*
2797 * This loop will be executed multiple times until the entire
2798 * range has been looked at or we issue an I/O... if the request spans cluster
2799 * boundaries, the clusters will be checked for logical continuity;
2800 * if contiguous, the I/O request will span multiple clusters...
2801 * at most only 1 I/O will be issued... it will encompass the original offset
2802 */
2803 while (cnt && error == KERN_SUCCESS) {
2804 int ps_info_valid;
2805
2806 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
2807 size = VM_SUPER_CLUSTER;
2808 size -= vs_offset & cl_mask;
2809 } else if (cnt > VM_SUPER_CLUSTER)
2810 size = VM_SUPER_CLUSTER;
2811 else
2812 size = cnt;
2813
2814 cnt -= size;
2815
2816 ps_info_valid = 0;
2817 seg_index = 0;
2818
2819 while (size > 0 && error == KERN_SUCCESS) {
2820 unsigned int abort_size;
2821 int failed_size;
2822 int beg_pseg;
2823 int beg_indx;
2824 dp_offset_t cur_offset;
2825
2826 if ( !ps_info_valid) {
2827 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2828 psp[seg_index] = CLMAP_PS(clmap);
2829 ps_info_valid = 1;
2830 }
2831 /*
2832 * skip over unallocated physical segments
2833 */
2834 if (ps_offset[seg_index] == (dp_offset_t) -1) {
2835 abort_size = cl_size - (vs_offset & cl_mask);
2836 abort_size = MIN(abort_size, size);
2837
2838 size -= abort_size;
2839 vs_offset += abort_size;
2840
2841 seg_index++;
2842 ps_info_valid = 0;
2843
2844 continue;
2845 }
2846 cl_index = (vs_offset & cl_mask) / vm_page_size;
2847
2848 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2849 /*
2850 * skip over unallocated pages
2851 */
2852 if (CLMAP_ISSET(clmap, cl_index))
2853 break;
2854 abort_size += vm_page_size;
2855 }
2856 if (abort_size) {
2857 size -= abort_size;
2858 vs_offset += abort_size;
2859
2860 if (cl_index == pages_in_cl) {
2861 /*
2862 * if we're at the end of this physical cluster
2863 * then bump to the next one and continue looking
2864 */
2865 seg_index++;
2866 ps_info_valid = 0;
2867
2868 continue;
2869 }
2870 if (size == 0)
2871 break;
2872 }
2873 /*
2874 * remember the starting point of the first allocated page
2875 * for the I/O we're about to issue
2876 */
2877 beg_pseg = seg_index;
2878 beg_indx = cl_index;
2879 cur_offset = vs_offset;
2880
2881 /*
2882 * calculate the size of the I/O that we can do...
2883 * this may span multiple physical segments if
2884 * they are contiguous
2885 */
2886 for (xfer_size = 0; xfer_size < size; ) {
2887
2888 while (cl_index < pages_in_cl && xfer_size < size) {
2889 /*
2890 * accumulate allocated pages within
2891 * a physical segment
2892 */
2893 if (CLMAP_ISSET(clmap, cl_index)) {
2894 xfer_size += vm_page_size;
2895 cur_offset += vm_page_size;
2896 cl_index++;
2897
2898 BS_STAT(psp[seg_index]->ps_bs,
2899 psp[seg_index]->ps_bs->bs_pages_in++);
2900 } else
2901 break;
2902 }
2903 if (cl_index < pages_in_cl || xfer_size >= size) {
2904 /*
2905 * we've hit an unallocated page or
2906 * the end of this request... see if
2907 * it's time to fire the I/O
2908 */
2909 break;
2910 }
2911 /*
2912 * we've hit the end of the current physical
2913 * segment and there's more to do, so try
2914 * moving to the next one
2915 */
2916 seg_index++;
2917
2918 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2919 psp[seg_index] = CLMAP_PS(clmap);
2920 ps_info_valid = 1;
2921
2922 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2923 /*
2924 * if the physical segment we're about
2925 * to step into is not contiguous to
2926 * the one we're currently in, or it's
2927 * in a different paging file, or
2928 * it hasn't been allocated....
2929 * we stop this run and go check
2930 * to see if it's time to fire the I/O
2931 */
2932 break;
2933 }
2934 /*
2935 * start with first page of the next physical
2936 * segment
2937 */
2938 cl_index = 0;
2939 }
2940 if (xfer_size == 0) {
2941 /*
2942 * no I/O to generate for this segment
2943 */
2944 continue;
2945 }
2946 if (cur_offset <= orig_vs_offset) {
2947 /*
2948 * we've hit a hole in our speculative cluster
2949 * before the offset that we're really after...
2950 * don't issue the I/O since it doesn't encompass
2951 * the original offset and we're looking to only
2952 * pull in the speculative pages if they can be
2953 * made part of a single I/O
2954 */
2955 size -= xfer_size;
2956 vs_offset += xfer_size;
2957
2958 continue;
2959 }
2960 /*
2961 * we have a contiguous range of allocated pages
2962 * to read from that encompasses the original offset
2963 */
2964 page_list_count = 0;
2965 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2966 xfer_size, xfer_size,
2967 &upl, NULL, &page_list_count,
2968 request_flags | UPL_SET_INTERNAL | UPL_NOBLOCK);
2969
2970 error = ps_read_file(psp[beg_pseg],
2971 upl, (upl_offset_t) 0,
2972 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
2973 xfer_size, &residual, io_flags);
2974
2975 failed_size = 0;
2976
2977 /*
2978 * Adjust counts and send response to VM. Optimize
2979 * for the common case, i.e. no error and/or partial
2980 * data. If there was an error, then we need to error
2981 * the entire range, even if some data was successfully
2982 * read. If there was a partial read we may supply some
2983 * data and may error some as well. In all cases the
2984 * VM must receive some notification for every page
2985 * in the range.
2986 */
2987 if ((error == KERN_SUCCESS) && (residual == 0)) {
2988 /*
2989 * Got everything we asked for, supply the data
2990 * to the VM. Note that as a side effect of
2991 * supplying the data, the buffer holding the
2992 * supplied data is deallocated from the pager's
2993 * address space.
2994 */
2995 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
2996 } else {
2997 failed_size = xfer_size;
2998
2999 if (error == KERN_SUCCESS) {
3000 if (residual == xfer_size) {
3001 /*
3002 * If a read operation returns no error
3003 * and no data moved, we turn it into
3004 * an error, assuming we're reading at
3005 * or beyond EOF.
3006 * Fall through and error the entire range.
3007 */
3008 error = KERN_FAILURE;
3009 } else {
3010 /*
3011 * Otherwise, we have partial read. If
3012 * the part read is a integral number
3013 * of pages supply it. Otherwise round
3014 * it up to a page boundary, zero fill
3015 * the unread part, and supply it.
3016 * Fall through and error the remainder
3017 * of the range, if any.
3018 */
3019 int fill;
3020 unsigned int lsize;
3021
3022 fill = residual & ~vm_page_size;
3023 lsize = (xfer_size - residual) + fill;
3024
3025 pvs_object_data_provided(vs, upl, vs_offset, lsize);
3026
3027 if (lsize < xfer_size) {
3028 failed_size = xfer_size - lsize;
3029 error = KERN_FAILURE;
3030 }
3031 }
3032 }
3033 }
3034 if (error != KERN_SUCCESS) {
3035 /*
3036 * There was an error in some part of the range, tell
3037 * the VM. Note that error is explicitly checked again
3038 * since it can be modified above.
3039 */
3040 BS_STAT(psp[beg_pseg]->ps_bs,
3041 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
3042 }
3043 /*
3044 * we've issued a single I/O that encompassed the original offset
3045 * at this point we either met our speculative request length or
3046 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3047 * not present or not physically contiguous to the previous one), so
3048 * we're done issuing I/O at this point
3049 */
3050 return (error);
3051 }
3052 }
3053 return error;
3054 }
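/*
 * Illustrative outline of pvs_cluster_read() (not part of the original
 * source):
 *
 *	1. probe the cluster map for the faulting page; if it is absent,
 *	   report KERN_FAILURE when the caller is only probing (cnt == 0),
 *	   otherwise abort a one-page UPL and return
 *	2. ask memory_object_cluster_size() how large a speculative read
 *	   is worth attempting around the faulting offset
 *	3. walk the cluster map, skipping unallocated clusters and pages,
 *	   accumulating a run of contiguous allocated pages that covers
 *	   the original offset
 *	4. issue a single ps_read_file() for that run and supply (or
 *	   error) the pages via pvs_object_data_provided()
 */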
3055
3056 int vs_do_async_write = 1;
3057
3058 kern_return_t
3059 vs_cluster_write(
3060 vstruct_t vs,
3061 upl_t internal_upl,
3062 upl_offset_t offset,
3063 upl_size_t cnt,
3064 boolean_t dp_internal,
3065 int flags)
3066 {
3067 upl_size_t transfer_size;
3068 int error = 0;
3069 struct clmap clmap;
3070
3071 dp_offset_t actual_offset; /* Offset within paging segment */
3072 paging_segment_t ps;
3073 dp_offset_t mobj_base_addr;
3074 dp_offset_t mobj_target_addr;
3075
3076 upl_t upl;
3077 upl_page_info_t *pl;
3078 int page_index;
3079 int list_size;
3080 int pages_in_cl;
3081 unsigned int cl_size;
3082 int base_index;
3083 unsigned int seg_size;
3084 unsigned int upl_offset_in_object;
3085
3086 pages_in_cl = 1 << vs->vs_clshift;
3087 cl_size = pages_in_cl * vm_page_size;
3088
3089 if (!dp_internal) {
3090 unsigned int page_list_count;
3091 int request_flags;
3092 unsigned int super_size;
3093 int first_dirty;
3094 int num_dirty;
3095 int num_of_pages;
3096 int seg_index;
3097 upl_offset_t upl_offset;
3098 dp_offset_t seg_offset;
3099 dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
3100 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
3101
3102
3103 if (bs_low) {
3104 super_size = cl_size;
3105
3106 request_flags = UPL_NOBLOCK |
3107 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3108 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3109 } else {
3110 super_size = VM_SUPER_CLUSTER;
3111
3112 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3113 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3114 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3115 }
3116
3117 if (!dp_encryption_inited) {
3118 /*
3119 * ENCRYPTED SWAP:
3120 * Once we've started using swap, we
3121 * can't change our mind on whether
3122 * it needs to be encrypted or
3123 * not.
3124 */
3125 dp_encryption_inited = TRUE;
3126 }
3127 if (dp_encryption) {
3128 /*
3129 * ENCRYPTED SWAP:
3130 * request that the UPL be prepared for
3131 * encryption.
3132 */
3133 request_flags |= UPL_ENCRYPT;
3134 flags |= UPL_PAGING_ENCRYPTED;
3135 }
3136
3137 page_list_count = 0;
3138 memory_object_super_upl_request(vs->vs_control,
3139 (memory_object_offset_t)offset,
3140 cnt, super_size,
3141 &upl, NULL, &page_list_count,
3142 request_flags | UPL_FOR_PAGEOUT);
3143
3144 /*
3145 * The default pager does not handle objects larger than
3146 * 4GB, so it does not deal with offsets that don't fit in
3147 * 32 bits. Cast down upl->offset now and make sure we
3148 * did not lose any valuable bits.
3149 */
3150 upl_offset_in_object = (unsigned int) upl->offset;
3151 assert(upl->offset == upl_offset_in_object);
3152
3153 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3154
3155 seg_size = cl_size - (upl_offset_in_object % cl_size);
3156 upl_offset = upl_offset_in_object & ~(cl_size - 1);
3157
3158 for (seg_index = 0, transfer_size = upl->size;
3159 transfer_size > 0; ) {
3160 ps_offset[seg_index] =
3161 ps_clmap(vs,
3162 upl_offset,
3163 &clmap, CL_ALLOC,
3164 cl_size, 0);
3165
3166 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3167 upl_abort(upl, 0);
3168 upl_deallocate(upl);
3169
3170 return KERN_FAILURE;
3171
3172 }
3173 psp[seg_index] = CLMAP_PS(clmap);
3174
3175 if (transfer_size > seg_size) {
3176 transfer_size -= seg_size;
3177 upl_offset += cl_size;
3178 seg_size = cl_size;
3179 seg_index++;
3180 } else
3181 transfer_size = 0;
3182 }
3183 /*
3184 * Ignore any non-present pages at the end of the
3185 * UPL.
3186 */
3187 for (page_index = upl->size / vm_page_size; page_index > 0;)
3188 if (UPL_PAGE_PRESENT(pl, --page_index))
3189 break;
3190 num_of_pages = page_index + 1;
3191
3192 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
3193
3194 for (page_index = 0; page_index < num_of_pages; ) {
3195 /*
3196 * skip over non-dirty pages
3197 */
3198 for ( ; page_index < num_of_pages; page_index++) {
3199 if (UPL_DIRTY_PAGE(pl, page_index)
3200 || UPL_PRECIOUS_PAGE(pl, page_index))
3201 /*
3202 * this is a page we need to write
3203 * go see if we can buddy it up with
3204 * others that are contiguous to it
3205 */
3206 break;
3207 /*
3208 * if the page is not dirty, but present, we
3209 * need to commit it... This is an unusual
3210 * case since we only asked for dirty pages
3211 */
3212 if (UPL_PAGE_PRESENT(pl, page_index)) {
3213 boolean_t empty = FALSE;
3214 upl_commit_range(upl,
3215 page_index * vm_page_size,
3216 vm_page_size,
3217 UPL_COMMIT_NOTIFY_EMPTY,
3218 pl,
3219 page_list_count,
3220 &empty);
3221 if (empty) {
3222 assert(page_index ==
3223 num_of_pages - 1);
3224 upl_deallocate(upl);
3225 }
3226 }
3227 }
3228 if (page_index == num_of_pages)
3229 /*
3230 * no more pages to look at, we're out of here
3231 */
3232 break;
3233
3234 /*
3235 * gather up contiguous dirty pages... we have at
3236 * least 1, otherwise we would have bailed above;
3237 * make sure that each physical segment that we step
3238 * into is contiguous to the one we're currently in
3239 * if it's not, we have to stop and write what we have
3240 */
3241 for (first_dirty = page_index;
3242 page_index < num_of_pages; ) {
3243 if ( !UPL_DIRTY_PAGE(pl, page_index)
3244 && !UPL_PRECIOUS_PAGE(pl, page_index))
3245 break;
3246 page_index++;
3247 /*
3248 * if we just looked at the last page in the UPL
3249 * we don't need to check for physical segment
3250 * continuity
3251 */
3252 if (page_index < num_of_pages) {
3253 int cur_seg;
3254 int nxt_seg;
3255
3256 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3257 nxt_seg = (base_index + page_index)/pages_in_cl;
3258
3259 if (cur_seg != nxt_seg) {
3260 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3261 /*
3262 * if the segment we're about
3263 * to step into is not
3264 * contiguous to the one we're
3265 * currently in, or it's in a
3266 * different paging file....
3267 * we stop here and generate
3268 * the I/O
3269 */
3270 break;
3271 }
3272 }
3273 }
3274 num_dirty = page_index - first_dirty;
3275
3276 if (num_dirty) {
3277 upl_offset = first_dirty * vm_page_size;
3278 transfer_size = num_dirty * vm_page_size;
3279
3280 while (transfer_size) {
3281
3282 if ((seg_size = cl_size -
3283 ((upl_offset_in_object +
3284 upl_offset) % cl_size))
3285 > transfer_size)
3286 seg_size = transfer_size;
3287
3288 ps_vs_write_complete(
3289 vs,
3290 (upl_offset_in_object +
3291 upl_offset),
3292 seg_size, error);
3293
3294 transfer_size -= seg_size;
3295 upl_offset += seg_size;
3296 }
3297 upl_offset = first_dirty * vm_page_size;
3298 transfer_size = num_dirty * vm_page_size;
3299
3300 seg_index = (base_index + first_dirty) / pages_in_cl;
3301 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
3302
3303 error = ps_write_file(psp[seg_index],
3304 upl, upl_offset,
3305 ps_offset[seg_index]
3306 + seg_offset,
3307 transfer_size, flags);
3308 } else {
3309 boolean_t empty = FALSE;
3310 upl_abort_range(upl,
3311 first_dirty * vm_page_size,
3312 num_dirty * vm_page_size,
3313 UPL_ABORT_NOTIFY_EMPTY,
3314 &empty);
3315 if (empty) {
3316 assert(page_index == num_of_pages);
3317 upl_deallocate(upl);
3318 }
3319 }
3320 }
3321
3322 } else {
3323 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
3324 list_size = cnt;
3325
3326 page_index = 0;
3327 /* The caller provides a mapped_data which is derived */
3328 /* from a temporary object. The targeted pages are */
3329 /* guaranteed to be set at offset 0 in the mapped_data */
3330 /* The actual offset however must still be derived */
3331 /* from the offset in the vs in question */
3332 mobj_base_addr = offset;
3333 mobj_target_addr = mobj_base_addr;
3334
3335 for (transfer_size = list_size; transfer_size != 0;) {
3336 actual_offset = ps_clmap(vs, mobj_target_addr,
3337 &clmap, CL_ALLOC,
3338 transfer_size < cl_size ?
3339 transfer_size : cl_size, 0);
3340 if(actual_offset == (dp_offset_t) -1) {
3341 error = 1;
3342 break;
3343 }
3344 cnt = MIN(transfer_size,
3345 (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
3346 ps = CLMAP_PS(clmap);
3347 /* Assume that the caller has given us contiguous */
3348 /* pages */
3349 if(cnt) {
3350 ps_vs_write_complete(vs, mobj_target_addr,
3351 cnt, error);
3352 error = ps_write_file(ps, internal_upl,
3353 0, actual_offset,
3354 cnt, flags);
3355 if (error)
3356 break;
3357 }
3358 if (error)
3359 break;
3360 actual_offset += cnt;
3361 mobj_target_addr += cnt;
3362 transfer_size -= cnt;
3363 cnt = 0;
3364
3365 if (error)
3366 break;
3367 }
3368 }
3369 if(error)
3370 return KERN_FAILURE;
3371 else
3372 return KERN_SUCCESS;
3373 }
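/*
 * Worked example for the cluster segmentation in vs_cluster_write()
 * (illustrative only; assumes 4 KB pages and vs_clshift == 2, so
 * cl_size == 0x4000):
 *
 *	upl_offset_in_object = 0x5000
 *	first seg_size       = 0x4000 - (0x5000 % 0x4000) = 0x3000
 *
 * so the first 12 KB of the UPL completes cluster 1; every subsequent
 * segment is a full 16 KB cluster until the transfer is exhausted.
 */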
3374
3375 vm_size_t
3376 ps_vstruct_allocated_size(
3377 vstruct_t vs)
3378 {
3379 int num_pages;
3380 struct vs_map *vsmap;
3381 unsigned int i, j, k;
3382
3383 num_pages = 0;
3384 if (vs->vs_indirect) {
3385 /* loop on indirect maps */
3386 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3387 vsmap = vs->vs_imap[i];
3388 if (vsmap == NULL)
3389 continue;
3390 /* loop on clusters in this indirect map */
3391 for (j = 0; j < CLMAP_ENTRIES; j++) {
3392 if (VSM_ISCLR(vsmap[j]) ||
3393 VSM_ISERR(vsmap[j]))
3394 continue;
3395 /* loop on pages in this cluster */
3396 for (k = 0; k < VSCLSIZE(vs); k++) {
3397 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3398 num_pages++;
3399 }
3400 }
3401 }
3402 } else {
3403 vsmap = vs->vs_dmap;
3404 if (vsmap == NULL)
3405 return 0;
3406 /* loop on clusters in the direct map */
3407 for (j = 0; j < CLMAP_ENTRIES; j++) {
3408 if (VSM_ISCLR(vsmap[j]) ||
3409 VSM_ISERR(vsmap[j]))
3410 continue;
3411 /* loop on pages in this cluster */
3412 for (k = 0; k < VSCLSIZE(vs); k++) {
3413 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3414 num_pages++;
3415 }
3416 }
3417 }
3418
3419 return ptoa_32(num_pages);
3420 }
3421
3422 unsigned int
3423 ps_vstruct_allocated_pages(
3424 vstruct_t vs,
3425 default_pager_page_t *pages,
3426 unsigned int pages_size)
3427 {
3428 unsigned int num_pages;
3429 struct vs_map *vsmap;
3430 dp_offset_t offset;
3431 unsigned int i, j, k;
3432
3433 num_pages = 0;
3434 offset = 0;
3435 if (vs->vs_indirect) {
3436 /* loop on indirect maps */
3437 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3438 vsmap = vs->vs_imap[i];
3439 if (vsmap == NULL) {
3440 offset += (vm_page_size * CLMAP_ENTRIES *
3441 VSCLSIZE(vs));
3442 continue;
3443 }
3444 /* loop on clusters in this indirect map */
3445 for (j = 0; j < CLMAP_ENTRIES; j++) {
3446 if (VSM_ISCLR(vsmap[j]) ||
3447 VSM_ISERR(vsmap[j])) {
3448 offset += vm_page_size * VSCLSIZE(vs);
3449 continue;
3450 }
3451 /* loop on pages in this cluster */
3452 for (k = 0; k < VSCLSIZE(vs); k++) {
3453 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3454 num_pages++;
3455 if (num_pages < pages_size)
3456 pages++->dpp_offset =
3457 offset;
3458 }
3459 offset += vm_page_size;
3460 }
3461 }
3462 }
3463 } else {
3464 vsmap = vs->vs_dmap;
3465 if (vsmap == NULL)
3466 return 0;
3467 /* loop on clusters in the direct map */
3468 for (j = 0; j < CLMAP_ENTRIES; j++) {
3469 if (VSM_ISCLR(vsmap[j]) ||
3470 VSM_ISERR(vsmap[j])) {
3471 offset += vm_page_size * VSCLSIZE(vs);
3472 continue;
3473 }
3474 /* loop on pages in this cluster */
3475 for (k = 0; k < VSCLSIZE(vs); k++) {
3476 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3477 num_pages++;
3478 if (num_pages < pages_size)
3479 pages++->dpp_offset = offset;
3480 }
3481 offset += vm_page_size;
3482 }
3483 }
3484 }
3485
3486 return num_pages;
3487 }
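/*
 * Bookkeeping note for ps_vstruct_allocated_pages() above (illustrative
 * only): a missing indirect block advances the running offset by
 *
 *	vm_page_size * CLMAP_ENTRIES * VSCLSIZE(vs)
 *
 * bytes (a whole indirect block's worth of clusters), a clear or errored
 * cluster by vm_page_size * VSCLSIZE(vs), and each page within a present
 * cluster by vm_page_size, with the offsets of pages whose VSM_BMAP()
 * bit is set reported back to the caller.
 */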
3488
3489
3490 kern_return_t
3491 ps_vstruct_transfer_from_segment(
3492 vstruct_t vs,
3493 paging_segment_t segment,
3494 upl_t upl)
3495 {
3496 struct vs_map *vsmap;
3497 // struct vs_map old_vsmap;
3498 // struct vs_map new_vsmap;
3499 unsigned int i, j;
3500
3501 VS_LOCK(vs); /* block all work on this vstruct */
3502 /* can't allow the normal multiple write */
3503 /* semantic because writes may conflict */
3504 vs->vs_xfer_pending = TRUE;
3505 vs_wait_for_sync_writers(vs);
3506 vs_start_write(vs);
3507 vs_wait_for_readers(vs);
3508 /* we will unlock the vs to allow other writes while transferring */
3509 /* and will be guaranteed the persistence of the vs struct */
3510 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3511 /* vs_async_pending */
3512 /* OK we now have guaranteed no other parties are accessing this */
3513 /* vs. Now that we are also supporting simple lock versions of */
3514 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3515 /* our purpose in holding it before was the multiple write case */
3516 /* we now use the boolean xfer_pending to do that. We can use */
3517 /* a boolean instead of a count because we have guaranteed single */
3518 /* file access to this code in its caller */
3519 VS_UNLOCK(vs);
3520 vs_changed:
3521 if (vs->vs_indirect) {
3522 unsigned int vsmap_size;
3523 int clmap_off;
3524 /* loop on indirect maps */
3525 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3526 vsmap = vs->vs_imap[i];
3527 if (vsmap == NULL)
3528 continue;
3529 /* loop on clusters in this indirect map */
3530 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3531 VSCLSIZE(vs) * i);
3532 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3533 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3534 else
3535 vsmap_size = CLMAP_ENTRIES;
3536 for (j = 0; j < vsmap_size; j++) {
3537 if (VSM_ISCLR(vsmap[j]) ||
3538 VSM_ISERR(vsmap[j]) ||
3539 (VSM_PS(vsmap[j]) != segment))
3540 continue;
3541 if(vs_cluster_transfer(vs,
3542 (vm_page_size * (j << vs->vs_clshift))
3543 + clmap_off,
3544 vm_page_size << vs->vs_clshift,
3545 upl)
3546 != KERN_SUCCESS) {
3547 VS_LOCK(vs);
3548 vs->vs_xfer_pending = FALSE;
3549 VS_UNLOCK(vs);
3550 vs_finish_write(vs);
3551 return KERN_FAILURE;
3552 }
3553 /* allow other readers/writers during transfer*/
3554 VS_LOCK(vs);
3555 vs->vs_xfer_pending = FALSE;
3556 VS_UNLOCK(vs);
3557 vs_finish_write(vs);
3558 VS_LOCK(vs);
3559 vs->vs_xfer_pending = TRUE;
3560 vs_wait_for_sync_writers(vs);
3561 vs_start_write(vs);
3562 vs_wait_for_readers(vs);
3563 VS_UNLOCK(vs);
3564 if (!(vs->vs_indirect)) {
3565 goto vs_changed;
3566 }
3567 }
3568 }
3569 } else {
3570 vsmap = vs->vs_dmap;
3571 if (vsmap == NULL) {
3572 VS_LOCK(vs);
3573 vs->vs_xfer_pending = FALSE;
3574 VS_UNLOCK(vs);
3575 vs_finish_write(vs);
3576 return KERN_SUCCESS;
3577 }
3578 /* loop on clusters in the direct map */
3579 for (j = 0; j < vs->vs_size; j++) {
3580 if (VSM_ISCLR(vsmap[j]) ||
3581 VSM_ISERR(vsmap[j]) ||
3582 (VSM_PS(vsmap[j]) != segment))
3583 continue;
3584 if(vs_cluster_transfer(vs,
3585 vm_page_size * (j << vs->vs_clshift),
3586 vm_page_size << vs->vs_clshift,
3587 upl) != KERN_SUCCESS) {
3588 VS_LOCK(vs);
3589 vs->vs_xfer_pending = FALSE;
3590 VS_UNLOCK(vs);
3591 vs_finish_write(vs);
3592 return KERN_FAILURE;
3593 }
3594 /* allow other readers/writers during transfer*/
3595 VS_LOCK(vs);
3596 vs->vs_xfer_pending = FALSE;
3597 VS_UNLOCK(vs);
3598 vs_finish_write(vs);
3599 VS_LOCK(vs);
3600 vs->vs_xfer_pending = TRUE;
3601 vs_wait_for_sync_writers(vs);
3602 vs_start_write(vs);
3603 vs_wait_for_readers(vs);
3604 VS_UNLOCK(vs);
3605 if (vs->vs_indirect) {
3606 goto vs_changed;
3607 }
3608 }
3609 }
3610
3611 VS_LOCK(vs);
3612 vs->vs_xfer_pending = FALSE;
3613 VS_UNLOCK(vs);
3614 vs_finish_write(vs);
3615 return KERN_SUCCESS;
3616 }
3617
3618
3619
3620 vs_map_t
3621 vs_get_map_entry(
3622 vstruct_t vs,
3623 dp_offset_t offset)
3624 {
3625 struct vs_map *vsmap;
3626 dp_offset_t cluster;
3627
3628 cluster = atop_32(offset) >> vs->vs_clshift;
3629 if (vs->vs_indirect) {
3630 long ind_block = cluster/CLMAP_ENTRIES;
3631
3632 /* Is the indirect block allocated? */
3633 vsmap = vs->vs_imap[ind_block];
3634 if(vsmap == (vs_map_t) NULL)
3635 return vsmap;
3636 } else
3637 vsmap = vs->vs_dmap;
3638 vsmap += cluster%CLMAP_ENTRIES;
3639 return vsmap;
3640 }
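/*
 * Minimal sketch of the lookup above (illustrative only, mirroring the
 * code rather than adding to it):
 *
 *	cluster = atop_32(offset) >> vs->vs_clshift;
 *	base    = vs->vs_indirect ? vs->vs_imap[cluster / CLMAP_ENTRIES]
 *	                          : vs->vs_dmap;
 *	entry   = base ? &base[cluster % CLMAP_ENTRIES] : NULL;
 *
 * i.e. indirect vstructs take a two-level path, and a NULL indirect
 * block means the cluster was never allocated.
 */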
3641
3642 kern_return_t
3643 vs_cluster_transfer(
3644 vstruct_t vs,
3645 dp_offset_t offset,
3646 dp_size_t cnt,
3647 upl_t upl)
3648 {
3649 dp_offset_t actual_offset;
3650 paging_segment_t ps;
3651 struct clmap clmap;
3652 kern_return_t error = KERN_SUCCESS;
3653 unsigned int size, size_wanted;
3654 int i;
3655 unsigned int residual = 0;
3656 unsigned int unavail_size;
3657 // default_pager_thread_t *dpt;
3658 // boolean_t dealloc;
3659 struct vs_map *vsmap_ptr = NULL;
3660 struct vs_map read_vsmap;
3661 struct vs_map original_read_vsmap;
3662 struct vs_map write_vsmap;
3663 // upl_t sync_upl;
3664 // vm_offset_t ioaddr;
3665
3666 /* vs_cluster_transfer reads in the pages of a cluster and
3667 * then writes these pages back to new backing store. The
3668 * segment the pages are being read from is assumed to have
3669 * been taken off-line and is no longer considered for new
3670 * space requests.
3671 */
3672
3673 /*
3674 * This loop will be executed once per cluster referenced.
3675 * Typically this means once, since it's unlikely that the
3676 * VM system will ask for anything spanning cluster boundaries.
3677 *
3678 * If there are holes in a cluster (in a paging segment), we stop
3679 * reading at the hole, then loop again, hoping to
3680 * find valid pages later in the cluster. This continues until
3681 * the entire range has been examined, and read, if present. The
3682 * pages are written as they are read. If a failure occurs after
3683 * some pages are written the unmap call at the bottom of the loop
3684 * recovers the backing store and the old backing store remains
3685 * in effect.
3686 */
3687
3688 VSM_CLR(write_vsmap);
3689 VSM_CLR(original_read_vsmap);
3690 /* grab the actual object's pages to sync with I/O */
3691 while (cnt && (error == KERN_SUCCESS)) {
3692 vsmap_ptr = vs_get_map_entry(vs, offset);
3693 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3694
3695 if (actual_offset == (dp_offset_t) -1) {
3696
3697 /*
3698 * Nothing left to write in this cluster; at least
3699 * set the write cluster information for any previous
3700 * write, and clear it for the next cluster, if there is one
3701 */
3702 unsigned int local_size, clmask, clsize;
3703
3704 clsize = vm_page_size << vs->vs_clshift;
3705 clmask = clsize - 1;
3706 local_size = clsize - (offset & clmask);
3707 ASSERT(local_size);
3708 local_size = MIN(local_size, cnt);
3709
3710 /* This cluster has no data in it beyond what may */
3711 /* have been found on a previous iteration through */
3712 /* the loop "write_vsmap" */
3713 *vsmap_ptr = write_vsmap;
3714 VSM_CLR(write_vsmap);
3715 VSM_CLR(original_read_vsmap);
3716
3717 cnt -= local_size;
3718 offset += local_size;
3719 continue;
3720 }
3721
3722 /*
3723 * Count up contiguous available or unavailable
3724 * pages.
3725 */
3726 ps = CLMAP_PS(clmap);
3727 ASSERT(ps);
3728 size = 0;
3729 unavail_size = 0;
3730 for (i = 0;
3731 (size < cnt) && (unavail_size < cnt) &&
3732 (i < CLMAP_NPGS(clmap)); i++) {
3733 if (CLMAP_ISSET(clmap, i)) {
3734 if (unavail_size != 0)
3735 break;
3736 size += vm_page_size;
3737 BS_STAT(ps->ps_bs,
3738 ps->ps_bs->bs_pages_in++);
3739 } else {
3740 if (size != 0)
3741 break;
3742 unavail_size += vm_page_size;
3743 }
3744 }
3745
3746 if (size == 0) {
3747 ASSERT(unavail_size);
3748 ps_clunmap(vs, offset, unavail_size);
3749 cnt -= unavail_size;
3750 offset += unavail_size;
3751 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3752 == 0) {
3753 /* There is no more to transfer in this
3754 cluster
3755 */
3756 *vsmap_ptr = write_vsmap;
3757 VSM_CLR(write_vsmap);
3758 VSM_CLR(original_read_vsmap);
3759 }
3760 continue;
3761 }
3762
3763 if(VSM_ISCLR(original_read_vsmap))
3764 original_read_vsmap = *vsmap_ptr;
3765
3766 if(ps->ps_segtype == PS_PARTITION) {
3767 panic("swap partition not supported\n");
3768 /*NOTREACHED*/
3769 error = KERN_FAILURE;
3770 residual = size;
3771 /*
3772 NEED TO ISSUE WITH SYNC & NO COMMIT
3773 error = ps_read_device(ps, actual_offset, &buffer,
3774 size, &residual, flags);
3775 */
3776 } else {
3777 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3778 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3779 size, &residual,
3780 (UPL_IOSYNC | UPL_NOCOMMIT));
3781 }
3782
3783 read_vsmap = *vsmap_ptr;
3784
3785
3786 /*
3787 * Adjust counts and put data in new BS. Optimize for the
3788 * common case, i.e. no error and/or partial data.
3789 * If there was an error, then we need to error the entire
3790 * range, even if some data was successfully read.
3791 *
3792 */
3793 if ((error == KERN_SUCCESS) && (residual == 0)) {
3794
3795 /*
3796 * Got everything we asked for, supply the data to
3797 * the new BS. Note that as a side effect of supplying
3798 * the data, the buffer holding the supplied data is
3799 * deallocated from the pager's address space unless
3800 * the write is unsuccessful.
3801 */
3802
3803 /* note: the buffer will be cleaned up in all cases by */
3804 /* internal_cluster_write or, if there is an error on */
3805 /* the write, by the vm_map_copy_page_discard call */
3806 *vsmap_ptr = write_vsmap;
3807
3808 if(vs_cluster_write(vs, upl, offset,
3809 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3810 error = KERN_FAILURE;
3811 if(!(VSM_ISCLR(*vsmap_ptr))) {
3812 /* unmap the new backing store object */
3813 ps_clunmap(vs, offset, size);
3814 }
3815 /* original vsmap */
3816 *vsmap_ptr = original_read_vsmap;
3817 VSM_CLR(write_vsmap);
3818 } else {
3819 if((offset + size) &
3820 ((vm_page_size << vs->vs_clshift)
3821 - 1)) {
3822 /* There is more to transfer in this
3823 cluster
3824 */
3825 write_vsmap = *vsmap_ptr;
3826 *vsmap_ptr = read_vsmap;
3827 ps_clunmap(vs, offset, size);
3828 } else {
3829 /* discard the old backing object */
3830 write_vsmap = *vsmap_ptr;
3831 *vsmap_ptr = read_vsmap;
3832 ps_clunmap(vs, offset, size);
3833 *vsmap_ptr = write_vsmap;
3834 VSM_CLR(write_vsmap);
3835 VSM_CLR(original_read_vsmap);
3836 }
3837 }
3838 } else {
3839 size_wanted = size;
3840 if (error == KERN_SUCCESS) {
3841 if (residual == size) {
3842 /*
3843 * If a read operation returns no error
3844 * and no data moved, we turn it into
3845 * an error, assuming we're reading at
3846 * or beyond EOF.
3847 * Fall through and error the entire
3848 * range.
3849 */
3850 error = KERN_FAILURE;
3851 *vsmap_ptr = write_vsmap;
3852 if(!(VSM_ISCLR(*vsmap_ptr))) {
3853 /* unmap the new backing store object */
3854 ps_clunmap(vs, offset, size);
3855 }
3856 *vsmap_ptr = original_read_vsmap;
3857 VSM_CLR(write_vsmap);
3858 continue;
3859 } else {
3860 /*
3861 * Otherwise, we have a partial read.
3862 * This is also considered an error
3863 * for the purposes of cluster transfer
3864 */
3865 error = KERN_FAILURE;
3866 *vsmap_ptr = write_vsmap;
3867 if(!(VSM_ISCLR(*vsmap_ptr))) {
3868 /* unmap the new backing store object */
3869 ps_clunmap(vs, offset, size);
3870 }
3871 *vsmap_ptr = original_read_vsmap;
3872 VSM_CLR(write_vsmap);
3873 continue;
3874 }
3875 }
3876
3877 }
3878 cnt -= size;
3879 offset += size;
3880
3881 } /* END while (cnt && (error == 0)) */
3882 if(!VSM_ISCLR(write_vsmap))
3883 *vsmap_ptr = write_vsmap;
3884
3885 return error;
3886 }
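/*
 * Illustrative summary of the map-entry juggling in vs_cluster_transfer()
 * (not part of the original source): original_read_vsmap snapshots the
 * entry as it referred to the old segment, read_vsmap is the entry as
 * seen just before each write, and write_vsmap accumulates the mapping
 * built up in the new segment by vs_cluster_write().  As each chunk
 * succeeds, the old backing store for it is released with ps_clunmap(),
 * and once a cluster is complete write_vsmap is installed; on any failure
 * the entry is restored from original_read_vsmap so the old backing store
 * remains in effect.
 */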
3887
3888 kern_return_t
3889 default_pager_add_file(
3890 MACH_PORT_FACE backing_store,
3891 vnode_ptr_t vp,
3892 int record_size,
3893 vm_size_t size)
3894 {
3895 backing_store_t bs;
3896 paging_segment_t ps;
3897 int i;
3898 unsigned int j;
3899 int error;
3900
3901 if ((bs = backing_store_lookup(backing_store))
3902 == BACKING_STORE_NULL)
3903 return KERN_INVALID_ARGUMENT;
3904
3905 PSL_LOCK();
3906 for (i = 0; i <= paging_segment_max; i++) {
3907 ps = paging_segments[i];
3908 if (ps == PAGING_SEGMENT_NULL)
3909 continue;
3910 if (ps->ps_segtype != PS_FILE)
3911 continue;
3912
3913 /*
3914 * Check for overlap on same device.
3915 */
3916 if (ps->ps_vnode == (struct vnode *)vp) {
3917 PSL_UNLOCK();
3918 BS_UNLOCK(bs);
3919 return KERN_INVALID_ARGUMENT;
3920 }
3921 }
3922 PSL_UNLOCK();
3923
3924 /*
3925 * Set up the paging segment
3926 */
3927 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3928 if (ps == PAGING_SEGMENT_NULL) {
3929 BS_UNLOCK(bs);
3930 return KERN_RESOURCE_SHORTAGE;
3931 }
3932
3933 ps->ps_segtype = PS_FILE;
3934 ps->ps_vnode = (struct vnode *)vp;
3935 ps->ps_offset = 0;
3936 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3937 assert((dp_size_t) size == size);
3938 ps->ps_recnum = (dp_size_t) size;
3939 ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
3940
3941 ps->ps_pgcount = ps->ps_pgnum;
3942 ps->ps_clshift = local_log2(bs->bs_clsize);
3943 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3944 ps->ps_special_clusters = 0;
3945 ps->ps_hint = 0;
3946
3947 PS_LOCK_INIT(ps);
3948 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3949 if (!ps->ps_bmap) {
3950 kfree(ps, sizeof *ps);
3951 BS_UNLOCK(bs);
3952 return KERN_RESOURCE_SHORTAGE;
3953 }
3954 for (j = 0; j < ps->ps_ncls; j++) {
3955 clrbit(ps->ps_bmap, j);
3956 }
3957
3958 if(paging_segment_count == 0) {
3959 ps->ps_state = PS_EMERGENCY_SEGMENT;
3960 if(use_emergency_swap_file_first) {
3961 ps->ps_state |= PS_CAN_USE;
3962 }
3963 emergency_segment_backing_store = backing_store;
3964 } else {
3965 ps->ps_state = PS_CAN_USE;
3966 }
3967
3968 ps->ps_bs = bs;
3969
3970 if ((error = ps_enter(ps)) != 0) {
3971 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3972 kfree(ps, sizeof *ps);
3973 BS_UNLOCK(bs);
3974 return KERN_RESOURCE_SHORTAGE;
3975 }
3976
3977 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3978 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3979 PSL_LOCK();
3980 if(IS_PS_OK_TO_USE(ps)) {
3981 dp_pages_free += ps->ps_pgcount;
3982 } else {
3983 dp_pages_reserve += ps->ps_pgcount;
3984 }
3985 PSL_UNLOCK();
3986
3987 BS_UNLOCK(bs);
3988
3989 bs_more_space(ps->ps_clcount);
3990
3991 DP_DEBUG(DEBUG_BS_INTERNAL,
3992 ("vnode=%p,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3993 ps->ps_vnode, ps->ps_offset, (dp_size_t) size, record_size,
3994 ps->ps_record_shift, ps->ps_pgnum));
3995
3996 /*
3997 * If the paging segment being activated is not the emergency
3998 * segment and we notice that the emergency segment is being
3999 * used, then we help recover it.  If all goes well, the
4000 * emergency segment will be back to its original state of
4001 * online but not activated (until it is needed again).
4002 */
4003 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4004 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4005 if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4006 dprintf(("Failed to recover emergency paging segment\n"));
4007 } else {
4008 dprintf(("Recovered emergency paging segment\n"));
4009 }
4010 }
4011
4012 return KERN_SUCCESS;
4013 }
4014
4015
4016
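/*
 * ps_read_file: page data in from the paging file backing "ps".
 * "offset" is relative to the segment; the file offset is formed by
 * adding ps_offset.  The read goes through vnode_pagein(), which
 * either satisfies the whole request or fails, so *residualp is
 * always zero on success.
 */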
4017 kern_return_t
4018 ps_read_file(
4019 paging_segment_t ps,
4020 upl_t upl,
4021 upl_offset_t upl_offset,
4022 dp_offset_t offset,
4023 upl_size_t size,
4024 unsigned int *residualp,
4025 int flags)
4026 {
4027 vm_object_offset_t f_offset;
4028 int error = 0;
4029 int result;
4030
4031 assert(dp_encryption_inited);
4032
4033 clustered_reads[atop_32(size)]++;
4034
4035 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4036
4037 /*
4038 * For the transfer case we need to pass the upl_offset and the flags.
4039 */
4040 assert((upl_size_t) size == size);
4041 error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
4042
4043 /*
4044 * The vnode_pagein semantic is somewhat at odds with the existing
4045 * device_read semantic: partial reads do not occur at this level.
4046 * The bitmap and cluster-read code check that requested locations are
4047 * backed; the pagein code reads all requested data or returns an error.
4048 */
4049
4050 if (error)
4051 result = KERN_FAILURE;
4052 else {
4053 *residualp = 0;
4054 result = KERN_SUCCESS;
4055 }
4056 return result;
4057 }
4058
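/*
 * ps_write_file: page data out to the paging file backing "ps" via
 * vnode_pageout().  If the UPL is marked UPL_PAGING_ENCRYPTED, its
 * pages are encrypted before the write is issued.
 */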
4059 kern_return_t
4060 ps_write_file(
4061 paging_segment_t ps,
4062 upl_t upl,
4063 upl_offset_t upl_offset,
4064 dp_offset_t offset,
4065 unsigned int size,
4066 int flags)
4067 {
4068 vm_object_offset_t f_offset;
4069 kern_return_t result;
4070
4071 assert(dp_encryption_inited);
4072
4073 clustered_writes[atop_32(size)]++;
4074 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4075
4076 if (flags & UPL_PAGING_ENCRYPTED) {
4077 /*
4078 * ENCRYPTED SWAP:
4079 * encrypt all the pages that we're going
4080 * to pageout.
4081 */
4082 upl_encrypt(upl, upl_offset, size);
4083 }
4084 assert((upl_size_t) size == size);
4085 if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
4086 result = KERN_FAILURE;
4087 else
4088 result = KERN_SUCCESS;
4089
4090 return result;
4091 }
4092
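/*
 * default_pager_triggers: control interface used by the backing-store
 * manager (normally the user-space dynamic_pager daemon) to register
 * the high/low watermark notification ports, to turn swap encryption
 * on or off before the encryption state has been established, and to
 * report swap file creation errors.  Whichever send right is no longer
 * needed (the caller's trigger_port or a previously registered port)
 * is collected in "release" and dropped once the lock is released.
 * Illustrative only: a manager would typically reach this through the
 * macx_triggers() trap, e.g.
 *	macx_triggers(hi_water, low_water, HI_WAT_ALERT, alert_port);
 */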
4093 kern_return_t
4094 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
4095 int hi_wat,
4096 int lo_wat,
4097 int flags,
4098 MACH_PORT_FACE trigger_port)
4099 {
4100 MACH_PORT_FACE release;
4101 kern_return_t kr;
4102 clock_sec_t now;
4103 clock_nsec_t nanoseconds_dummy;
4104 static clock_sec_t error_notify = 0;
4105
4106 PSL_LOCK();
4107 if (flags == SWAP_ENCRYPT_ON) {
4108 /* ENCRYPTED SWAP: turn encryption on */
4109 release = trigger_port;
4110 if (!dp_encryption_inited) {
4111 dp_encryption_inited = TRUE;
4112 dp_encryption = TRUE;
4113 kr = KERN_SUCCESS;
4114 } else {
4115 kr = KERN_FAILURE;
4116 }
4117 } else if (flags == SWAP_ENCRYPT_OFF) {
4118 /* ENCRYPTED SWAP: turn encryption off */
4119 release = trigger_port;
4120 if (!dp_encryption_inited) {
4121 dp_encryption_inited = TRUE;
4122 dp_encryption = FALSE;
4123 kr = KERN_SUCCESS;
4124 } else {
4125 kr = KERN_FAILURE;
4126 }
4127 } else if (flags == HI_WAT_ALERT) {
4128 release = min_pages_trigger_port;
4129 min_pages_trigger_port = trigger_port;
4130 minimum_pages_remaining = hi_wat/vm_page_size;
4131 bs_low = FALSE;
4132 kr = KERN_SUCCESS;
4133 } else if (flags == LO_WAT_ALERT) {
4134 release = max_pages_trigger_port;
4135 max_pages_trigger_port = trigger_port;
4136 maximum_pages_free = lo_wat/vm_page_size;
4137 kr = KERN_SUCCESS;
4138 } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4139 use_emergency_swap_file_first = TRUE;
4140 release = trigger_port;
4141 kr = KERN_SUCCESS;
4142 } else if (flags == SWAP_FILE_CREATION_ERROR) {
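/*
 * A swap file could not be created.  If the emergency segment is
 * all we have, allow it to be used, then run the no-paging-space
 * handling; the warning is rate-limited to one every 5 seconds.
 */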
4143 release = trigger_port;
4144 kr = KERN_SUCCESS;
4145 if( paging_segment_count == 1) {
4146 use_emergency_swap_file_first = TRUE;
4147 }
4148 no_paging_space_action();
4149 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4150 if (now > error_notify + 5) {
4151 dprintf(("Swap File Error.\n"));
4152 error_notify = now;
4153 }
4154 } else {
4155 release = trigger_port;
4156 kr = KERN_INVALID_ARGUMENT;
4157 }
4158 PSL_UNLOCK();
4159
4160 if (IP_VALID(release))
4161 ipc_port_release_send(release);
4162
4163 return kr;
4164 }
4165
4166 /*
4167 * Monitor the amount of available backing store vs. the amount of
4168 * required backing store, notify a listener (if present) when
4169 * backing store may safely be removed.
4170 *
4171 * We attempt to avoid the situation where backing store is
4172 * discarded en masse, as this can lead to thrashing as the
4173 * backing store is compacted.
4174 */
4175
4176 #define PF_INTERVAL 3 /* time between free level checks */
4177 #define PF_LATENCY 10 /* number of intervals before release */
4178
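/* consecutive checks that found dp_pages_free above maximum_pages_free */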
4179 static int dp_pages_free_low_count = 0;
4180 thread_call_t default_pager_backing_store_monitor_callout;
4181
4182 void
4183 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4184 __unused thread_call_param_t p2)
4185 {
4186 // unsigned long long average;
4187 ipc_port_t trigger;
4188 uint64_t deadline;
4189
4190 /*
4191 * We determine whether it is safe to release some backing
4192 * store by watching the free page level.  If it remains
4193 * above the maximum_pages_free threshold for at least
4194 * PF_LATENCY consecutive checks (taken every PF_INTERVAL
4195 * seconds), we deem it safe and send the LO_WAT_ALERT.
4196 *
4197 * Note that this establishes a maximum rate at which backing
4198 * store will be released, as each notification (currently)
4199 * only results in a single backing store object being
4200 * released.
4201 */
4202 if (dp_pages_free > maximum_pages_free) {
4203 dp_pages_free_low_count++;
4204 } else {
4205 dp_pages_free_low_count = 0;
4206 }
4207
4208 /* decide whether to send notification */
4209 trigger = IP_NULL;
4210 if (max_pages_trigger_port &&
4211 (backing_store_release_trigger_disable == 0) &&
4212 (dp_pages_free_low_count > PF_LATENCY)) {
4213 trigger = max_pages_trigger_port;
4214 max_pages_trigger_port = NULL;
4215 }
4216
4217 /* send notification */
4218 if (trigger != IP_NULL) {
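/*
 * If a backing-store release is already in progress
 * (backing_store_release_trigger_disable is non-zero), wait
 * for it to finish before sending the alert.
 */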
4219 VSL_LOCK();
4220 if(backing_store_release_trigger_disable != 0) {
4221 assert_wait((event_t)
4222 &backing_store_release_trigger_disable,
4223 THREAD_UNINT);
4224 VSL_UNLOCK();
4225 thread_block(THREAD_CONTINUE_NULL);
4226 } else {
4227 VSL_UNLOCK();
4228 }
4229 default_pager_space_alert(trigger, LO_WAT_ALERT);
4230 ipc_port_release_send(trigger);
4231 dp_pages_free_low_count = 0;
4232 }
4233
4234 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4235 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4236 }