/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Paging File Management.
 */
#include <mach/host_priv.h>
#include <mach/memory_object_control.h>
#include <mach/memory_object_server.h>

#include <default_pager/default_pager_internal.h>
#include <default_pager/default_pager_alerts.h>
#include <default_pager/default_pager_object_server.h>

#include <ipc/ipc_types.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>

#include <kern/kern_types.h>
#include <kern/host.h>
#include <kern/queue.h>
#include <kern/counters.h>
#include <kern/sched_prim.h>

#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_protos.h>
/* todo - need large internal object support */

/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, then this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future
 */
#define ALLOC_STRIDE	(1024 * 1024 * 1024)
int physical_transfer_cluster_count = 0;

#define VM_SUPER_CLUSTER	0x40000
#define VM_SUPER_PAGES		(VM_SUPER_CLUSTER / PAGE_SIZE)

/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 */
#define VSTRUCT_MIN_CLSHIFT	0

#define VSTRUCT_DEF_CLSHIFT	2
int default_pager_clsize = 0;

int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;

unsigned int clustered_writes[VM_SUPER_PAGES+1];
unsigned int clustered_reads[VM_SUPER_PAGES+1];
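/*
 * Illustrative arithmetic (editor's note, not original source): with 4 KB
 * pages, VM_SUPER_CLUSTER = 0x40000 bytes gives VM_SUPER_PAGES = 64, so the
 * clustered_writes/clustered_reads histograms above have 65 buckets
 * (0..64 pages per I/O).  Pages per vstruct cluster is 1 << clshift, i.e.
 * 4 pages for VSTRUCT_DEF_CLSHIFT == 2.
 */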
/*
 * Globals used for asynchronous paging operations:
 *	vs_async_list: head of list of to-be-completed I/O ops
 *	async_num_queued: number of pages completed, but not yet
 *		processed by async thread.
 *	async_requests_out: number of pages of requests not completed.
 */
struct vs_async	*vs_async_list;
int		async_num_queued;
int		async_requests_out;

#define VS_ASYNC_REUSE 1
struct vs_async	*vs_async_free_list;

lck_mtx_t	default_pager_async_lock;	/* Protects globals above */

int	vs_alloc_async_failed = 0;		/* statistics */
int	vs_alloc_async_count = 0;		/* statistics */
struct vs_async *vs_alloc_async(void);		/* forward */
void	vs_free_async(struct vs_async *vsa);	/* forward */

#define VS_ALLOC_ASYNC()	vs_alloc_async()
#define VS_FREE_ASYNC(vsa)	vs_free_async(vsa)

#define VS_ASYNC_LOCK()		lck_mtx_lock(&default_pager_async_lock)
#define VS_ASYNC_UNLOCK()	lck_mtx_unlock(&default_pager_async_lock)
#define VS_ASYNC_LOCK_INIT()	lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
#define VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)
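/*
 * Editor's sketch (illustrative, not original source): the globals above are
 * expected to be touched only between VS_ASYNC_LOCK()/VS_ASYNC_UNLOCK(), e.g.:
 */
#if 0	/* example only -- not compiled */
static void
example_async_counter_update(void)
{
	VS_ASYNC_LOCK();
	vs_alloc_async_count++;	/* any global guarded by default_pager_async_lock */
	VS_ASYNC_UNLOCK();
}
#endif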
/*
 * Paging Space Hysteresis triggers and the target notification port
 */
unsigned int	dp_pages_free_drift_count = 0;
unsigned int	dp_pages_free_drifted_max = 0;
unsigned int	minimum_pages_remaining = 0;
unsigned int	maximum_pages_free = 0;
ipc_port_t	min_pages_trigger_port = NULL;
ipc_port_t	max_pages_trigger_port = NULL;

boolean_t	use_emergency_swap_file_first = FALSE;
boolean_t	bs_low = FALSE;
int		backing_store_release_trigger_disable = 0;
boolean_t	backing_store_stop_compaction = FALSE;

/* Have we decided if swap needs to be encrypted yet ? */
boolean_t	dp_encryption_inited = FALSE;
/* Should we encrypt swap ? */
boolean_t	dp_encryption = FALSE;

boolean_t	dp_isssd = FALSE;
/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */

/*
 * List of all backing store and segments.
 */
MACH_PORT_FACE			emergency_segment_backing_store;
struct backing_store_list_head	backing_store_list;
paging_segment_t		paging_segments[MAX_NUM_PAGING_SEGMENTS];
lck_mtx_t			paging_segments_lock;
int				paging_segment_max = 0;
int				paging_segment_count = 0;
int				ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };

/*
 * Total pages free in system
 * This differs from clusters committed/avail which is a measure of the
 * over commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 */
unsigned int	dp_pages_free = 0;
unsigned int	dp_pages_reserve = 0;
unsigned int	cluster_transfer_minimum = 100;
/* forward declarations */
kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int);	/* forward */
kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int);	/* forward */
default_pager_thread_t *get_read_buffer( void );
kern_return_t ps_vstruct_transfer_from_segment(
	vstruct_t	 vs,
	paging_segment_t segment,
	upl_t		 upl);
kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);	/* forward */
kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *);	/* forward */
kern_return_t vs_cluster_transfer(
	vstruct_t	vs,
	dp_offset_t	offset,
	dp_size_t	cnt,
	upl_t		upl);
vs_map_t vs_get_map_entry(
	vstruct_t	vs,
	dp_offset_t	offset);

kern_return_t
default_pager_backing_store_delete_internal( MACH_PORT_FACE );
default_pager_thread_t *
get_read_buffer( void )
{
	int	i;

	DPT_LOCK(dpt_lock);
	while(TRUE) {
		for (i=0; i<default_pager_internal_count; i++) {
			if(dpt_array[i]->checked_out == FALSE) {
				dpt_array[i]->checked_out = TRUE;
				DPT_UNLOCK(dpt_lock);
				return dpt_array[i];
			}
		}
		DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
	}
}
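/*
 * Editor's note (illustrative): ps_read_device() below uses this helper when
 * a device returns less data than requested; it stages the partial reads in
 * dpt->dpt_buffer and releases the thread structure by clearing
 * dpt->checked_out and issuing a wakeup on &dpt_array when it is done.
 */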
254 * List of all backing store.
257 queue_init(&backing_store_list
.bsl_queue
);
260 VS_ASYNC_LOCK_INIT();
262 vs_async_free_list
= NULL
;
263 #endif /* VS_ASYNC_REUSE */
265 for (i
= 0; i
< VM_SUPER_PAGES
+ 1; i
++) {
266 clustered_writes
[i
] = 0;
267 clustered_reads
[i
] = 0;
/*
 * When things do not quite work out...
 */
void bs_no_paging_space(boolean_t);	/* forward */

void
bs_no_paging_space(
	boolean_t out_of_memory)
{

	if (out_of_memory)
		dprintf(("*** OUT OF MEMORY ***\n"));
	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
}

void bs_more_space(int);	/* forward */
void bs_commit(int);		/* forward */

boolean_t	user_warned = FALSE;
unsigned int	clusters_committed = 0;
unsigned int	clusters_available = 0;
unsigned int	clusters_committed_peak = 0;
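/*
 * Editor's note (illustrative): bs_more_space() and bs_commit() below track
 * over-commitment as clusters_committed - clusters_available; when that
 * difference goes positive the "paging space over-committed" warning is
 * printed, and clusters_committed_peak records the worst shortfall seen so
 * the recovery message can report it.
 */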
301 * Account for new paging space.
303 clusters_available
+= nclusters
;
305 if (clusters_available
>= clusters_committed
) {
306 if (verbose
&& user_warned
) {
307 printf("%s%s - %d excess clusters now.\n",
309 "paging space is OK now",
310 clusters_available
- clusters_committed
);
312 clusters_committed_peak
= 0;
315 if (verbose
&& user_warned
) {
316 printf("%s%s - still short of %d clusters.\n",
318 "WARNING: paging space over-committed",
319 clusters_committed
- clusters_available
);
320 clusters_committed_peak
-= nclusters
;
333 clusters_committed
+= nclusters
;
334 if (clusters_committed
> clusters_available
) {
335 if (verbose
&& !user_warned
) {
337 printf("%s%s - short of %d clusters.\n",
339 "WARNING: paging space over-committed",
340 clusters_committed
- clusters_available
);
342 if (clusters_committed
> clusters_committed_peak
) {
343 clusters_committed_peak
= clusters_committed
;
346 if (verbose
&& user_warned
) {
347 printf("%s%s - was short of up to %d clusters.\n",
349 "paging space is OK now",
350 clusters_committed_peak
- clusters_available
);
352 clusters_committed_peak
= 0;
360 int default_pager_info_verbose
= 1;
367 uint64_t pages_total
, pages_free
;
372 pages_total
= pages_free
= 0;
373 for (i
= 0; i
<= paging_segment_max
; i
++) {
374 ps
= paging_segments
[i
];
375 if (ps
== PAGING_SEGMENT_NULL
)
		/*
		 * no need to lock: by the time this data
		 * gets back to any remote requestor it
		 * will be obsolete anyway
		 */
383 pages_total
+= ps
->ps_pgnum
;
384 pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
385 DP_DEBUG(DEBUG_BS_INTERNAL
,
386 ("segment #%d: %d total, %d free\n",
387 i
, ps
->ps_pgnum
, ps
->ps_clcount
<< ps
->ps_clshift
));
389 *totalp
= pages_total
;
391 if (verbose
&& user_warned
&& default_pager_info_verbose
) {
392 if (clusters_available
< clusters_committed
) {
393 printf("%s %d clusters committed, %d available.\n",
402 backing_store_t
backing_store_alloc(void); /* forward */
405 backing_store_alloc(void)
409 bs
= (backing_store_t
) kalloc(sizeof (struct backing_store
));
410 if (bs
== BACKING_STORE_NULL
)
411 panic("backing_store_alloc: no memory");
414 bs
->bs_port
= MACH_PORT_NULL
;
417 bs
->bs_pages_total
= 0;
419 bs
->bs_pages_in_fail
= 0;
420 bs
->bs_pages_out
= 0;
421 bs
->bs_pages_out_fail
= 0;
426 backing_store_t
backing_store_lookup(MACH_PORT_FACE
); /* forward */
/* Even in both the component space and external versions of this pager, */
/* backing_store_lookup will be called from tasks in the application space */
backing_store_t
backing_store_lookup(
	MACH_PORT_FACE port)
{
	backing_store_t	bs;

/*
	port is currently backed with a vs structure in the alias field
	we could create an ISBS alias and a port_is_bs call but frankly
	I see no reason for the test, the bs->port == port check below
	will work properly on junk entries.

	if ((port == MACH_PORT_NULL) || port_is_vs(port))
*/
	if ((port == MACH_PORT_NULL))
		return BACKING_STORE_NULL;
448 queue_iterate(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
451 if (bs
->bs_port
== port
) {
453 /* Success, return it locked. */
459 return BACKING_STORE_NULL
;
462 void backing_store_add(backing_store_t
); /* forward */
466 __unused backing_store_t bs
)
468 // MACH_PORT_FACE port = bs->bs_port;
469 // MACH_PORT_FACE pset = default_pager_default_set;
470 kern_return_t kr
= KERN_SUCCESS
;
472 if (kr
!= KERN_SUCCESS
)
473 panic("backing_store_add: add to set");
478 * Set up default page shift, but only if not already
479 * set and argument is within range.
482 bs_set_default_clsize(unsigned int npages
)
489 if (default_pager_clsize
== 0) /* if not yet set */
490 vstruct_def_clshift
= local_log2(npages
);
496 int bs_get_global_clsize(int clsize
); /* forward */
499 bs_get_global_clsize(
503 memory_object_default_t dmm
;
507 * Only allow setting of cluster size once. If called
508 * with no cluster size (default), we use the compiled-in default
509 * for the duration. The same cluster size is used for all
512 if (default_pager_clsize
== 0) {
514 * Keep cluster size in bit shift because it's quicker
515 * arithmetic, and easier to keep at a power of 2.
517 if (clsize
!= NO_CLSIZE
) {
518 for (i
= 0; (1 << i
) < clsize
; i
++);
519 if (i
> MAX_CLUSTER_SHIFT
)
520 i
= MAX_CLUSTER_SHIFT
;
521 vstruct_def_clshift
= i
;
523 default_pager_clsize
= (1 << vstruct_def_clshift
);
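		/*
		 * Editor's sketch (illustrative, not original source): the loop
		 * above rounds the requested cluster size up to a power of two
		 * by finding the smallest shift that covers it, capped at
		 * MAX_CLUSTER_SHIFT.  A hypothetical stand-alone restatement:
		 */
#if 0	/* example only -- not compiled */
static int
example_cluster_shift(int clsize_pages, int max_shift)
{
	int i;

	for (i = 0; (1 << i) < clsize_pages; i++)
		continue;
	/* e.g. clsize_pages = 5 -> i = 3 (an 8-page cluster); 4 -> i = 2 */
	return (i > max_shift) ? max_shift : i;
}
#endif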
526 * Let the user know the new (and definitive) cluster size.
529 printf("%scluster size = %d page%s\n",
530 my_name
, default_pager_clsize
,
531 (default_pager_clsize
== 1) ? "" : "s");
534 * Let the kernel know too, in case it hasn't used the
535 * default value provided in main() yet.
537 dmm
= default_pager_object
;
538 clsize
= default_pager_clsize
* vm_page_size
; /* in bytes */
539 kr
= host_default_memory_manager(host_priv_self(),
542 memory_object_default_deallocate(dmm
);
544 if (kr
!= KERN_SUCCESS
) {
545 panic("bs_get_global_cl_size:host_default_memory_manager");
547 if (dmm
!= default_pager_object
) {
548 panic("bs_get_global_cl_size:there is another default pager");
551 ASSERT(default_pager_clsize
> 0 &&
552 (default_pager_clsize
& (default_pager_clsize
- 1)) == 0);
554 return default_pager_clsize
;
558 default_pager_backing_store_create(
559 memory_object_default_t pager
,
561 int clsize
, /* in bytes */
562 MACH_PORT_FACE
*backing_store
)
567 struct vstruct_alias
*alias_struct
;
569 if (pager
!= default_pager_object
)
570 return KERN_INVALID_ARGUMENT
;
572 bs
= backing_store_alloc();
573 port
= ipc_port_alloc_kernel();
574 ipc_port_make_send(port
);
575 assert (port
!= IP_NULL
);
577 DP_DEBUG(DEBUG_BS_EXTERNAL
,
578 ("priority=%d clsize=%d bs_port=0x%x\n",
579 priority
, clsize
, (int) backing_store
));
581 alias_struct
= (struct vstruct_alias
*)
582 kalloc(sizeof (struct vstruct_alias
));
583 if(alias_struct
!= NULL
) {
584 alias_struct
->vs
= (struct vstruct
*)bs
;
585 alias_struct
->name
= &default_pager_ops
;
586 port
->alias
= (uintptr_t) alias_struct
;
589 ipc_port_dealloc_kernel((MACH_PORT_FACE
)(port
));
590 kfree(bs
, sizeof (struct backing_store
));
591 return KERN_RESOURCE_SHORTAGE
;
595 if (priority
== DEFAULT_PAGER_BACKING_STORE_MAXPRI
)
596 priority
= BS_MAXPRI
;
597 else if (priority
== BS_NOPRI
)
598 priority
= BS_MAXPRI
;
600 priority
= BS_MINPRI
;
601 bs
->bs_priority
= priority
;
603 bs
->bs_clsize
= bs_get_global_clsize(atop_32(clsize
));
606 queue_enter(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
610 backing_store_add(bs
);
612 *backing_store
= port
;
617 default_pager_backing_store_info(
618 MACH_PORT_FACE backing_store
,
619 backing_store_flavor_t flavour
,
620 backing_store_info_t info
,
621 mach_msg_type_number_t
*size
)
624 backing_store_basic_info_t basic
;
628 if (flavour
!= BACKING_STORE_BASIC_INFO
||
629 *size
< BACKING_STORE_BASIC_INFO_COUNT
)
630 return KERN_INVALID_ARGUMENT
;
632 basic
= (backing_store_basic_info_t
)info
;
633 *size
= BACKING_STORE_BASIC_INFO_COUNT
;
635 VSTATS_LOCK(&global_stats
.gs_lock
);
636 basic
->pageout_calls
= global_stats
.gs_pageout_calls
;
637 basic
->pagein_calls
= global_stats
.gs_pagein_calls
;
638 basic
->pages_in
= global_stats
.gs_pages_in
;
639 basic
->pages_out
= global_stats
.gs_pages_out
;
640 basic
->pages_unavail
= global_stats
.gs_pages_unavail
;
641 basic
->pages_init
= global_stats
.gs_pages_init
;
642 basic
->pages_init_writes
= global_stats
.gs_pages_init_writes
;
643 VSTATS_UNLOCK(&global_stats
.gs_lock
);
645 if ((bs
= backing_store_lookup(backing_store
)) == BACKING_STORE_NULL
)
646 return KERN_INVALID_ARGUMENT
;
648 basic
->bs_pages_total
= bs
->bs_pages_total
;
650 bs
->bs_pages_free
= 0;
651 for (i
= 0; i
<= paging_segment_max
; i
++) {
652 ps
= paging_segments
[i
];
653 if (ps
!= PAGING_SEGMENT_NULL
&& ps
->ps_bs
== bs
) {
655 bs
->bs_pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
660 basic
->bs_pages_free
= bs
->bs_pages_free
;
661 basic
->bs_pages_in
= bs
->bs_pages_in
;
662 basic
->bs_pages_in_fail
= bs
->bs_pages_in_fail
;
663 basic
->bs_pages_out
= bs
->bs_pages_out
;
664 basic
->bs_pages_out_fail
= bs
->bs_pages_out_fail
;
666 basic
->bs_priority
= bs
->bs_priority
;
667 basic
->bs_clsize
= ptoa_32(bs
->bs_clsize
); /* in bytes */
674 int ps_delete(paging_segment_t
); /* forward */
675 boolean_t
current_thread_aborted(void);
682 kern_return_t error
= KERN_SUCCESS
;
	VSL_LOCK();	/* get the lock on the list of vs's */

	/* The lock relationship and sequence is fairly complicated */
	/* this code looks at a live list, locking and unlocking the list */
	/* as it traverses it.  It depends on the locking behavior of */
	/* default_pager_no_senders.  no_senders always locks the vstruct */
	/* targeted for removal before locking the vstruct list.  However */
	/* it will remove that member of the list without locking its */
	/* neighbors.  We can be sure when we hold a lock on a vstruct */
	/* it cannot be removed from the list but we must hold the list */
	/* lock to be sure that its pointers to its neighbors are valid. */
	/* Also, we can hold off destruction of a vstruct when the list */
	/* lock and the vs locks are not being held by bumping the */
	/* vs_async_pending count. */
701 while(backing_store_release_trigger_disable
!= 0) {
702 VSL_SLEEP(&backing_store_release_trigger_disable
, THREAD_UNINT
);
705 /* we will choose instead to hold a send right */
706 vs_count
= vstruct_list
.vsl_count
;
707 vs
= (vstruct_t
) queue_first((queue_entry_t
)&(vstruct_list
.vsl_queue
));
708 if(vs
== (vstruct_t
)&vstruct_list
) {
713 vs_async_wait(vs
); /* wait for any pending async writes */
714 if ((vs_count
!= 0) && (vs
!= NULL
))
715 vs
->vs_async_pending
+= 1; /* hold parties calling */
719 while((vs_count
!= 0) && (vs
!= NULL
)) {
		/* We take the count of AMO's before beginning the */
		/* transfer of the target segment. */
		/* We are guaranteed that the target segment cannot get */
		/* more users.  We also know that queue entries are */
		/* made at the back of the list.  If some of the entries */
		/* we would check disappear while we are traversing the */
		/* list then we will either check new entries which */
		/* do not have any backing store in the target segment */
		/* or re-check old entries.  This might not be optimal */
		/* but it will always be correct.  The alternative is to */
		/* take a snapshot of the list. */
733 if(dp_pages_free
< cluster_transfer_minimum
)
734 error
= KERN_FAILURE
;
736 vm_object_t transfer_object
;
740 transfer_object
= vm_object_allocate((vm_object_size_t
)VM_SUPER_CLUSTER
);
742 error
= vm_object_upl_request(transfer_object
,
743 (vm_object_offset_t
)0, VM_SUPER_CLUSTER
,
745 UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_SET_LITE
| UPL_SET_INTERNAL
);
747 if(error
== KERN_SUCCESS
) {
748 error
= ps_vstruct_transfer_from_segment(
750 upl_commit(upl
, NULL
, 0);
753 error
= KERN_FAILURE
;
755 vm_object_deallocate(transfer_object
);
757 if(error
|| current_thread_aborted() || backing_store_stop_compaction
) {
759 vs
->vs_async_pending
-= 1; /* release vs_async_wait */
760 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
761 vs
->vs_waiting_async
= FALSE
;
763 thread_wakeup(&vs
->vs_async_pending
);
772 while(backing_store_release_trigger_disable
!= 0) {
773 VSL_SLEEP(&backing_store_release_trigger_disable
,
777 next_vs
= (vstruct_t
) queue_next(&(vs
->vs_links
));
778 if((next_vs
!= (vstruct_t
)&vstruct_list
) &&
779 (vs
!= next_vs
) && (vs_count
!= 1)) {
781 vs_async_wait(next_vs
); /* wait for any */
782 /* pending async writes */
783 next_vs
->vs_async_pending
+= 1; /* hold parties */
784 /* calling vs_async_wait */
789 vs
->vs_async_pending
-= 1;
790 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
791 vs
->vs_waiting_async
= FALSE
;
793 thread_wakeup(&vs
->vs_async_pending
);
797 if((vs
== next_vs
) || (next_vs
== (vstruct_t
)&vstruct_list
))
808 default_pager_backing_store_delete_internal(
809 MACH_PORT_FACE backing_store
)
815 int interim_pages_removed
= 0;
816 boolean_t dealing_with_emergency_segment
= ( backing_store
== emergency_segment_backing_store
);
818 if ((bs
= backing_store_lookup(backing_store
)) == BACKING_STORE_NULL
)
819 return KERN_INVALID_ARGUMENT
;
823 error
= KERN_SUCCESS
;
824 for (i
= 0; i
<= paging_segment_max
; i
++) {
825 ps
= paging_segments
[i
];
826 if (ps
!= PAGING_SEGMENT_NULL
&&
828 ! IS_PS_GOING_AWAY(ps
)) {
831 if( IS_PS_GOING_AWAY(ps
) || !IS_PS_OK_TO_USE(ps
)) {
				/*
				 * Someone is already busy reclaiming this paging segment.
				 * If it's the emergency segment we are looking at then check
				 * that someone has not already recovered it and set the right
				 * state i.e. online but not activated.
				 */
842 /* disable access to this segment */
843 ps
->ps_state
&= ~PS_CAN_USE
;
844 ps
->ps_state
|= PS_GOING_AWAY
;
847 * The "ps" segment is "off-line" now,
848 * we can try and delete it...
850 if(dp_pages_free
< (cluster_transfer_minimum
852 error
= KERN_FAILURE
;
856 /* remove all pages associated with the */
857 /* segment from the list of free pages */
858 /* when transfer is through, all target */
859 /* segment pages will appear to be free */
861 dp_pages_free
-= ps
->ps_pgcount
;
862 interim_pages_removed
+= ps
->ps_pgcount
;
864 error
= ps_delete(ps
);
866 if (error
!= KERN_SUCCESS
) {
868 * We couldn't delete the segment,
869 * probably because there's not enough
870 * virtual memory left.
871 * Re-enable all the segments.
880 if (error
!= KERN_SUCCESS
) {
881 for (i
= 0; i
<= paging_segment_max
; i
++) {
882 ps
= paging_segments
[i
];
883 if (ps
!= PAGING_SEGMENT_NULL
&&
885 IS_PS_GOING_AWAY(ps
)) {
888 if( !IS_PS_GOING_AWAY(ps
)) {
892 /* Handle the special clusters that came in while we let go the lock*/
893 if( ps
->ps_special_clusters
) {
894 dp_pages_free
+= ps
->ps_special_clusters
<< ps
->ps_clshift
;
895 ps
->ps_pgcount
+= ps
->ps_special_clusters
<< ps
->ps_clshift
;
896 ps
->ps_clcount
+= ps
->ps_special_clusters
;
897 if ( ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
) {
898 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
900 ps
->ps_special_clusters
= 0;
902 /* re-enable access to this segment */
903 ps
->ps_state
&= ~PS_GOING_AWAY
;
904 ps
->ps_state
|= PS_CAN_USE
;
908 dp_pages_free
+= interim_pages_removed
;
914 for (i
= 0; i
<= paging_segment_max
; i
++) {
915 ps
= paging_segments
[i
];
916 if (ps
!= PAGING_SEGMENT_NULL
&&
918 if(IS_PS_GOING_AWAY(ps
)) {
919 if(IS_PS_EMERGENCY_SEGMENT(ps
)) {
921 ps
->ps_state
&= ~PS_GOING_AWAY
;
922 ps
->ps_special_clusters
= 0;
923 ps
->ps_pgcount
= ps
->ps_pgnum
;
924 ps
->ps_clcount
= ps
->ps_ncls
= ps
->ps_pgcount
>> ps
->ps_clshift
;
925 dp_pages_reserve
+= ps
->ps_pgcount
;
928 paging_segments
[i
] = PAGING_SEGMENT_NULL
;
929 paging_segment_count
--;
931 kfree(ps
->ps_bmap
, RMAPSIZE(ps
->ps_ncls
));
932 kfree(ps
, sizeof *ps
);
938 /* Scan the entire ps array separately to make certain we find the */
939 /* proper paging_segment_max */
940 for (i
= 0; i
< MAX_NUM_PAGING_SEGMENTS
; i
++) {
941 if(paging_segments
[i
] != PAGING_SEGMENT_NULL
)
942 paging_segment_max
= i
;
947 if( dealing_with_emergency_segment
) {
953 * All the segments have been deleted.
954 * We can remove the backing store.
958 * Disable lookups of this backing store.
960 if((void *)bs
->bs_port
->alias
!= NULL
)
961 kfree((void *) bs
->bs_port
->alias
,
962 sizeof (struct vstruct_alias
));
963 ipc_port_dealloc_kernel((ipc_port_t
) (bs
->bs_port
));
964 bs
->bs_port
= MACH_PORT_NULL
;
968 * Remove backing store from backing_store list.
971 queue_remove(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
976 * Free the backing store structure.
978 kfree(bs
, sizeof *bs
);
984 default_pager_backing_store_delete(
985 MACH_PORT_FACE backing_store
)
987 if( backing_store
!= emergency_segment_backing_store
) {
988 default_pager_backing_store_delete_internal(emergency_segment_backing_store
);
990 return(default_pager_backing_store_delete_internal(backing_store
));
993 int ps_enter(paging_segment_t
); /* forward */
1003 for (i
= 0; i
< MAX_NUM_PAGING_SEGMENTS
; i
++) {
1004 if (paging_segments
[i
] == PAGING_SEGMENT_NULL
)
1008 if (i
< MAX_NUM_PAGING_SEGMENTS
) {
1009 paging_segments
[i
] = ps
;
1010 if (i
> paging_segment_max
)
1011 paging_segment_max
= i
;
1012 paging_segment_count
++;
1013 if ((ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_NOPRI
) ||
1014 (ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
))
1015 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
1019 return KERN_RESOURCE_SHORTAGE
;
1026 #ifdef DEVICE_PAGING
1028 default_pager_add_segment(
1029 MACH_PORT_FACE backing_store
,
1030 MACH_PORT_FACE device
,
1036 paging_segment_t ps
;
1040 if ((bs
= backing_store_lookup(backing_store
))
1041 == BACKING_STORE_NULL
)
1042 return KERN_INVALID_ARGUMENT
;
1045 for (i
= 0; i
<= paging_segment_max
; i
++) {
1046 ps
= paging_segments
[i
];
1047 if (ps
== PAGING_SEGMENT_NULL
)
1051 * Check for overlap on same device.
1053 if (!(ps
->ps_device
!= device
1054 || offset
>= ps
->ps_offset
+ ps
->ps_recnum
1055 || offset
+ count
<= ps
->ps_offset
)) {
1058 return KERN_INVALID_ARGUMENT
;
1064 * Set up the paging segment
1066 ps
= (paging_segment_t
) kalloc(sizeof (struct paging_segment
));
1067 if (ps
== PAGING_SEGMENT_NULL
) {
1069 return KERN_RESOURCE_SHORTAGE
;
1072 ps
->ps_segtype
= PS_PARTITION
;
1073 ps
->ps_device
= device
;
1074 ps
->ps_offset
= offset
;
1075 ps
->ps_record_shift
= local_log2(vm_page_size
/ record_size
);
1076 ps
->ps_recnum
= count
;
1077 ps
->ps_pgnum
= count
>> ps
->ps_record_shift
;
1079 ps
->ps_pgcount
= ps
->ps_pgnum
;
1080 ps
->ps_clshift
= local_log2(bs
->bs_clsize
);
1081 ps
->ps_clcount
= ps
->ps_ncls
= ps
->ps_pgcount
>> ps
->ps_clshift
;
1085 ps
->ps_bmap
= (unsigned char *) kalloc(RMAPSIZE(ps
->ps_ncls
));
1087 kfree(ps
, sizeof *ps
);
1089 return KERN_RESOURCE_SHORTAGE
;
1091 for (i
= 0; i
< ps
->ps_ncls
; i
++) {
1092 clrbit(ps
->ps_bmap
, i
);
1095 if(paging_segment_count
== 0) {
1096 ps
->ps_state
= PS_EMERGENCY_SEGMENT
;
1097 if(use_emergency_swap_file_first
) {
1098 ps
->ps_state
|= PS_CAN_USE
;
1101 ps
->ps_state
= PS_CAN_USE
;
1106 if ((error
= ps_enter(ps
)) != 0) {
1107 kfree(ps
->ps_bmap
, RMAPSIZE(ps
->ps_ncls
));
1108 kfree(ps
, sizeof *ps
);
1110 return KERN_RESOURCE_SHORTAGE
;
1113 bs
->bs_pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
1114 bs
->bs_pages_total
+= ps
->ps_clcount
<< ps
->ps_clshift
;
1118 if(IS_PS_OK_TO_USE(ps
)) {
1119 dp_pages_free
+= ps
->ps_pgcount
;
1121 dp_pages_reserve
+= ps
->ps_pgcount
;
1125 bs_more_space(ps
->ps_clcount
);
1127 DP_DEBUG(DEBUG_BS_INTERNAL
,
1128 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1129 device
, offset
, count
, record_size
,
1130 ps
->ps_record_shift
, ps
->ps_pgnum
));
1132 return KERN_SUCCESS
;
1138 MACH_PORT_FACE master
)
1140 security_token_t null_security_token
= {
1143 MACH_PORT_FACE device
;
1144 int info
[DEV_GET_SIZE_COUNT
];
1145 mach_msg_type_number_t info_count
;
1146 MACH_PORT_FACE bs
= MACH_PORT_NULL
;
1147 unsigned int rec_size
;
1150 MACH_PORT_FACE reply_port
;
1152 if (ds_device_open_sync(master
, MACH_PORT_NULL
, D_READ
| D_WRITE
,
1153 null_security_token
, dev_name
, &device
))
1156 info_count
= DEV_GET_SIZE_COUNT
;
1157 if (!ds_device_get_status(device
, DEV_GET_SIZE
, info
, &info_count
)) {
1158 rec_size
= info
[DEV_GET_SIZE_RECORD_SIZE
];
1159 count
= info
[DEV_GET_SIZE_DEVICE_SIZE
] / rec_size
;
1160 clsize
= bs_get_global_clsize(0);
1161 if (!default_pager_backing_store_create(
1162 default_pager_object
,
1163 DEFAULT_PAGER_BACKING_STORE_MAXPRI
,
1164 (clsize
* vm_page_size
),
1166 if (!default_pager_add_segment(bs
, device
,
1167 0, count
, rec_size
)) {
1170 ipc_port_release_receive(bs
);
1174 ipc_port_release_send(device
);
1177 #endif /* DEVICE_PAGING */
1182 vs_alloc_async(void)
1184 struct vs_async
*vsa
;
1185 MACH_PORT_FACE reply_port
;
1186 // kern_return_t kr;
1189 if (vs_async_free_list
== NULL
) {
1191 vsa
= (struct vs_async
*) kalloc(sizeof (struct vs_async
));
1194 * Try allocating a reply port named after the
1195 * address of the vs_async structure.
1197 struct vstruct_alias
*alias_struct
;
1199 reply_port
= ipc_port_alloc_kernel();
1200 alias_struct
= (struct vstruct_alias
*)
1201 kalloc(sizeof (struct vstruct_alias
));
1202 if(alias_struct
!= NULL
) {
1203 alias_struct
->vs
= (struct vstruct
*)vsa
;
1204 alias_struct
->name
= &default_pager_ops
;
1205 reply_port
->alias
= (uintptr_t) alias_struct
;
1206 vsa
->reply_port
= reply_port
;
1207 vs_alloc_async_count
++;
1210 vs_alloc_async_failed
++;
1211 ipc_port_dealloc_kernel((MACH_PORT_FACE
)
1213 kfree(vsa
, sizeof (struct vs_async
));
1218 vsa
= vs_async_free_list
;
1219 vs_async_free_list
= vs_async_free_list
->vsa_next
;
1228 struct vs_async
*vsa
)
1231 vsa
->vsa_next
= vs_async_free_list
;
1232 vs_async_free_list
= vsa
;
1236 #else /* VS_ASYNC_REUSE */
1239 vs_alloc_async(void)
1241 struct vs_async
*vsa
;
1242 MACH_PORT_FACE reply_port
;
1245 vsa
= (struct vs_async
*) kalloc(sizeof (struct vs_async
));
1248 * Try allocating a reply port named after the
1249 * address of the vs_async structure.
1251 reply_port
= ipc_port_alloc_kernel();
1252 alias_struct
= (vstruct_alias
*)
1253 kalloc(sizeof (struct vstruct_alias
));
1254 if(alias_struct
!= NULL
) {
1255 alias_struct
->vs
= reply_port
;
1256 alias_struct
->name
= &default_pager_ops
;
1257 reply_port
->alias
= (int) vsa
;
1258 vsa
->reply_port
= reply_port
;
1259 vs_alloc_async_count
++;
1262 vs_alloc_async_failed
++;
1263 ipc_port_dealloc_kernel((MACH_PORT_FACE
)
1265 kfree(vsa
, sizeof (struct vs_async
));
1275 struct vs_async
*vsa
)
1277 MACH_PORT_FACE reply_port
;
1280 reply_port
= vsa
->reply_port
;
1281 kfree(reply_port
->alias
, sizeof (struct vstuct_alias
));
1282 kfree(vsa
, sizeof (struct vs_async
));
1283 ipc_port_dealloc_kernel((MACH_PORT_FACE
) (reply_port
));
1286 vs_alloc_async_count
--;
1291 #endif /* VS_ASYNC_REUSE */
1293 zone_t vstruct_zone
;
1302 vs
= (vstruct_t
) zalloc(vstruct_zone
);
1303 if (vs
== VSTRUCT_NULL
) {
1304 return VSTRUCT_NULL
;
1310 * The following fields will be provided later.
1312 vs
->vs_pager_ops
= NULL
;
1313 vs
->vs_control
= MEMORY_OBJECT_CONTROL_NULL
;
1314 vs
->vs_references
= 1;
1317 vs
->vs_waiting_seqno
= FALSE
;
1318 vs
->vs_waiting_read
= FALSE
;
1319 vs
->vs_waiting_write
= FALSE
;
1320 vs
->vs_waiting_async
= FALSE
;
1327 vs
->vs_clshift
= local_log2(bs_get_global_clsize(0));
1328 vs
->vs_size
= ((atop_32(round_page_32(size
)) - 1) >> vs
->vs_clshift
) + 1;
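	/*
	 * Editor's note (illustrative): vs_size is the object size in
	 * clusters, rounded up.  With vs_clshift = 2 (4 pages/cluster), a
	 * 5-page object gives ((5 - 1) >> 2) + 1 = 2 clusters, and an exact
	 * multiple such as 8 pages gives ((8 - 1) >> 2) + 1 = 2 as well.
	 */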
1329 vs
->vs_async_pending
= 0;
1332 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1333 * depending on the size of the memory object.
1335 if (INDIRECT_CLMAP(vs
->vs_size
)) {
1336 vs
->vs_imap
= (struct vs_map
**)
1337 kalloc(INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1338 vs
->vs_indirect
= TRUE
;
1340 vs
->vs_dmap
= (struct vs_map
*)
1341 kalloc(CLMAP_SIZE(vs
->vs_size
));
1342 vs
->vs_indirect
= FALSE
;
1344 vs
->vs_xfer_pending
= FALSE
;
1345 DP_DEBUG(DEBUG_VS_INTERNAL
,
1346 ("map=0x%x, indirect=%d\n", (int) vs
->vs_dmap
, vs
->vs_indirect
));
1349 * Check to see that we got the space.
1352 kfree(vs
, sizeof *vs
);
1353 return VSTRUCT_NULL
;
1357 * Zero the indirect pointers, or clear the direct pointers.
1359 if (vs
->vs_indirect
)
1360 memset(vs
->vs_imap
, 0,
1361 INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1363 for (i
= 0; i
< vs
->vs_size
; i
++)
1364 VSM_CLR(vs
->vs_dmap
[i
]);
1366 VS_MAP_LOCK_INIT(vs
);
1368 bs_commit(vs
->vs_size
);
1373 paging_segment_t
ps_select_segment(unsigned int, int *); /* forward */
1380 paging_segment_t ps
;
1385 * Optimize case where there's only one segment.
1386 * paging_segment_max will index the one and only segment.
1390 if (paging_segment_count
== 1) {
1391 paging_segment_t lps
= PAGING_SEGMENT_NULL
; /* used to avoid extra PS_UNLOCK */
1392 ipc_port_t trigger
= IP_NULL
;
1394 ps
= paging_segments
[paging_segment_max
];
1395 *psindex
= paging_segment_max
;
1397 if( !IS_PS_EMERGENCY_SEGMENT(ps
) ) {
1398 panic("Emergency paging segment missing\n");
1400 ASSERT(ps
->ps_clshift
>= shift
);
1401 if(IS_PS_OK_TO_USE(ps
)) {
1402 if (ps
->ps_clcount
) {
1404 dp_pages_free
-= 1 << ps
->ps_clshift
;
1405 ps
->ps_pgcount
-= 1 << ps
->ps_clshift
;
1406 if(min_pages_trigger_port
&&
1407 (dp_pages_free
< minimum_pages_remaining
)) {
1408 trigger
= min_pages_trigger_port
;
1409 min_pages_trigger_port
= NULL
;
1417 if( lps
== PAGING_SEGMENT_NULL
) {
1419 dp_pages_free_drift_count
++;
1420 if(dp_pages_free
> dp_pages_free_drifted_max
) {
1421 dp_pages_free_drifted_max
= dp_pages_free
;
1423 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free
));
1430 if (trigger
!= IP_NULL
) {
1431 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1432 ipc_port_release_send(trigger
);
1437 if (paging_segment_count
== 0) {
1439 dp_pages_free_drift_count
++;
1440 if(dp_pages_free
> dp_pages_free_drifted_max
) {
1441 dp_pages_free_drifted_max
= dp_pages_free
;
1443 dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free
));
1447 return PAGING_SEGMENT_NULL
;
1451 i
>= BS_MINPRI
; i
--) {
1454 if ((ps_select_array
[i
] == BS_NOPRI
) ||
1455 (ps_select_array
[i
] == BS_FULLPRI
))
1457 start_index
= ps_select_array
[i
];
1459 if(!(paging_segments
[start_index
])) {
1461 physical_transfer_cluster_count
= 0;
1463 else if ((physical_transfer_cluster_count
+1) == (ALLOC_STRIDE
>>
1464 (((paging_segments
[start_index
])->ps_clshift
)
1465 + vm_page_shift
))) {
1466 physical_transfer_cluster_count
= 0;
1467 j
= start_index
+ 1;
1469 physical_transfer_cluster_count
+=1;
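			/*
			 * Editor's note (illustrative): ALLOC_STRIDE is in bytes,
			 * so the rotation point above works out to
			 * ALLOC_STRIDE >> (ps_clshift + vm_page_shift) clusters
			 * per segment.  With 4 KB pages (vm_page_shift = 12),
			 * 4-page clusters (ps_clshift = 2) and the 1 GB default
			 * stride, that is 2^30 >> 14 = 65536 clusters before
			 * moving to the next swap file.
			 */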
1471 if(start_index
== 0)
1472 start_index
= paging_segment_max
;
1474 start_index
= start_index
- 1;
1478 if (j
> paging_segment_max
)
1480 if ((ps
= paging_segments
[j
]) &&
1481 (ps
->ps_bs
->bs_priority
== i
)) {
1483 * Force the ps cluster size to be
1484 * >= that of the vstruct.
1487 if (IS_PS_OK_TO_USE(ps
)) {
1488 if ((ps
->ps_clcount
) &&
1489 (ps
->ps_clshift
>= shift
)) {
1490 ipc_port_t trigger
= IP_NULL
;
1493 dp_pages_free
-= 1 << ps
->ps_clshift
;
1494 ps
->ps_pgcount
-= 1 << ps
->ps_clshift
;
1495 if(min_pages_trigger_port
&&
1497 minimum_pages_remaining
)) {
1498 trigger
= min_pages_trigger_port
;
1499 min_pages_trigger_port
= NULL
;
1503 * found one, quit looking.
1505 ps_select_array
[i
] = j
;
1508 if (trigger
!= IP_NULL
) {
1509 default_pager_space_alert(
1512 ipc_port_release_send(trigger
);
1520 if (j
== start_index
) {
1522 * none at this priority -- mark it full
1524 ps_select_array
[i
] = BS_FULLPRI
;
1532 dp_pages_free_drift_count
++;
1533 if(dp_pages_free
> dp_pages_free_drifted_max
) {
1534 dp_pages_free_drifted_max
= dp_pages_free
;
1536 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count
,dp_pages_free
));
1540 return PAGING_SEGMENT_NULL
;
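/*
 * Editor's note (illustrative): ps_select_array[] holds, for each backing
 * store priority, the index of the segment to try next (BS_NOPRI when the
 * priority is unused, BS_FULLPRI when every segment at that priority is
 * full).  ps_select_segment() above walks priorities from BS_MAXPRI down to
 * BS_MINPRI and round-robins within a priority, which is what spreads
 * clusters across multiple swap files.
 */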
1543 dp_offset_t
ps_allocate_cluster(vstruct_t
, int *, paging_segment_t
); /*forward*/
1546 ps_allocate_cluster(
1549 paging_segment_t use_ps
)
1551 unsigned int byte_num
;
1553 paging_segment_t ps
;
1554 dp_offset_t cluster
;
1555 ipc_port_t trigger
= IP_NULL
;
1558 * Find best paging segment.
1559 * ps_select_segment will decrement cluster count on ps.
1560 * Must pass cluster shift to find the most appropriate segment.
1562 /* NOTE: The addition of paging segment delete capability threatened
1563 * to seriously complicate the treatment of paging segments in this
1564 * module and the ones that call it (notably ps_clmap), because of the
1565 * difficulty in assuring that the paging segment would continue to
1566 * exist between being unlocked and locked. This was
1567 * avoided because all calls to this module are based in either
1568 * dp_memory_object calls which rely on the vs lock, or by
1569 * the transfer function which is part of the segment delete path.
1570 * The transfer function which is part of paging segment delete is
1571 * protected from multiple callers by the backing store lock.
1572 * The paging segment delete function treats mappings to a paging
1573 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1574 * while data is transferred to the remaining segments. This is in
1575 * line with the view that incomplete or in-transition mappings between
1576 * data, a vstruct, and backing store are protected by the vs lock.
1577 * This and the ordering of the paging segment "going_away" bit setting
1581 if (use_ps
!= PAGING_SEGMENT_NULL
) {
1586 ASSERT(ps
->ps_clcount
!= 0);
1589 dp_pages_free
-= 1 << ps
->ps_clshift
;
1590 ps
->ps_pgcount
-= 1 << ps
->ps_clshift
;
1591 if(min_pages_trigger_port
&&
1592 (dp_pages_free
< minimum_pages_remaining
)) {
1593 trigger
= min_pages_trigger_port
;
1594 min_pages_trigger_port
= NULL
;
1598 if (trigger
!= IP_NULL
) {
1599 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1600 ipc_port_release_send(trigger
);
1603 } else if ((ps
= ps_select_segment(vs
->vs_clshift
, psindex
)) ==
1604 PAGING_SEGMENT_NULL
) {
1605 static clock_sec_t lastnotify
= 0;
1607 clock_nsec_t nanoseconds_dummy
;
		/*
		 * Don't immediately jump to the emergency segment. Give the
		 * dynamic pager a chance to create its first normal swap file.
		 * Unless, of course, the very first normal swap file can't be
		 * created due to some problem and we didn't expect that problem
		 * i.e. use_emergency_swap_file_first was never set to true initially.
		 * It then gets set in the swap file creation error handling.
		 */
1617 if(paging_segment_count
> 1 || use_emergency_swap_file_first
== TRUE
) {
1619 ps
= paging_segments
[EMERGENCY_PSEG_INDEX
];
1620 if(IS_PS_EMERGENCY_SEGMENT(ps
) && !IS_PS_GOING_AWAY(ps
)) {
1624 if(IS_PS_GOING_AWAY(ps
)) {
1625 /* Someone de-activated the emergency paging segment*/
1629 } else if(dp_pages_free
) {
1631 * Someone has already activated the emergency paging segment
1633 * Between us having rec'd a NULL segment from ps_select_segment
1634 * and reaching here a new normal segment could have been added.
1635 * E.g. we get NULL segment and another thread just added the
1636 * new swap file. Hence check to see if we have more dp_pages_free
1637 * before activating the emergency segment.
1643 } else if(!IS_PS_OK_TO_USE(ps
) && ps
->ps_clcount
) {
1645 * PS_CAN_USE is only reset from the emergency segment when it's
1646 * been successfully recovered. So it's legal to have an emergency
1647 * segment that has PS_CAN_USE but no clusters because it's recovery
1650 backing_store_t bs
= ps
->ps_bs
;
1651 ps
->ps_state
|= PS_CAN_USE
;
1652 if(ps_select_array
[bs
->bs_priority
] == BS_FULLPRI
||
1653 ps_select_array
[bs
->bs_priority
] == BS_NOPRI
) {
1654 ps_select_array
[bs
->bs_priority
] = 0;
1656 dp_pages_free
+= ps
->ps_pgcount
;
1657 dp_pages_reserve
-= ps
->ps_pgcount
;
1660 dprintf(("Switching ON Emergency paging segment\n"));
1670 * Emit a notification of the low-paging resource condition
1671 * but don't issue it more than once every five seconds. This
1672 * prevents us from overflowing logs with thousands of
1673 * repetitions of the message.
1675 clock_get_system_nanotime(&now
, &nanoseconds_dummy
);
1676 if (paging_segment_count
> 1 && (now
> lastnotify
+ 5)) {
1677 /* With an activated emergency paging segment we still
1678 * didn't get any clusters. This could mean that the
1679 * emergency paging segment is exhausted.
1681 dprintf(("System is out of paging space.\n"));
1687 if(min_pages_trigger_port
) {
1688 trigger
= min_pages_trigger_port
;
1689 min_pages_trigger_port
= NULL
;
1693 if (trigger
!= IP_NULL
) {
1694 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1695 ipc_port_release_send(trigger
);
1697 return (dp_offset_t
) -1;
1701 * Look for an available cluster. At the end of the loop,
1702 * byte_num is the byte offset and bit_num is the bit offset of the
1703 * first zero bit in the paging segment bitmap.
1706 byte_num
= ps
->ps_hint
;
1707 for (; byte_num
< howmany(ps
->ps_ncls
, NBBY
); byte_num
++) {
1708 if (*(ps
->ps_bmap
+ byte_num
) != BYTEMASK
) {
1709 for (bit_num
= 0; bit_num
< NBBY
; bit_num
++) {
1710 if (isclr((ps
->ps_bmap
+ byte_num
), bit_num
))
1713 ASSERT(bit_num
!= NBBY
);
1717 ps
->ps_hint
= byte_num
;
1718 cluster
= (byte_num
*NBBY
) + bit_num
;
1720 /* Space was reserved, so this must be true */
1721 ASSERT(cluster
< ps
->ps_ncls
);
1723 setbit(ps
->ps_bmap
, cluster
);
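	/*
	 * Editor's sketch (illustrative, not original source): the scan above
	 * skips fully-set bytes of the allocation bitmap and then probes the
	 * bits of the first partial byte.  A hypothetical stand-alone version,
	 * assuming NBBY bits per byte:
	 */
#if 0	/* example only -- not compiled */
static int
example_find_free_cluster(const unsigned char *bmap, unsigned int ncls)
{
	unsigned int byte_num, bit_num;

	for (byte_num = 0; byte_num < (ncls + NBBY - 1) / NBBY; byte_num++) {
		if (bmap[byte_num] != 0xff) {	/* at least one clear bit */
			for (bit_num = 0; bit_num < NBBY; bit_num++) {
				if ((bmap[byte_num] & (1 << bit_num)) == 0)
					return (int)(byte_num * NBBY + bit_num);
			}
		}
	}
	return -1;	/* no free cluster */
}
#endif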
1729 void ps_deallocate_cluster(paging_segment_t
, dp_offset_t
); /* forward */
1732 ps_deallocate_cluster(
1733 paging_segment_t ps
,
1734 dp_offset_t cluster
)
1737 if (cluster
>= ps
->ps_ncls
)
1738 panic("ps_deallocate_cluster: Invalid cluster number");
1741 * Lock the paging segment, clear the cluster's bitmap and increment the
1742 * number of free cluster.
1746 clrbit(ps
->ps_bmap
, cluster
);
1747 if( IS_PS_OK_TO_USE(ps
)) {
1749 ps
->ps_pgcount
+= 1 << ps
->ps_clshift
;
1750 dp_pages_free
+= 1 << ps
->ps_clshift
;
1752 ps
->ps_special_clusters
+= 1;
1756 * Move the hint down to the freed cluster if it is
1757 * less than the current hint.
1759 if ((cluster
/NBBY
) < ps
->ps_hint
) {
1760 ps
->ps_hint
= (cluster
/NBBY
);
1765 * If we're freeing space on a full priority, reset the array.
1767 if ( IS_PS_OK_TO_USE(ps
) && ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
)
1768 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
1775 void ps_dealloc_vsmap(struct vs_map
*, dp_size_t
); /* forward */
1779 struct vs_map
*vsmap
,
1783 for (i
= 0; i
< size
; i
++)
1784 if (!VSM_ISCLR(vsmap
[i
]) && !VSM_ISERR(vsmap
[i
]))
1785 ps_deallocate_cluster(VSM_PS(vsmap
[i
]),
1786 VSM_CLOFF(vsmap
[i
]));
1799 * If this is an indirect structure, then we walk through the valid
1800 * (non-zero) indirect pointers and deallocate the clusters
1801 * associated with each used map entry (via ps_dealloc_vsmap).
1802 * When all of the clusters in an indirect block have been
1803 * freed, we deallocate the block. When all of the indirect
1804 * blocks have been deallocated we deallocate the memory
1805 * holding the indirect pointers.
1807 if (vs
->vs_indirect
) {
1808 for (i
= 0; i
< INDIRECT_CLMAP_ENTRIES(vs
->vs_size
); i
++) {
1809 if (vs
->vs_imap
[i
] != NULL
) {
1810 ps_dealloc_vsmap(vs
->vs_imap
[i
], CLMAP_ENTRIES
);
1811 kfree(vs
->vs_imap
[i
], CLMAP_THRESHOLD
);
1814 kfree(vs
->vs_imap
, INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1817 * Direct map. Free used clusters, then memory.
1819 ps_dealloc_vsmap(vs
->vs_dmap
, vs
->vs_size
);
1820 kfree(vs
->vs_dmap
, CLMAP_SIZE(vs
->vs_size
));
1824 bs_commit(- vs
->vs_size
);
1826 zfree(vstruct_zone
, vs
);
1829 int ps_map_extend(vstruct_t
, unsigned int); /* forward */
1833 unsigned int new_size
)
1835 struct vs_map
**new_imap
;
1836 struct vs_map
*new_dmap
= NULL
;
1839 void *old_map
= NULL
;
1840 int old_map_size
= 0;
1842 if (vs
->vs_size
>= new_size
) {
1844 * Someone has already done the work.
1850 * If the new size extends into the indirect range, then we have one
1851 * of two cases: we are going from indirect to indirect, or we are
1852 * going from direct to indirect. If we are going from indirect to
1853 * indirect, then it is possible that the new size will fit in the old
1854 * indirect map. If this is the case, then just reset the size of the
1855 * vstruct map and we are done. If the new size will not
1856 * fit into the old indirect map, then we have to allocate a new
1857 * indirect map and copy the old map pointers into this new map.
1859 * If we are going from direct to indirect, then we have to allocate a
1860 * new indirect map and copy the old direct pages into the first
1861 * indirect page of the new map.
1862 * NOTE: allocating memory here is dangerous, as we're in the
1865 if (INDIRECT_CLMAP(new_size
)) {
1866 int new_map_size
= INDIRECT_CLMAP_SIZE(new_size
);
1869 * Get a new indirect map and zero it.
1871 old_map_size
= INDIRECT_CLMAP_SIZE(vs
->vs_size
);
1872 if (vs
->vs_indirect
&&
1873 (new_map_size
== old_map_size
)) {
1874 bs_commit(new_size
- vs
->vs_size
);
1875 vs
->vs_size
= new_size
;
1879 new_imap
= (struct vs_map
**)kalloc(new_map_size
);
1880 if (new_imap
== NULL
) {
1883 memset(new_imap
, 0, new_map_size
);
1885 if (vs
->vs_indirect
) {
1886 /* Copy old entries into new map */
1887 memcpy(new_imap
, vs
->vs_imap
, old_map_size
);
1888 /* Arrange to free the old map */
1889 old_map
= (void *) vs
->vs_imap
;
1891 } else { /* Old map was a direct map */
1892 /* Allocate an indirect page */
1893 if ((new_imap
[0] = (struct vs_map
*)
1894 kalloc(CLMAP_THRESHOLD
)) == NULL
) {
1895 kfree(new_imap
, new_map_size
);
1898 new_dmap
= new_imap
[0];
1899 newdsize
= CLMAP_ENTRIES
;
1903 newdsize
= new_size
;
1905 * If the new map is a direct map, then the old map must
1906 * also have been a direct map. All we have to do is
1907 * to allocate a new direct map, copy the old entries
1908 * into it and free the old map.
1910 if ((new_dmap
= (struct vs_map
*)
1911 kalloc(CLMAP_SIZE(new_size
))) == NULL
) {
1917 /* Free the old map */
1918 old_map
= (void *) vs
->vs_dmap
;
1919 old_map_size
= CLMAP_SIZE(vs
->vs_size
);
1921 /* Copy info from the old map into the new map */
1922 memcpy(new_dmap
, vs
->vs_dmap
, old_map_size
);
1924 /* Initialize the rest of the new map */
1925 for (i
= vs
->vs_size
; i
< newdsize
; i
++)
1926 VSM_CLR(new_dmap
[i
]);
1929 vs
->vs_imap
= new_imap
;
1930 vs
->vs_indirect
= TRUE
;
1932 vs
->vs_dmap
= new_dmap
;
1933 bs_commit(new_size
- vs
->vs_size
);
1934 vs
->vs_size
= new_size
;
1936 kfree(old_map
, old_map_size
);
1944 struct clmap
*clmap
,
1949 dp_offset_t cluster
; /* The cluster of offset. */
1950 dp_offset_t newcl
; /* The new cluster allocated. */
1953 struct vs_map
*vsmap
;
1957 ASSERT(vs
->vs_dmap
);
1958 cluster
= atop_32(offset
) >> vs
->vs_clshift
;
1961 * Initialize cluster error value
1963 clmap
->cl_error
= 0;
1966 * If the object has grown, extend the page map.
1968 if (cluster
>= vs
->vs_size
) {
1969 if (flag
== CL_FIND
) {
1970 /* Do not allocate if just doing a lookup */
1972 return (dp_offset_t
) -1;
1974 if (ps_map_extend(vs
, cluster
+ 1)) {
1976 return (dp_offset_t
) -1;
1981 * Look for the desired cluster. If the map is indirect, then we
1982 * have a two level lookup. First find the indirect block, then
1983 * find the actual cluster. If the indirect block has not yet
1984 * been allocated, then do so. If the cluster has not yet been
1985 * allocated, then do so.
1987 * If any of the allocations fail, then return an error.
1988 * Don't allocate if just doing a lookup.
1990 if (vs
->vs_indirect
) {
1991 long ind_block
= cluster
/CLMAP_ENTRIES
;
1993 /* Is the indirect block allocated? */
1994 vsmap
= vs
->vs_imap
[ind_block
];
1995 if (vsmap
== NULL
) {
1996 if (flag
== CL_FIND
) {
1998 return (dp_offset_t
) -1;
2001 /* Allocate the indirect block */
2002 vsmap
= (struct vs_map
*) kalloc(CLMAP_THRESHOLD
);
2003 if (vsmap
== NULL
) {
2005 return (dp_offset_t
) -1;
2007 /* Initialize the cluster offsets */
2008 for (i
= 0; i
< CLMAP_ENTRIES
; i
++)
2010 vs
->vs_imap
[ind_block
] = vsmap
;
2013 vsmap
= vs
->vs_dmap
;
2016 vsmap
+= cluster%CLMAP_ENTRIES
;
2019 * At this point, vsmap points to the struct vs_map desired.
2021 * Look in the map for the cluster, if there was an error on a
2022 * previous write, flag it and return. If it is not yet
2023 * allocated, then allocate it, if we're writing; if we're
2024 * doing a lookup and the cluster's not allocated, return error.
2026 if (VSM_ISERR(*vsmap
)) {
2027 clmap
->cl_error
= VSM_GETERR(*vsmap
);
2029 return (dp_offset_t
) -1;
2030 } else if (VSM_ISCLR(*vsmap
)) {
2033 if (flag
== CL_FIND
) {
2035 * If there's an error and the entry is clear, then
2036 * we've run out of swap space. Record the error
2040 VSM_SETERR(*vsmap
, error
);
2043 return (dp_offset_t
) -1;
2046 * Attempt to allocate a cluster from the paging segment
2048 newcl
= ps_allocate_cluster(vs
, &psindex
,
2049 PAGING_SEGMENT_NULL
);
2050 if (newcl
== (dp_offset_t
) -1) {
2052 return (dp_offset_t
) -1;
2055 VSM_SETCLOFF(*vsmap
, newcl
);
2056 VSM_SETPS(*vsmap
, psindex
);
2059 newcl
= VSM_CLOFF(*vsmap
);
2062 * Fill in pertinent fields of the clmap
2064 clmap
->cl_ps
= VSM_PS(*vsmap
);
2065 clmap
->cl_numpages
= VSCLSIZE(vs
);
2066 clmap
->cl_bmap
.clb_map
= (unsigned int) VSM_BMAP(*vsmap
);
2069 * Byte offset in paging segment is byte offset to cluster plus
2070 * byte offset within cluster. It looks ugly, but should be
2073 ASSERT(trunc_page(offset
) == offset
);
2074 newcl
= ptoa_32(newcl
) << vs
->vs_clshift
;
2075 newoff
= offset
& ((1<<(vm_page_shift
+ vs
->vs_clshift
)) - 1);
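	/*
	 * Worked example (editor's note, illustrative): with vm_page_shift = 12
	 * (4 KB pages) and vs_clshift = 2 (4 pages/cluster), a cluster covers
	 * 1 << (12 + 2) = 16 KB.  For segment cluster newcl = 3 and offset
	 * 0x5000, the two lines above yield ptoa_32(3) << 2 = 0xC000 as the
	 * cluster's byte offset in the paging segment and
	 * newoff = 0x5000 & 0x3FFF = 0x1000 as the byte offset within the
	 * cluster.
	 */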
2076 if (flag
== CL_ALLOC
) {
2078 * set bits in the allocation bitmap according to which
2079 * pages were requested. size is in bytes.
2081 i
= atop_32(newoff
);
2082 while ((size
> 0) && (i
< VSCLSIZE(vs
))) {
2083 VSM_SETALLOC(*vsmap
, i
);
2085 size
-= vm_page_size
;
2088 clmap
->cl_alloc
.clb_map
= (unsigned int) VSM_ALLOC(*vsmap
);
2091 * Offset is not cluster aligned, so number of pages
2092 * and bitmaps must be adjusted
2094 clmap
->cl_numpages
-= atop_32(newoff
);
2095 CLMAP_SHIFT(clmap
, vs
);
2096 CLMAP_SHIFTALLOC(clmap
, vs
);
2101 * The setting of valid bits and handling of write errors
2102 * must be done here, while we hold the lock on the map.
2103 * It logically should be done in ps_vs_write_complete().
2104 * The size and error information has been passed from
2105 * ps_vs_write_complete(). If the size parameter is non-zero,
2106 * then there is work to be done. If error is also non-zero,
2107 * then the error number is recorded in the cluster and the
2108 * entire cluster is in error.
2110 if (size
&& flag
== CL_FIND
) {
2111 dp_offset_t off
= (dp_offset_t
) 0;
2114 for (i
= VSCLSIZE(vs
) - clmap
->cl_numpages
; size
> 0;
2116 VSM_SETPG(*vsmap
, i
);
2117 size
-= vm_page_size
;
2119 ASSERT(i
<= VSCLSIZE(vs
));
2121 BS_STAT(clmap
->cl_ps
->ps_bs
,
2122 clmap
->cl_ps
->ps_bs
->bs_pages_out_fail
+=
2124 off
= VSM_CLOFF(*vsmap
);
2125 VSM_SETERR(*vsmap
, error
);
2128 * Deallocate cluster if error, and no valid pages
2131 if (off
!= (dp_offset_t
) 0)
2132 ps_deallocate_cluster(clmap
->cl_ps
, off
);
2134 return (dp_offset_t
) 0;
2138 DP_DEBUG(DEBUG_VS_INTERNAL
,
2139 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2140 newcl
+newoff
, (int) vs
, (int) vsmap
, flag
));
2141 DP_DEBUG(DEBUG_VS_INTERNAL
,
2142 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2143 (int) clmap
->cl_ps
, clmap
->cl_numpages
,
2144 (int) clmap
->cl_bmap
.clb_map
, (int) clmap
->cl_alloc
.clb_map
));
2146 return (newcl
+ newoff
);
2149 void ps_clunmap(vstruct_t
, dp_offset_t
, dp_size_t
); /* forward */
2157 dp_offset_t cluster
; /* The cluster number of offset */
2158 struct vs_map
*vsmap
;
2163 * Loop through all clusters in this range, freeing paging segment
2164 * clusters and map entries as encountered.
2166 while (length
> 0) {
2170 cluster
= atop_32(offset
) >> vs
->vs_clshift
;
2171 if (vs
->vs_indirect
) /* indirect map */
2172 vsmap
= vs
->vs_imap
[cluster
/CLMAP_ENTRIES
];
2174 vsmap
= vs
->vs_dmap
;
2175 if (vsmap
== NULL
) {
2179 vsmap
+= cluster%CLMAP_ENTRIES
;
2180 if (VSM_ISCLR(*vsmap
)) {
2181 length
-= vm_page_size
;
2182 offset
+= vm_page_size
;
		/*
		 * We've got a valid mapping.  Clear it and deallocate
		 * paging segment cluster pages.
		 * Optimize for entire cluster clearing.
		 */
2190 if ( (newoff
= (offset
&((1<<(vm_page_shift
+vs
->vs_clshift
))-1))) ) {
2192 * Not cluster aligned.
2194 ASSERT(trunc_page(newoff
) == newoff
);
2195 i
= atop_32(newoff
);
2198 while ((i
< VSCLSIZE(vs
)) && (length
> 0)) {
2199 VSM_CLRPG(*vsmap
, i
);
2200 VSM_CLRALLOC(*vsmap
, i
);
2201 length
-= vm_page_size
;
2202 offset
+= vm_page_size
;
2207 * If map entry is empty, clear and deallocate cluster.
2209 if (!VSM_ALLOC(*vsmap
)) {
2210 ps_deallocate_cluster(VSM_PS(*vsmap
),
2219 void ps_vs_write_complete(vstruct_t
, dp_offset_t
, dp_size_t
, int); /* forward */
2222 ps_vs_write_complete(
2231 * Get the struct vsmap for this cluster.
2232 * Use READ, even though it was written, because the
2233 * cluster MUST be present, unless there was an error
2234 * in the original ps_clmap (e.g. no space), in which
2235 * case, nothing happens.
2237 * Must pass enough information to ps_clmap to allow it
2238 * to set the vs_map structure bitmap under lock.
2240 (void) ps_clmap(vs
, offset
, &clmap
, CL_FIND
, size
, error
);
2243 void vs_cl_write_complete(vstruct_t
, paging_segment_t
, dp_offset_t
, vm_offset_t
, dp_size_t
, boolean_t
, int); /* forward */
2246 vs_cl_write_complete(
2248 __unused paging_segment_t ps
,
2250 __unused vm_offset_t addr
,
2255 // kern_return_t kr;
2259 * For internal objects, the error is recorded on a
2260 * per-cluster basis by ps_clmap() which is called
2261 * by ps_vs_write_complete() below.
2263 dprintf(("write failed error = 0x%x\n", error
));
2264 /* add upl_abort code here */
2266 GSTAT(global_stats
.gs_pages_out
+= atop_32(size
));
2268 * Notify the vstruct mapping code, so it can do its accounting.
2270 ps_vs_write_complete(vs
, offset
, size
, error
);
2274 ASSERT(vs
->vs_async_pending
> 0);
2275 vs
->vs_async_pending
-= size
;
2276 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
2277 vs
->vs_waiting_async
= FALSE
;
2279 thread_wakeup(&vs
->vs_async_pending
);
2286 #ifdef DEVICE_PAGING
2287 kern_return_t
device_write_reply(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2291 MACH_PORT_FACE reply_port
,
2292 kern_return_t device_code
,
2293 io_buf_len_t bytes_written
)
2295 struct vs_async
*vsa
;
2297 vsa
= (struct vs_async
*)
2298 ((struct vstruct_alias
*)(reply_port
->alias
))->vs
;
2300 if (device_code
== KERN_SUCCESS
&& bytes_written
!= vsa
->vsa_size
) {
2301 device_code
= KERN_FAILURE
;
2304 vsa
->vsa_error
= device_code
;
2307 ASSERT(vsa
->vsa_vs
!= VSTRUCT_NULL
);
2308 if(vsa
->vsa_flags
& VSA_TRANSFER
) {
2309 /* revisit when async disk segments redone */
2310 if(vsa
->vsa_error
) {
2311 /* need to consider error condition. re-write data or */
2312 /* throw it away here. */
2313 vm_map_copy_discard((vm_map_copy_t
)vsa
->vsa_addr
);
2315 ps_vs_write_complete(vsa
->vsa_vs
, vsa
->vsa_offset
,
2316 vsa
->vsa_size
, vsa
->vsa_error
);
2318 vs_cl_write_complete(vsa
->vsa_vs
, vsa
->vsa_ps
, vsa
->vsa_offset
,
2319 vsa
->vsa_addr
, vsa
->vsa_size
, TRUE
,
2324 return KERN_SUCCESS
;
2327 kern_return_t
device_write_reply_inband(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2329 device_write_reply_inband(
2330 MACH_PORT_FACE reply_port
,
2331 kern_return_t return_code
,
2332 io_buf_len_t bytes_written
)
2334 panic("device_write_reply_inband: illegal");
2335 return KERN_SUCCESS
;
2338 kern_return_t
device_read_reply(MACH_PORT_FACE
, kern_return_t
, io_buf_ptr_t
, mach_msg_type_number_t
);
2341 MACH_PORT_FACE reply_port
,
2342 kern_return_t return_code
,
2344 mach_msg_type_number_t dataCnt
)
2346 struct vs_async
*vsa
;
2347 vsa
= (struct vs_async
*)
2348 ((struct vstruct_alias
*)(reply_port
->alias
))->vs
;
2349 vsa
->vsa_addr
= (vm_offset_t
)data
;
2350 vsa
->vsa_size
= (vm_size_t
)dataCnt
;
2351 vsa
->vsa_error
= return_code
;
2352 thread_wakeup(&vsa
);
2353 return KERN_SUCCESS
;
2356 kern_return_t
device_read_reply_inband(MACH_PORT_FACE
, kern_return_t
, io_buf_ptr_inband_t
, mach_msg_type_number_t
);
2358 device_read_reply_inband(
2359 MACH_PORT_FACE reply_port
,
2360 kern_return_t return_code
,
2361 io_buf_ptr_inband_t data
,
2362 mach_msg_type_number_t dataCnt
)
2364 panic("device_read_reply_inband: illegal");
2365 return KERN_SUCCESS
;
2368 kern_return_t
device_read_reply_overwrite(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2370 device_read_reply_overwrite(
2371 MACH_PORT_FACE reply_port
,
2372 kern_return_t return_code
,
2373 io_buf_len_t bytes_read
)
2375 panic("device_read_reply_overwrite: illegal\n");
2376 return KERN_SUCCESS
;
2379 kern_return_t
device_open_reply(MACH_PORT_FACE
, kern_return_t
, MACH_PORT_FACE
);
2382 MACH_PORT_FACE reply_port
,
2383 kern_return_t return_code
,
2384 MACH_PORT_FACE device_port
)
2386 panic("device_open_reply: illegal\n");
2387 return KERN_SUCCESS
;
kern_return_t
ps_read_device(
    paging_segment_t ps,
    dp_offset_t offset,
    vm_offset_t *bufferp,
    unsigned int size,
    unsigned int *residualp,
    int flags)
{
    kern_return_t kr;
    recnum_t dev_offset;
    unsigned int bytes_wanted;
    unsigned int bytes_read;
    unsigned int total_read;
    vm_offset_t dev_buffer;
    vm_offset_t buf_ptr;
    unsigned int records_read;
    struct vs_async *vsa;
    MACH_PORT_FACE device;

    vm_map_copy_t device_data = NULL;
    default_pager_thread_t *dpt = NULL;

    device = dev_port_lookup(ps->ps_device);
    clustered_reads[atop_32(size)]++;

    dev_offset = (ps->ps_offset +
                  (offset >> (vm_page_shift - ps->ps_record_shift)));
    bytes_wanted = size;
    total_read = 0;
    *bufferp = (vm_offset_t)NULL;

    do {
        vsa = VS_ALLOC_ASYNC();
        vsa->vsa_offset = 0;

        ip_lock(vsa->reply_port);
        vsa->reply_port->ip_sorights++;
        ip_reference(vsa->reply_port);
        ip_unlock(vsa->reply_port);
        kr = ds_device_read_common(device,
                                   vsa->reply_port,
                                   (mach_msg_type_name_t)
                                       MACH_MSG_TYPE_MOVE_SEND_ONCE,
                                   (dev_mode_t) 0,
                                   dev_offset,
                                   bytes_wanted,
                                   (IO_READ | IO_CALL),
                                   (io_buf_ptr_t *) &dev_buffer,
                                   (mach_msg_type_number_t *) &bytes_read);
        if (kr == MIG_NO_REPLY) {
            assert_wait(&vsa, THREAD_UNINT);
            thread_block(THREAD_CONTINUE_NULL);

            dev_buffer = vsa->vsa_addr;
            bytes_read = (unsigned int)vsa->vsa_size;
            kr = vsa->vsa_error;
        }
        if (kr != KERN_SUCCESS || bytes_read == 0) {
            break;
        }
        total_read += bytes_read;

        /*
         * If we got the entire range, use the returned dev_buffer.
         */
        if (bytes_read == size) {
            *bufferp = (vm_offset_t)dev_buffer;
            break;
        }
        dprintf(("read only %d bytes out of %d\n",
                 bytes_read, bytes_wanted));
        if (dpt == NULL) {
            dpt = get_read_buffer();
            buf_ptr = dpt->dpt_buffer;
            *bufferp = (vm_offset_t)buf_ptr;
        }
        /*
         * Otherwise, copy the data into the provided buffer (*bufferp)
         * and append the rest of the range as it comes in.
         */
        memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
        buf_ptr += bytes_read;
        bytes_wanted -= bytes_read;
        records_read = (bytes_read >>
                        (vm_page_shift - ps->ps_record_shift));
        dev_offset += records_read;
        DP_DEBUG(DEBUG_VS_INTERNAL,
                 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
                  dev_buffer, bytes_read));
        if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
            != KERN_SUCCESS)
            Panic("dealloc buf");
    } while (bytes_wanted);

    *residualp = size - total_read;
    if ((dev_buffer != *bufferp) && (total_read != 0)) {
        vm_offset_t temp_buffer;
        vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
        memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
        if (vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
                                    VM_MAP_COPYIN_OPT_SRC_DESTROY |
                                    VM_MAP_COPYIN_OPT_STEAL_PAGES |
                                    VM_MAP_COPYIN_OPT_PMAP_ENTER,
                                    (vm_map_copy_t *)&device_data, FALSE))
            panic("ps_read_device: cannot copyin locally provided buffer\n");
    }
    else if ((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)) {
        if (vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
                                    VM_MAP_COPYIN_OPT_SRC_DESTROY |
                                    VM_MAP_COPYIN_OPT_STEAL_PAGES |
                                    VM_MAP_COPYIN_OPT_PMAP_ENTER,
                                    (vm_map_copy_t *)&device_data, FALSE))
            panic("ps_read_device: cannot copyin backing store provided buffer\n");
    }

    *bufferp = (vm_offset_t)device_data;

    if (dpt != NULL) {
        /* Free the receive buffer */
        dpt->checked_out = 0;
        thread_wakeup(&dpt_array);
    }
    return KERN_SUCCESS;
}
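
/*
 * ps_write_device: write a range to a device-backed paging segment.
 * If a vs_async structure is supplied the write is issued
 * asynchronously and completion is reported through
 * device_write_reply(); otherwise the write is performed
 * synchronously, looping until the whole range has been written.
 */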
kern_return_t
ps_write_device(
    paging_segment_t ps,
    dp_offset_t offset,
    vm_offset_t addr,
    unsigned int size,
    struct vs_async *vsa)
{
    recnum_t dev_offset;
    io_buf_len_t bytes_to_write, bytes_written;
    recnum_t records_written;
    kern_return_t kr;
    MACH_PORT_FACE reply_port;
    MACH_PORT_FACE device;

    clustered_writes[atop_32(size)]++;

    dev_offset = (ps->ps_offset +
                  (offset >> (vm_page_shift - ps->ps_record_shift)));
    bytes_to_write = size;

    if (vsa) {
        /*
         * Asynchronous write.
         */
        reply_port = vsa->reply_port;
        ip_lock(reply_port);
        reply_port->ip_sorights++;
        ip_reference(reply_port);
        ip_unlock(reply_port);

        device = dev_port_lookup(ps->ps_device);

        vsa->vsa_addr = addr;
        kr = ds_device_write_common(device,
                                    reply_port,
                                    (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
                                    (dev_mode_t) 0,
                                    dev_offset,
                                    (io_buf_ptr_t) addr,
                                    size,
                                    (IO_WRITE | IO_CALL),
                                    &bytes_written);
        if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
            dprintf(("%s0x%x, addr=0x%x,"
                     "size=0x%x,offset=0x%x\n",
                     "device_write_request returned ",
                     kr, addr, size, offset));
            BS_STAT(ps->ps_bs,
                    ps->ps_bs->bs_pages_out_fail += atop_32(size));
            /* do the completion notification to free resources */
            device_write_reply(reply_port, kr, 0);
        }
    } else {
        do {
            /*
             * Synchronous write.
             */
            device = dev_port_lookup(ps->ps_device);
            kr = ds_device_write_common(device,
                                        IP_NULL, 0,
                                        (dev_mode_t) 0,
                                        dev_offset,
                                        (io_buf_ptr_t) addr,
                                        size,
                                        (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
                                        &bytes_written);
            if (kr != KERN_SUCCESS) {
                dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
                         "device_write returned ",
                         kr, addr, size, offset));
                BS_STAT(ps->ps_bs,
                        ps->ps_bs->bs_pages_out_fail += atop_32(size));
                return PAGER_ERROR;
            }
            if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
                Panic("fragmented write");
            records_written = (bytes_written >>
                               (vm_page_shift - ps->ps_record_shift));
            dev_offset += records_written;

            if (bytes_written != bytes_to_write) {
                dprintf(("wrote only %d bytes out of %d\n",
                         bytes_written, bytes_to_write));
            }
            bytes_to_write -= bytes_written;
            addr += bytes_written;
        } while (bytes_to_write > 0);
    }
    return PAGER_SUCCESS;
}
#else /* !DEVICE_PAGING */

kern_return_t
ps_read_device(
    __unused paging_segment_t ps,
    __unused dp_offset_t offset,
    __unused vm_offset_t *bufferp,
    __unused unsigned int size,
    __unused unsigned int *residualp,
    __unused int flags)
{
    panic("ps_read_device not supported");
    return KERN_FAILURE;
}

kern_return_t
ps_write_device(
    __unused paging_segment_t ps,
    __unused dp_offset_t offset,
    __unused vm_offset_t addr,
    __unused unsigned int size,
    __unused struct vs_async *vsa)
{
    panic("ps_write_device not supported");
    return KERN_FAILURE;
}

#endif /* DEVICE_PAGING */
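
/*
 * pvs_object_data_provided: bookkeeping hook invoked once a cluster
 * read has handed its pages to the VM.  It updates the global page-in
 * statistics and, when precious pages are in use, releases the
 * backing-store mapping for the range just supplied.
 */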
void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t);    /* forward */

void
pvs_object_data_provided(
    __unused vstruct_t vs,
    __unused upl_t upl,
    __unused upl_offset_t offset,
    upl_size_t size)
{
    DP_DEBUG(DEBUG_VS_INTERNAL,
             ("buffer=0x%x,offset=0x%x,size=0x%x\n",
              upl, offset, size));

    GSTAT(global_stats.gs_pages_in += atop_32(size));

#if USE_PRECIOUS
    ps_clunmap(vs, offset, size);
#endif    /* USE_PRECIOUS */
}

static memory_object_offset_t last_start;
static vm_size_t last_length;
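
/*
 * pvs_cluster_read: satisfy a page-in request from the backing store.
 * The routine first checks whether the faulting page is present at
 * all; if so, it asks the VM how large a speculative cluster to try
 * for, then walks the cluster map looking for the longest physically
 * contiguous run of allocated pages that includes the original
 * offset, and issues at most one I/O for that run.
 */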
kern_return_t
pvs_cluster_read(
    vstruct_t vs,
    dp_offset_t vs_offset,
    dp_size_t cnt,
    void *fault_info)
{
    kern_return_t error = KERN_SUCCESS;
    unsigned int size;
    unsigned int residual;
    unsigned int request_flags;
    int io_flags = 0;
    unsigned int pages_in_cl;
    unsigned int cl_size;
    unsigned int cl_mask;
    unsigned int cl_index;
    unsigned int xfer_size;
    dp_offset_t orig_vs_offset;
    dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
    paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
    struct clmap clmap;
    upl_t upl;
    unsigned int page_list_count;
    memory_object_offset_t cluster_start;
    vm_size_t cluster_length;
    uint32_t io_streaming;

    pages_in_cl = 1 << vs->vs_clshift;
    cl_size = pages_in_cl * vm_page_size;
    cl_mask = cl_size - 1;

#if USE_PRECIOUS
    request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
#else
    request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
#endif
    cl_index = (vs_offset & cl_mask) / vm_page_size;

    if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
        !CLMAP_ISSET(clmap, cl_index)) {
        /*
         * the needed page doesn't exist in the backing store...
         * we don't want to try to do any I/O, just abort the
         * page and let the fault handler provide a zero-fill
         */
        if (cnt == 0) {
            /*
             * The caller was just poking at us to see if
             * the page has been paged out. No need to
             * mess with the page at all.
             * Just let the caller know we don't have that page.
             */
            return KERN_FAILURE;
        }

        page_list_count = 0;

        memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
                                        PAGE_SIZE, PAGE_SIZE,
                                        &upl, NULL, &page_list_count,
                                        request_flags | UPL_SET_INTERNAL);

        if (clmap.cl_error)
            upl_abort(upl, UPL_ABORT_ERROR);
        else
            upl_abort(upl, UPL_ABORT_UNAVAILABLE);
        upl_deallocate(upl);

        return KERN_SUCCESS;
    }

    if (cnt == 0) {
        /*
         * The caller was just poking at us to see if
         * the page has been paged out. No need to
         * mess with the page at all.
         * Just let the caller know we do have that page.
         */
        return KERN_SUCCESS;
    }

    assert(dp_encryption_inited);
    if (dp_encryption) {
        /*
         * ENCRYPTED SWAP:
         * request that the UPL be prepared for
         * decryption.
         */
        request_flags |= UPL_ENCRYPT;
    }
    orig_vs_offset = vs_offset;

    cnt = VM_SUPER_CLUSTER;
    cluster_start = (memory_object_offset_t) vs_offset;
    cluster_length = (vm_size_t) cnt;
    io_streaming = 0;

    /*
     * determine how big a speculative I/O we should try for...
     */
    if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
        assert(vs_offset >= (dp_offset_t) cluster_start &&
               vs_offset < (dp_offset_t) (cluster_start + cluster_length));
        vs_offset = (dp_offset_t) cluster_start;
        cnt = (dp_size_t) cluster_length;
    } else {
        cluster_length = PAGE_SIZE;
        cnt = PAGE_SIZE;
    }

    if (io_streaming)
        io_flags |= UPL_IOSTREAMING;

    last_start = cluster_start;
    last_length = cluster_length;

    /*
     * This loop will be executed multiple times until the entire
     * range has been looked at or we issue an I/O... if the request spans cluster
     * boundaries, the clusters will be checked for logical continuity,
     * if contiguous the I/O request will span multiple clusters...
     * at most only 1 I/O will be issued... it will encompass the original offset
     */
    while (cnt && error == KERN_SUCCESS) {
        int ps_info_valid;
        int seg_index;

        if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
            size = VM_SUPER_CLUSTER;
            size -= vs_offset & cl_mask;
        } else if (cnt > VM_SUPER_CLUSTER)
            size = VM_SUPER_CLUSTER;
        else
            size = cnt;

        cnt -= size;

        ps_info_valid = 0;
        seg_index = 0;

        while (size > 0 && error == KERN_SUCCESS) {
            unsigned int abort_size;
            unsigned int lsize;
            unsigned int failed_size;
            int beg_pseg;
            int beg_indx;
            dp_offset_t cur_offset;

            if ( !ps_info_valid) {
                ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
                psp[seg_index] = CLMAP_PS(clmap);
                ps_info_valid = 1;
            }
            /*
             * skip over unallocated physical segments
             */
            if (ps_offset[seg_index] == (dp_offset_t) -1) {
                abort_size = cl_size - (vs_offset & cl_mask);
                abort_size = MIN(abort_size, size);

                size -= abort_size;
                vs_offset += abort_size;

                seg_index++;
                ps_info_valid = 0;
                continue;
            }
            cl_index = (vs_offset & cl_mask) / vm_page_size;

            for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
                /*
                 * skip over unallocated pages
                 */
                if (CLMAP_ISSET(clmap, cl_index))
                    break;
                abort_size += vm_page_size;
            }
            if (abort_size) {
                size -= abort_size;
                vs_offset += abort_size;

                if (cl_index == pages_in_cl) {
                    /*
                     * if we're at the end of this physical cluster
                     * then bump to the next one and continue looking
                     */
                    seg_index++;
                    ps_info_valid = 0;
                    continue;
                }
                if (size == 0)
                    break;
            }
            /*
             * remember the starting point of the first allocated page
             * for the I/O we're about to issue
             */
            beg_pseg = seg_index;
            beg_indx = cl_index;
            cur_offset = vs_offset;

            /*
             * calculate the size of the I/O that we can do...
             * this may span multiple physical segments if
             * they are contiguous
             */
            for (xfer_size = 0; xfer_size < size; ) {

                while (cl_index < pages_in_cl && xfer_size < size) {
                    /*
                     * accumulate allocated pages within
                     * a physical segment
                     */
                    if (CLMAP_ISSET(clmap, cl_index)) {
                        xfer_size += vm_page_size;
                        cur_offset += vm_page_size;
                        cl_index++;

                        BS_STAT(psp[seg_index]->ps_bs,
                                psp[seg_index]->ps_bs->bs_pages_in++);
                    } else
                        break;
                }
                if (cl_index < pages_in_cl || xfer_size >= size) {
                    /*
                     * we've hit an unallocated page or
                     * the end of this request... see if
                     * it's time to fire the I/O
                     */
                    break;
                }
                /*
                 * we've hit the end of the current physical
                 * segment and there's more to do, so try
                 * moving to the next one
                 */
                seg_index++;

                ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
                psp[seg_index] = CLMAP_PS(clmap);
                ps_info_valid = 1;

                if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
                    /*
                     * if the physical segment we're about
                     * to step into is not contiguous to
                     * the one we're currently in, or it's
                     * in a different paging file, or
                     * it hasn't been allocated....
                     * we stop this run and go check
                     * to see if it's time to fire the I/O
                     */
                    break;
                }
                /*
                 * start with first page of the next physical
                 * segment
                 */
                cl_index = 0;
            }
            if (xfer_size == 0) {
                /*
                 * no I/O to generate for this segment
                 */
                break;
            }
            if (cur_offset <= orig_vs_offset) {
                /*
                 * we've hit a hole in our speculative cluster
                 * before the offset that we're really after...
                 * don't issue the I/O since it doesn't encompass
                 * the original offset and we're looking to only
                 * pull in the speculative pages if they can be
                 * made part of a single I/O
                 */
                size -= xfer_size;
                vs_offset += xfer_size;

                continue;
            }
            /*
             * we have a contiguous range of allocated pages
             * to read from that encompasses the original offset
             */
            page_list_count = 0;
            memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
                                            xfer_size, xfer_size,
                                            &upl, NULL, &page_list_count,
                                            request_flags | UPL_SET_INTERNAL | UPL_NOBLOCK);

            error = ps_read_file(psp[beg_pseg],
                                 upl, (upl_offset_t) 0,
                                 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
                                 xfer_size, &residual, io_flags);

            failed_size = 0;

            /*
             * Adjust counts and send response to VM. Optimize
             * for the common case, i.e. no error and/or partial
             * data. If there was an error, then we need to error
             * the entire range, even if some data was successfully
             * read. If there was a partial read we may supply some
             * data and may error some as well. In all cases the
             * VM must receive some notification for every page
             * in the range.
             */
            if ((error == KERN_SUCCESS) && (residual == 0)) {
                /*
                 * Got everything we asked for, supply the data
                 * to the VM. Note that as a side effect of
                 * supplying the data, the buffer holding the
                 * supplied data is deallocated from the pager's
                 * address space.
                 */
                pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
            } else {
                failed_size = xfer_size;

                if (error == KERN_SUCCESS) {
                    if (residual == xfer_size) {
                        /*
                         * If a read operation returns no error
                         * and no data moved, we turn it into
                         * an error, assuming we're reading at
                         * or beyond EOF.
                         * Fall through and error the entire range.
                         */
                        error = KERN_FAILURE;
                    } else {
                        /*
                         * Otherwise, we have partial read. If
                         * the part read is a integral number
                         * of pages supply it. Otherwise round
                         * it up to a page boundary, zero fill
                         * the unread part, and supply it.
                         * Fall through and error the remainder
                         * of the range, if any.
                         */
                        unsigned int fill;

                        fill = residual & ~vm_page_size;
                        lsize = (xfer_size - residual) + fill;

                        pvs_object_data_provided(vs, upl, vs_offset, lsize);

                        if (lsize < xfer_size) {
                            failed_size = xfer_size - lsize;
                            error = KERN_FAILURE;
                        }
                    }
                }
                if (error != KERN_SUCCESS) {
                    /*
                     * There was an error in some part of the range, tell
                     * the VM. Note that error is explicitly checked again
                     * since it can be modified above.
                     */
                    BS_STAT(psp[beg_pseg]->ps_bs,
                            psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
                }
            }
            /*
             * we've issued a single I/O that encompassed the original offset
             * at this point we either met our speculative request length or
             * we ran into a 'hole' (i.e. page not present in the cluster, cluster
             * not present or not physically contiguous to the previous one), so
             * we're done issuing I/O at this point
             */
            return error;
        }
    }
    return error;
}
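
/*
 * vs_cluster_write: push a set of dirty pages out to the backing store.
 * In the common (not dp_internal) case the routine builds a UPL over
 * the candidate range, walks it looking for runs of contiguous dirty
 * or precious pages that stay within physically contiguous paging
 * segment space, and writes each run with ps_write_file(); pages that
 * turn out to be clean are committed back untouched.
 */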
int vs_do_async_write = 1;

kern_return_t
vs_cluster_write(
    vstruct_t vs,
    upl_t internal_upl,
    upl_offset_t offset,
    upl_size_t cnt,
    boolean_t dp_internal,
    int flags)
{
    upl_size_t transfer_size;
    int error = 0;
    struct clmap clmap;

    dp_offset_t actual_offset;    /* Offset within paging segment */
    paging_segment_t ps;
    dp_offset_t mobj_base_addr;
    dp_offset_t mobj_target_addr;

    upl_t upl;
    upl_page_info_t *pl;
    int page_index;
    unsigned int page_max_index;
    upl_size_t list_size;
    int pages_in_cl;
    unsigned int cl_size;
    int base_index;
    unsigned int seg_size;
    unsigned int upl_offset_in_object;
    boolean_t minimal_clustering = FALSE;
    boolean_t found_dirty;

    pages_in_cl = 1 << vs->vs_clshift;
    cl_size = pages_in_cl * vm_page_size;

#if CONFIG_FREEZE
    minimal_clustering = TRUE;
#else
    if (dp_isssd == TRUE)
        minimal_clustering = TRUE;
#endif
    if (!dp_internal) {
        unsigned int page_list_count;
        int request_flags;
        unsigned int super_size;
        int first_dirty;
        int num_dirty;
        int num_of_pages;
        int seg_index;
        upl_offset_t upl_offset;
        upl_offset_t upl_offset_aligned;
        dp_offset_t seg_offset;
        dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
        paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];

        if (bs_low)
            super_size = cl_size;
        else
            super_size = VM_SUPER_CLUSTER;

        request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
            UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
            UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;

        if (!dp_encryption_inited) {
            /*
             * ENCRYPTED SWAP:
             * Once we've started using swap, we
             * can't change our mind on whether
             * it needs to be encrypted or not.
             */
            dp_encryption_inited = TRUE;
        }
        if (dp_encryption) {
            /*
             * ENCRYPTED SWAP:
             * request that the UPL be prepared for
             * encryption.
             */
            request_flags |= UPL_ENCRYPT;
            flags |= UPL_PAGING_ENCRYPTED;
        }

        page_list_count = 0;
        memory_object_super_upl_request(vs->vs_control,
                (memory_object_offset_t)offset,
                cnt, super_size,
                &upl, NULL, &page_list_count,
                request_flags | UPL_FOR_PAGEOUT);

        /*
         * The default pager does not handle objects larger than
         * 4GB, so it does not deal with offset that don't fit in
         * 32-bit. Cast down upl->offset now and make sure we
         * did not lose any valuable bits.
         */
        upl_offset_in_object = (unsigned int) upl->offset;
        assert(upl->offset == upl_offset_in_object);

        pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

        seg_size = cl_size - (upl_offset_in_object % cl_size);
        upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
        page_index = 0;
        page_max_index = upl->size / PAGE_SIZE;
        found_dirty = TRUE;

        for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
            unsigned int seg_pgcnt;

            seg_pgcnt = seg_size / PAGE_SIZE;

            if (minimal_clustering == TRUE) {
                unsigned int non_dirty;

                non_dirty = 0;
                found_dirty = FALSE;

                for (; non_dirty < seg_pgcnt; non_dirty++) {
                    if ((page_index + non_dirty) >= page_max_index)
                        break;

                    if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
                        UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
                        found_dirty = TRUE;
                        break;
                    }
                }
            }
            if (found_dirty == TRUE) {
                ps_offset[seg_index] =
                    ps_clmap(vs,
                             upl_offset_aligned,
                             &clmap, CL_ALLOC,
                             cl_size, 0);

                if (ps_offset[seg_index] == (dp_offset_t) -1) {
                    upl_abort(upl, 0);
                    upl_deallocate(upl);

                    return KERN_FAILURE;
                }
                psp[seg_index] = CLMAP_PS(clmap);
            }
            if (transfer_size > seg_size) {
                page_index += seg_pgcnt;
                transfer_size -= seg_size;
                upl_offset_aligned += cl_size;
                seg_size = cl_size;
                seg_index++;
            } else
                transfer_size = 0;
        }
        /*
         * Ignore any non-present pages at the end of the
         * UPL.
         */
        for (page_index = upl->size / vm_page_size; page_index > 0;)
            if (UPL_PAGE_PRESENT(pl, --page_index))
                break;
        num_of_pages = page_index + 1;

        base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;

        for (page_index = 0; page_index < num_of_pages; ) {
            /*
             * skip over non-dirty pages
             */
            for ( ; page_index < num_of_pages; page_index++) {
                if (UPL_DIRTY_PAGE(pl, page_index)
                    || UPL_PRECIOUS_PAGE(pl, page_index))
                    /*
                     * this is a page we need to write
                     * go see if we can buddy it up with
                     * others that are contiguous to it
                     */
                    break;
                /*
                 * if the page is not-dirty, but present we
                 * need to commit it... This is an unusual
                 * case since we only asked for dirty pages
                 */
                if (UPL_PAGE_PRESENT(pl, page_index)) {
                    boolean_t empty = FALSE;
                    upl_commit_range(upl,
                                     page_index * vm_page_size,
                                     vm_page_size,
                                     UPL_COMMIT_NOTIFY_EMPTY,
                                     pl,
                                     page_list_count,
                                     &empty);
                    if (empty) {
                        assert(page_index ==
                               num_of_pages - 1);
                        upl_deallocate(upl);
                    }
                }
            }
            if (page_index == num_of_pages)
                /*
                 * no more pages to look at, we're out of here
                 */
                break;

            /*
             * gather up contiguous dirty pages... we have at
             * least 1 * otherwise we would have bailed above
             * make sure that each physical segment that we step
             * into is contiguous to the one we're currently in
             * if it's not, we have to stop and write what we have
             */
            for (first_dirty = page_index;
                 page_index < num_of_pages; ) {
                if ( !UPL_DIRTY_PAGE(pl, page_index)
                     && !UPL_PRECIOUS_PAGE(pl, page_index))
                    break;
                page_index++;
                /*
                 * if we just looked at the last page in the UPL
                 * we don't need to check for physical segment
                 * continuity
                 */
                if (page_index < num_of_pages) {
                    int cur_seg;
                    int nxt_seg;

                    cur_seg = (base_index + (page_index - 1))/pages_in_cl;
                    nxt_seg = (base_index + page_index)/pages_in_cl;

                    if (cur_seg != nxt_seg) {
                        if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
                            /*
                             * if the segment we're about
                             * to step into is not
                             * contiguous to the one we're
                             * currently in, or it's in a
                             * different paging file....
                             * we stop here and generate
                             * the I/O
                             */
                            break;
                    }
                }
            }
            num_dirty = page_index - first_dirty;

            if (num_dirty) {
                upl_offset = first_dirty * vm_page_size;
                transfer_size = num_dirty * vm_page_size;

                while (transfer_size) {

                    if ((seg_size = cl_size -
                            ((upl_offset_in_object +
                              upl_offset) % cl_size))
                            > transfer_size)
                        seg_size = transfer_size;

                    ps_vs_write_complete(
                        vs,
                        (upl_offset_in_object +
                         upl_offset),
                        seg_size, error);

                    transfer_size -= seg_size;
                    upl_offset += seg_size;
                }
                upl_offset = first_dirty * vm_page_size;
                transfer_size = num_dirty * vm_page_size;

                seg_index = (base_index + first_dirty) / pages_in_cl;
                seg_offset = (upl_offset_in_object + upl_offset) % cl_size;

                error = ps_write_file(psp[seg_index],
                                      upl, upl_offset,
                                      ps_offset[seg_index]
                                          + seg_offset,
                                      transfer_size, flags);
            } else {
                boolean_t empty = FALSE;
                upl_abort_range(upl,
                                first_dirty * vm_page_size,
                                num_dirty * vm_page_size,
                                UPL_ABORT_NOTIFY_EMPTY,
                                &empty);
                if (empty) {
                    assert(page_index == num_of_pages);
                    upl_deallocate(upl);
                }
            }
        }
    } else {
        assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
        list_size = cnt;

        /* The caller provides a mapped_data which is derived  */
        /* from a temporary object. The targeted pages are     */
        /* guaranteed to be set at offset 0 in the mapped_data */
        /* The actual offset however must still be derived     */
        /* from the offset in the vs in question               */
        mobj_base_addr = offset;
        mobj_target_addr = mobj_base_addr;

        for (transfer_size = list_size; transfer_size != 0;) {
            actual_offset = ps_clmap(vs, mobj_target_addr,
                                     &clmap, CL_ALLOC,
                                     transfer_size < cl_size ?
                                     transfer_size : cl_size, 0);
            if (actual_offset == (dp_offset_t) -1) {
                error = 1;
                break;
            }
            cnt = MIN(transfer_size,
                      (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
            ps = CLMAP_PS(clmap);
            /* Assume that the caller has given us contiguous */
            /* pages */
            if (cnt) {
                ps_vs_write_complete(vs, mobj_target_addr,
                                     cnt, error);
                error = ps_write_file(ps, internal_upl,
                                      0, actual_offset,
                                      cnt, flags);
                if (error)
                    break;
            }
            actual_offset += cnt;
            mobj_target_addr += cnt;
            transfer_size -= cnt;
        }
    }
    if (error)
        return KERN_FAILURE;
    else
        return KERN_SUCCESS;
}
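
/*
 * ps_vstruct_allocated_size: return, in bytes, how much backing store
 * is currently allocated to a vstruct by counting the bits set in its
 * direct or indirect cluster maps.
 */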
vm_size_t
ps_vstruct_allocated_size(
    vstruct_t vs)
{
    unsigned int num_pages;
    struct vs_map *vsmap;
    unsigned int i, j, k;

    num_pages = 0;
    if (vs->vs_indirect) {
        /* loop on indirect maps */
        for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
            vsmap = vs->vs_imap[i];
            if (vsmap == NULL)
                continue;
            /* loop on clusters in this indirect map */
            for (j = 0; j < CLMAP_ENTRIES; j++) {
                if (VSM_ISCLR(vsmap[j]) ||
                    VSM_ISERR(vsmap[j]))
                    continue;
                /* loop on pages in this cluster */
                for (k = 0; k < VSCLSIZE(vs); k++) {
                    if ((VSM_BMAP(vsmap[j])) & (1 << k))
                        num_pages++;
                }
            }
        }
    } else {
        vsmap = vs->vs_dmap;
        if (vsmap == NULL)
            return 0;
        /* loop on clusters in the direct map */
        for (j = 0; j < CLMAP_ENTRIES; j++) {
            if (VSM_ISCLR(vsmap[j]) ||
                VSM_ISERR(vsmap[j]))
                continue;
            /* loop on pages in this cluster */
            for (k = 0; k < VSCLSIZE(vs); k++) {
                if ((VSM_BMAP(vsmap[j])) & (1 << k))
                    num_pages++;
            }
        }
    }
    return ptoa_32(num_pages);
}
unsigned int
ps_vstruct_allocated_pages(
    vstruct_t vs,
    default_pager_page_t *pages,
    unsigned int pages_size)
{
    unsigned int num_pages;
    struct vs_map *vsmap;
    dp_offset_t offset;
    unsigned int i, j, k;

    num_pages = 0;
    offset = 0;

    if (vs->vs_indirect) {
        /* loop on indirect maps */
        for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
            vsmap = vs->vs_imap[i];
            if (vsmap == NULL) {
                offset += (vm_page_size * CLMAP_ENTRIES *
                           VSCLSIZE(vs));
                continue;
            }
            /* loop on clusters in this indirect map */
            for (j = 0; j < CLMAP_ENTRIES; j++) {
                if (VSM_ISCLR(vsmap[j]) ||
                    VSM_ISERR(vsmap[j])) {
                    offset += vm_page_size * VSCLSIZE(vs);
                    continue;
                }
                /* loop on pages in this cluster */
                for (k = 0; k < VSCLSIZE(vs); k++) {
                    if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
                        num_pages++;
                        if (num_pages < pages_size)
                            pages++->dpp_offset =
                                offset;
                    }
                    offset += vm_page_size;
                }
            }
        }
    } else {
        vsmap = vs->vs_dmap;
        if (vsmap == NULL)
            return 0;
        /* loop on clusters in the direct map */
        for (j = 0; j < CLMAP_ENTRIES; j++) {
            if (VSM_ISCLR(vsmap[j]) ||
                VSM_ISERR(vsmap[j])) {
                offset += vm_page_size * VSCLSIZE(vs);
                continue;
            }
            /* loop on pages in this cluster */
            for (k = 0; k < VSCLSIZE(vs); k++) {
                if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
                    num_pages++;
                    if (num_pages < pages_size)
                        pages++->dpp_offset = offset;
                }
                offset += vm_page_size;
            }
        }
    }
    return num_pages;
}
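
/*
 * ps_vstruct_transfer_from_segment: migrate every cluster of a vstruct
 * that lives on the given paging segment to other backing store, via
 * vs_cluster_transfer().  The vstruct is temporarily marked
 * vs_xfer_pending so that conflicting writers are held off while each
 * cluster is moved.
 */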
kern_return_t
ps_vstruct_transfer_from_segment(
    vstruct_t vs,
    paging_segment_t segment,
    upl_t upl)
{
    struct vs_map *vsmap;
//    struct vs_map old_vsmap;
//    struct vs_map new_vsmap;
    unsigned int i, j;

    VS_LOCK(vs);    /* block all work on this vstruct */
                    /* can't allow the normal multiple write */
                    /* semantic because writes may conflict */
    vs->vs_xfer_pending = TRUE;
    vs_wait_for_sync_writers(vs);
    vs_start_write(vs);
    vs_wait_for_readers(vs);
    /* we will unlock the vs to allow other writes while transferring */
    /* and will be guaranteed of the persistance of the vs struct */
    /* because the caller of ps_vstruct_transfer_from_segment bumped */
    /* vs_async_pending */
    /* OK we now have guaranteed no other parties are accessing this */
    /* vs. Now that we are also supporting simple lock versions of */
    /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
    /* our purpose in holding it before was the multiple write case */
    /* we now use the boolean xfer_pending to do that. We can use */
    /* a boolean instead of a count because we have guaranteed single */
    /* file access to this code in its caller */
    VS_UNLOCK(vs);
vs_changed:
    if (vs->vs_indirect) {
        unsigned int vsmap_size;
        int clmap_off;

        /* loop on indirect maps */
        for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
            vsmap = vs->vs_imap[i];
            if (vsmap == NULL)
                continue;
            /* loop on clusters in this indirect map */
            clmap_off = (vm_page_size * CLMAP_ENTRIES *
                         VSCLSIZE(vs) * i);
            if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
                vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
            else
                vsmap_size = CLMAP_ENTRIES;
            for (j = 0; j < vsmap_size; j++) {
                if (VSM_ISCLR(vsmap[j]) ||
                    VSM_ISERR(vsmap[j]) ||
                    (VSM_PS(vsmap[j]) != segment))
                    continue;
                if (vs_cluster_transfer(vs,
                        (vm_page_size * (j << vs->vs_clshift))
                        + clmap_off,
                        vm_page_size << vs->vs_clshift,
                        upl) != KERN_SUCCESS) {
                    VS_LOCK(vs);
                    vs->vs_xfer_pending = FALSE;
                    VS_UNLOCK(vs);
                    vs_finish_write(vs);
                    return KERN_FAILURE;
                }
                /* allow other readers/writers during transfer*/
                VS_LOCK(vs);
                vs->vs_xfer_pending = FALSE;
                VS_UNLOCK(vs);
                vs_finish_write(vs);

                VS_LOCK(vs);
                vs->vs_xfer_pending = TRUE;
                vs_wait_for_sync_writers(vs);
                vs_start_write(vs);
                vs_wait_for_readers(vs);
                VS_UNLOCK(vs);
                if (!(vs->vs_indirect)) {
                    goto vs_changed;
                }
            }
        }
    } else {
        vsmap = vs->vs_dmap;
        if (vsmap == NULL) {
            VS_LOCK(vs);
            vs->vs_xfer_pending = FALSE;
            VS_UNLOCK(vs);
            vs_finish_write(vs);
            return KERN_SUCCESS;
        }
        /* loop on clusters in the direct map */
        for (j = 0; j < vs->vs_size; j++) {
            if (VSM_ISCLR(vsmap[j]) ||
                VSM_ISERR(vsmap[j]) ||
                (VSM_PS(vsmap[j]) != segment))
                continue;
            if (vs_cluster_transfer(vs,
                    vm_page_size * (j << vs->vs_clshift),
                    vm_page_size << vs->vs_clshift,
                    upl) != KERN_SUCCESS) {
                VS_LOCK(vs);
                vs->vs_xfer_pending = FALSE;
                VS_UNLOCK(vs);
                vs_finish_write(vs);
                return KERN_FAILURE;
            }
            /* allow other readers/writers during transfer*/
            VS_LOCK(vs);
            vs->vs_xfer_pending = FALSE;
            VS_UNLOCK(vs);
            vs_finish_write(vs);

            VS_LOCK(vs);
            vs->vs_xfer_pending = TRUE;
            vs_wait_for_sync_writers(vs);
            vs_start_write(vs);
            vs_wait_for_readers(vs);
            VS_UNLOCK(vs);
            if (vs->vs_indirect) {
                goto vs_changed;
            }
        }
    }

    VS_LOCK(vs);
    vs->vs_xfer_pending = FALSE;
    VS_UNLOCK(vs);
    vs_finish_write(vs);
    return KERN_SUCCESS;
}
vs_map_t
vs_get_map_entry(
    vstruct_t vs,
    dp_offset_t offset)
{
    struct vs_map *vsmap;
    dp_offset_t cluster;

    cluster = atop_32(offset) >> vs->vs_clshift;
    if (vs->vs_indirect) {
        long ind_block = cluster/CLMAP_ENTRIES;

        /* Is the indirect block allocated? */
        vsmap = vs->vs_imap[ind_block];
        if (vsmap == (vs_map_t) NULL)
            return vsmap;
    } else
        vsmap = vs->vs_dmap;
    vsmap += cluster%CLMAP_ENTRIES;
    return vsmap;
}
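
/*
 * vs_cluster_transfer: move one cluster's worth of data from its
 * current (off-line) paging segment to freshly allocated backing
 * store.  Each run of valid pages is read with ps_read_file() and
 * immediately rewritten with vs_cluster_write(); on any error the
 * original cluster map entry is restored so the old backing store
 * remains valid.
 */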
kern_return_t
vs_cluster_transfer(
    vstruct_t vs,
    dp_offset_t offset,
    dp_size_t cnt,
    upl_t upl)
{
    dp_offset_t actual_offset;
    paging_segment_t ps;
    struct clmap clmap;
    kern_return_t error = KERN_SUCCESS;
    unsigned int size, size_wanted;
    int i;
    unsigned int residual = 0;
    unsigned int unavail_size;
//    default_pager_thread_t *dpt;
//    boolean_t dealloc;
    struct vs_map *vsmap_ptr = NULL;
    struct vs_map read_vsmap;
    struct vs_map original_read_vsmap;
    struct vs_map write_vsmap;
//    vm_offset_t ioaddr;

    /* vs_cluster_transfer reads in the pages of a cluster and
     * then writes these pages back to new backing store. The
     * segment the pages are being read from is assumed to have
     * been taken off-line and is no longer considered for new
     * space requests.
     */

    /*
     * This loop will be executed once per cluster referenced.
     * Typically this means once, since it's unlikely that the
     * VM system will ask for anything spanning cluster boundaries.
     *
     * If there are holes in a cluster (in a paging segment), we stop
     * reading at the hole, then loop again, hoping to
     * find valid pages later in the cluster. This continues until
     * the entire range has been examined, and read, if present. The
     * pages are written as they are read. If a failure occurs after
     * some pages are written the unmap call at the bottom of the loop
     * recovers the backing store and the old backing store remains
     * intact.
     */

    VSM_CLR(write_vsmap);
    VSM_CLR(original_read_vsmap);
    /* grab the actual object's pages to sync with I/O */
    while (cnt && (error == KERN_SUCCESS)) {
        vsmap_ptr = vs_get_map_entry(vs, offset);
        actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);

        if (actual_offset == (dp_offset_t) -1) {
            /*
             * Nothing left to write in this cluster at least
             * set write cluster information for any previous
             * write, clear for next cluster, if there is one
             */
            unsigned int local_size, clmask, clsize;

            clsize = vm_page_size << vs->vs_clshift;
            clmask = clsize - 1;
            local_size = clsize - (offset & clmask);
            ASSERT(local_size);
            local_size = MIN(local_size, cnt);

            /* This cluster has no data in it beyond what may */
            /* have been found on a previous iteration through */
            /* the loop "write_vsmap" */
            *vsmap_ptr = write_vsmap;
            VSM_CLR(write_vsmap);
            VSM_CLR(original_read_vsmap);

            cnt -= local_size;
            offset += local_size;
            continue;
        }

        /*
         * Count up contiguous available or unavailable
         * pages.
         */
        ps = CLMAP_PS(clmap);
        ASSERT(ps);
        size = 0;
        unavail_size = 0;
        for (i = 0;
             (size < cnt) && (unavail_size < cnt) &&
             (i < CLMAP_NPGS(clmap)); i++) {
            if (CLMAP_ISSET(clmap, i)) {
                if (unavail_size != 0)
                    break;
                size += vm_page_size;
                BS_STAT(ps->ps_bs,
                        ps->ps_bs->bs_pages_in++);
            } else {
                if (size != 0)
                    break;
                unavail_size += vm_page_size;
            }
        }

        if (size == 0) {
            ASSERT(unavail_size);
            ps_clunmap(vs, offset, unavail_size);
            cnt -= unavail_size;
            offset += unavail_size;
            if ((offset & ((vm_page_size << vs->vs_clshift) - 1))
                == 0) {
                /* There is no more to transfer in this
                   cluster
                */
                *vsmap_ptr = write_vsmap;
                VSM_CLR(write_vsmap);
                VSM_CLR(original_read_vsmap);
            }
            continue;
        }

        if (VSM_ISCLR(original_read_vsmap))
            original_read_vsmap = *vsmap_ptr;

        if (ps->ps_segtype == PS_PARTITION) {
            panic("swap partition not supported\n");
            /*NOTREACHED*/
            error = KERN_FAILURE;
/*
            NEED TO ISSUE WITH SYNC & NO COMMIT
            error = ps_read_device(ps, actual_offset, &buffer,
                                   size, &residual, flags);
*/
        } else {
            /* NEED TO ISSUE WITH SYNC & NO COMMIT */
            error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
                                 size, &residual,
                                 (UPL_IOSYNC | UPL_NOCOMMIT));
        }

        read_vsmap = *vsmap_ptr;

        /*
         * Adjust counts and put data in new BS. Optimize for the
         * common case, i.e. no error and/or partial data.
         * If there was an error, then we need to error the entire
         * range, even if some data was successfully read.
         */
        if ((error == KERN_SUCCESS) && (residual == 0)) {
            /*
             * Got everything we asked for, supply the data to
             * the new BS. Note that as a side effect of supplying
             * the data, the buffer holding the supplied data is
             * deallocated from the pager's address space unless
             * the write is unsuccessful.
             */

            /* note buffer will be cleaned up in all cases by */
            /* internal_cluster_write or if an error on write */
            /* the vm_map_copy_page_discard call */
            *vsmap_ptr = write_vsmap;

            if (vs_cluster_write(vs, upl, offset,
                    size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT) != KERN_SUCCESS) {
                error = KERN_FAILURE;
                if (!(VSM_ISCLR(*vsmap_ptr))) {
                    /* unmap the new backing store object */
                    ps_clunmap(vs, offset, size);
                }
                /* original vsmap */
                *vsmap_ptr = original_read_vsmap;
                VSM_CLR(write_vsmap);
            } else {
                if ((offset + size) &
                    ((vm_page_size << vs->vs_clshift)
                     - 1)) {
                    /* There is more to transfer in this
                       cluster
                    */
                    write_vsmap = *vsmap_ptr;
                    *vsmap_ptr = read_vsmap;
                    ps_clunmap(vs, offset, size);
                } else {
                    /* discard the old backing object */
                    write_vsmap = *vsmap_ptr;
                    *vsmap_ptr = read_vsmap;
                    ps_clunmap(vs, offset, size);
                    *vsmap_ptr = write_vsmap;
                    VSM_CLR(write_vsmap);
                    VSM_CLR(original_read_vsmap);
                }
            }
        } else {
            if (error == KERN_SUCCESS) {
                if (residual == size) {
                    /*
                     * If a read operation returns no error
                     * and no data moved, we turn it into
                     * an error, assuming we're reading at
                     * or beyond EOF.
                     * Fall through and error the entire
                     * range.
                     */
                    error = KERN_FAILURE;
                    *vsmap_ptr = write_vsmap;
                    if (!(VSM_ISCLR(*vsmap_ptr))) {
                        /* unmap the new backing store object */
                        ps_clunmap(vs, offset, size);
                    }
                    *vsmap_ptr = original_read_vsmap;
                    VSM_CLR(write_vsmap);
                } else {
                    /*
                     * Otherwise, we have partial read.
                     * This is also considered an error
                     * for the purposes of cluster transfer
                     */
                    error = KERN_FAILURE;
                    *vsmap_ptr = write_vsmap;
                    if (!(VSM_ISCLR(*vsmap_ptr))) {
                        /* unmap the new backing store object */
                        ps_clunmap(vs, offset, size);
                    }
                    *vsmap_ptr = original_read_vsmap;
                    VSM_CLR(write_vsmap);
                }
            }
        }
        cnt -= size;
        offset += size;

    } /* END while (cnt && (error == 0)) */
    if (!VSM_ISCLR(write_vsmap))
        *vsmap_ptr = write_vsmap;

    return error;
}
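
/*
 * default_pager_add_file: attach a swap file (vnode) to a backing
 * store object as a new paging segment.  The segment's bookkeeping
 * (record geometry, cluster bitmap, free counts) is initialized here,
 * and the first segment created is reserved as the emergency segment.
 */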
kern_return_t
default_pager_add_file(
    MACH_PORT_FACE backing_store,
    vnode_ptr_t vp,
    int record_size,
    vm_size_t size)
{
    backing_store_t bs;
    paging_segment_t ps;
    int i;
    unsigned int j;
    int error;

    if ((bs = backing_store_lookup(backing_store))
        == BACKING_STORE_NULL)
        return KERN_INVALID_ARGUMENT;

    for (i = 0; i <= paging_segment_max; i++) {
        ps = paging_segments[i];
        if (ps == PAGING_SEGMENT_NULL)
            continue;
        if (ps->ps_segtype != PS_FILE)
            continue;

        /*
         * Check for overlap on same device.
         */
        if (ps->ps_vnode == (struct vnode *)vp) {
            return KERN_INVALID_ARGUMENT;
        }
    }

    /*
     * Set up the paging segment
     */
    ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
    if (ps == PAGING_SEGMENT_NULL) {
        return KERN_RESOURCE_SHORTAGE;
    }

    ps->ps_segtype = PS_FILE;
    ps->ps_vnode = (struct vnode *)vp;
    ps->ps_record_shift = local_log2(vm_page_size / record_size);
    assert((dp_size_t) size == size);
    ps->ps_recnum = (dp_size_t) size;
    ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;

    ps->ps_pgcount = ps->ps_pgnum;
    ps->ps_clshift = local_log2(bs->bs_clsize);
    ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
    ps->ps_special_clusters = 0;

    ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
    if (!ps->ps_bmap) {
        kfree(ps, sizeof *ps);
        return KERN_RESOURCE_SHORTAGE;
    }
    for (j = 0; j < ps->ps_ncls; j++) {
        clrbit(ps->ps_bmap, j);
    }

    if (paging_segment_count == 0) {
        ps->ps_state = PS_EMERGENCY_SEGMENT;
        if (use_emergency_swap_file_first) {
            ps->ps_state |= PS_CAN_USE;
        }
        emergency_segment_backing_store = backing_store;
    } else {
        ps->ps_state = PS_CAN_USE;
    }

    if ((error = ps_enter(ps)) != 0) {
        kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
        kfree(ps, sizeof *ps);
        return KERN_RESOURCE_SHORTAGE;
    }

    bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
    bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;

    if (IS_PS_OK_TO_USE(ps)) {
        dp_pages_free += ps->ps_pgcount;
    } else {
        dp_pages_reserve += ps->ps_pgcount;
    }

    bs_more_space(ps->ps_clcount);

    /*
     * If the paging segment being activated is not the emergency
     * segment and we notice that the emergency segment is being
     * used then we help recover it. If all goes well, the
     * emergency segment will be back to its original state of
     * online but not activated (till it's needed the next time).
     */
    ps = paging_segments[EMERGENCY_PSEG_INDEX];
    if (IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
        if (default_pager_backing_store_delete(emergency_segment_backing_store)) {
            dprintf(("Failed to recover emergency paging segment\n"));
        } else {
            dprintf(("Recovered emergency paging segment\n"));
        }
    }

    DP_DEBUG(DEBUG_BS_INTERNAL,
             ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
              device, offset, (dp_size_t) size, record_size,
              ps->ps_record_shift, ps->ps_pgnum));

    return KERN_SUCCESS;
}
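
/*
 * ps_read_file / ps_write_file: move a UPL's worth of data between a
 * file-backed paging segment and the VM system by way of
 * vnode_pagein() and vnode_pageout().
 */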
kern_return_t
ps_read_file(
    paging_segment_t ps,
    upl_t upl,
    upl_offset_t upl_offset,
    dp_offset_t offset,
    upl_size_t size,
    unsigned int *residualp,
    int flags)
{
    vm_object_offset_t f_offset;
    int error = 0;
    int result;

    assert(dp_encryption_inited);

    clustered_reads[atop_32(size)]++;

    f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

    /*
     * for transfer case we need to pass uploffset and flags
     */
    assert((upl_size_t) size == size);
    error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);

    /* The vnode_pagein semantic is somewhat at odds with the existing */
    /* device_read semantic. Partial reads are not experienced at this */
    /* level. It is up to the bit map code and cluster read code to */
    /* check that requested data locations are actually backed, and the */
    /* pagein code to either read all of the requested data or return an */
    /* error. */

    if (error)
        result = KERN_FAILURE;
    else {
        *residualp = 0;
        result = KERN_SUCCESS;
    }
    return result;
}
kern_return_t
ps_write_file(
    paging_segment_t ps,
    upl_t upl,
    upl_offset_t upl_offset,
    dp_offset_t offset,
    unsigned int size,
    int flags)
{
    vm_object_offset_t f_offset;
    kern_return_t result;

    assert(dp_encryption_inited);

    clustered_writes[atop_32(size)]++;
    f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

    if (flags & UPL_PAGING_ENCRYPTED) {
        /*
         * ENCRYPTED SWAP:
         * encrypt all the pages that we're going
         * to pageout.
         */
        upl_encrypt(upl, upl_offset, size);
    }
    assert((upl_size_t) size == size);
    if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
        result = KERN_FAILURE;
    else
        result = KERN_SUCCESS;

    return result;
}
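
/*
 * default_pager_triggers: control interface used to register
 * notification ports and adjust pager behaviour: turning swap
 * encryption on or off, setting the high and low water mark alert
 * ports, selecting the emergency swap file policy, and reporting swap
 * file creation errors.
 */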
kern_return_t
default_pager_triggers( __unused MACH_PORT_FACE default_pager,
    int hi_wat,
    int lo_wat,
    int flags,
    MACH_PORT_FACE trigger_port)
{
    MACH_PORT_FACE release;
    kern_return_t kr;
    clock_sec_t now;
    clock_nsec_t nanoseconds_dummy;
    static clock_sec_t error_notify = 0;

    if (flags == SWAP_ENCRYPT_ON) {
        /* ENCRYPTED SWAP: turn encryption on */
        release = trigger_port;
        if (!dp_encryption_inited) {
            dp_encryption_inited = TRUE;
            dp_encryption = TRUE;
            kr = KERN_SUCCESS;
        } else {
            kr = KERN_FAILURE;
        }
    } else if (flags == SWAP_ENCRYPT_OFF) {
        /* ENCRYPTED SWAP: turn encryption off */
        release = trigger_port;
        if (!dp_encryption_inited) {
            dp_encryption_inited = TRUE;
            dp_encryption = FALSE;
            kr = KERN_SUCCESS;
        } else {
            kr = KERN_FAILURE;
        }
    } else if (flags == HI_WAT_ALERT) {
        release = min_pages_trigger_port;
        min_pages_trigger_port = trigger_port;
        minimum_pages_remaining = hi_wat/vm_page_size;
        kr = KERN_SUCCESS;
    } else if (flags == LO_WAT_ALERT) {
        release = max_pages_trigger_port;
        max_pages_trigger_port = trigger_port;
        maximum_pages_free = lo_wat/vm_page_size;
        kr = KERN_SUCCESS;
    } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
        use_emergency_swap_file_first = TRUE;
        release = trigger_port;
        kr = KERN_SUCCESS;
    } else if (flags == SWAP_FILE_CREATION_ERROR) {
        release = trigger_port;
        kr = KERN_SUCCESS;

        if (paging_segment_count == 1) {
            use_emergency_swap_file_first = TRUE;
        }
        no_paging_space_action();
        clock_get_system_nanotime(&now, &nanoseconds_dummy);
        if (now > error_notify + 5) {
            dprintf(("Swap File Error.\n"));
            error_notify = now;
        }
    } else {
        release = trigger_port;
        kr = KERN_INVALID_ARGUMENT;
    }

    if (IP_VALID(release))
        ipc_port_release_send(release);

    return kr;
}
/*
 * Monitor the amount of available backing store vs. the amount of
 * required backing store, notify a listener (if present) when
 * backing store may safely be removed.
 *
 * We attempt to avoid the situation where backing store is
 * discarded en masse, as this can lead to thrashing as the
 * backing store is compacted.
 */

#define PF_INTERVAL	3	/* time between free level checks */
#define PF_LATENCY	10	/* number of intervals before release */

static int dp_pages_free_low_count = 0;
thread_call_t default_pager_backing_store_monitor_callout;

void
default_pager_backing_store_monitor(__unused thread_call_param_t p1,
                                    __unused thread_call_param_t p2)
{
//    unsigned long long average;
    ipc_port_t trigger;
    uint64_t deadline;

    /*
     * We determine whether it will be safe to release some
     * backing store by watching the free page level. If
     * it remains below the maximum_pages_free threshold for
     * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
     * then we deem it safe.
     *
     * Note that this establishes a maximum rate at which backing
     * store will be released, as each notification (currently)
     * only results in a single backing store object being
     * released.
     */
    if (dp_pages_free > maximum_pages_free) {
        dp_pages_free_low_count++;
    } else {
        dp_pages_free_low_count = 0;
    }

    /* decide whether to send notification */
    trigger = IP_NULL;
    if (max_pages_trigger_port &&
        (backing_store_release_trigger_disable == 0) &&
        (dp_pages_free_low_count > PF_LATENCY)) {
        trigger = max_pages_trigger_port;
        max_pages_trigger_port = NULL;
    }

    /* send notification */
    if (trigger != IP_NULL) {
        if (backing_store_release_trigger_disable != 0) {
            assert_wait((event_t)
                        &backing_store_release_trigger_disable,
                        THREAD_UNINT);
            thread_block(THREAD_CONTINUE_NULL);
        }
        default_pager_space_alert(trigger, LO_WAT_ALERT);
        ipc_port_release_send(trigger);
        dp_pages_free_low_count = 0;
    }

    clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
    thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
}