1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /*
26 * @OSF_COPYRIGHT@
27 */
28 /*
29 * Mach Operating System
30 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
31 * All Rights Reserved.
32 *
33 * Permission to use, copy, modify and distribute this software and its
34 * documentation is hereby granted, provided that both the copyright
35 * notice and this permission notice appear in all copies of the
36 * software, derivative works or modified versions, and any portions
37 * thereof, and that both notices appear in supporting documentation.
38 *
39 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
40 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
41 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
42 *
43 * Carnegie Mellon requests users of this software to return to
44 *
45 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
46 * School of Computer Science
47 * Carnegie Mellon University
48 * Pittsburgh PA 15213-3890
49 *
50 * any improvements or extensions that they make and grant Carnegie Mellon
51 * the rights to redistribute these changes.
52 */
53
54 /*
55 * Default Pager.
56 * Paging File Management.
57 */
58
59 #include <mach/memory_object_control.h>
60 #include <mach/memory_object_server.h>
61 #include "default_pager_internal.h"
62 #include <default_pager/default_pager_alerts.h>
63 #include <ipc/ipc_port.h>
64 #include <ipc/ipc_space.h>
65 #include <kern/queue.h>
66 #include <kern/counters.h>
67 #include <kern/sched_prim.h>
68 #include <vm/vm_kern.h>
69 #include <vm/vm_pageout.h>
70 /* CDY CDY */
71 #include <vm/vm_map.h>
72
73 /*
74 * ALLOC_STRIDE... the maximum number of bytes allocated from
75 * a swap file before moving on to the next swap file... if
76 * all swap files reside on a single disk, this value should
77 * be very large (this is the default assumption)... if the
78 * swap files are spread across multiple disks, then this value
79 * should be small (128 * 1024)...
80 *
81 * This should be determined dynamically in the future
82 */
83
84 #define ALLOC_STRIDE (1024 * 1024 * 1024)
85 int physical_transfer_cluster_count = 0;
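/*
 * Illustrative arithmetic (not part of the original source): ps_select_segment
 * below rotates to the next segment once physical_transfer_cluster_count
 * reaches ALLOC_STRIDE >> (ps_clshift + vm_page_shift).  Assuming 4K pages
 * (vm_page_shift == 12) and the default 4-page clusters (ps_clshift == 2),
 * that is (1 << 30) >> 14 == 65536 clusters, i.e. one full 1GB stride.
 */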
86
87 #define VM_SUPER_CLUSTER 0x40000
88 #define VM_SUPER_PAGES 64
89
90 /*
91 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
92 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
93 */
94 #define VSTRUCT_DEF_CLSHIFT 2
95 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
96 int default_pager_clsize = 0;
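/*
 * Example (illustrative): with VSTRUCT_DEF_CLSHIFT of 2, a cluster is
 * 1 << 2 == 4 pages, i.e. 16KB on a system with 4K pages.
 */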
97
98 /* statistics */
99 unsigned int clustered_writes[VM_SUPER_PAGES+1];
100 unsigned int clustered_reads[VM_SUPER_PAGES+1];
101
102 /*
103 * Globals used for asynchronous paging operations:
104 * vs_async_list: head of list of to-be-completed I/O ops
105 * async_num_queued: number of pages completed, but not yet
106 * processed by async thread.
107 * async_requests_out: number of pages of requests not completed.
108 */
109
110 #if 0
111 struct vs_async *vs_async_list;
112 int async_num_queued;
113 int async_requests_out;
114 #endif
115
116
117 #define VS_ASYNC_REUSE 1
118 struct vs_async *vs_async_free_list;
119
120 mutex_t default_pager_async_lock; /* Protects globals above */
121
122
123 int vs_alloc_async_failed = 0; /* statistics */
124 int vs_alloc_async_count = 0; /* statistics */
125 struct vs_async *vs_alloc_async(void); /* forward */
126 void vs_free_async(struct vs_async *vsa); /* forward */
127
128
129 #define VS_ALLOC_ASYNC() vs_alloc_async()
130 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
131
132 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
133 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
134 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \
135 ETAP_IO_DEV_PAGEH)
136 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
137 /*
138 * Paging Space Hysteresis triggers and the target notification port
139 *
140 */
141
142 unsigned int minimum_pages_remaining = 0;
143 unsigned int maximum_pages_free = 0;
144 ipc_port_t min_pages_trigger_port = NULL;
145 ipc_port_t max_pages_trigger_port = NULL;
146
147 boolean_t bs_low = FALSE;
148 int backing_store_release_trigger_disable = 0;
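/*
 * min_pages_trigger_port is sent a HI_WAT_ALERT when dp_pages_free drops
 * below minimum_pages_remaining; max_pages_trigger_port is sent a
 * LO_WAT_ALERT when dp_pages_free rises back above maximum_pages_free
 * (see ps_allocate_cluster and ps_deallocate_cluster).
 */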
149
150
151
152 /*
153 * Object sizes are rounded up to the next power of 2,
154 * unless they are bigger than a given maximum size.
155 */
156 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
157
158 /*
159 * List of all backing store and segments.
160 */
161 struct backing_store_list_head backing_store_list;
162 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
163 mutex_t paging_segments_lock;
164 int paging_segment_max = 0;
165 int paging_segment_count = 0;
166 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
167
168
169 /*
170 * Total pages free in system
171 * This differs from clusters committed/avail which is a measure of the
172 * over commitment of paging segments to backing store. An idea which is
173 * likely to be deprecated.
174 */
175 unsigned int dp_pages_free = 0;
176 unsigned int cluster_transfer_minimum = 100;
177
178 kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */
179 kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
180
181
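/*
 * Reserve one of the default pager's internal read buffers, sleeping on
 * dpt_lock until one becomes free; the returned entry stays checked out
 * until the caller clears its checked_out flag.
 */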
182 default_pager_thread_t *
183 get_read_buffer()
184 {
185 int i;
186
187 DPT_LOCK(dpt_lock);
188 while(TRUE) {
189 for (i=0; i<default_pager_internal_count; i++) {
190 if(dpt_array[i]->checked_out == FALSE) {
191 dpt_array[i]->checked_out = TRUE;
192 DPT_UNLOCK(dpt_lock);
193 return dpt_array[i];
194 }
195 }
196 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
197 }
198 }
199
200 void
201 bs_initialize(void)
202 {
203 int i;
204
205 /*
206 * List of all backing store.
207 */
208 BSL_LOCK_INIT();
209 queue_init(&backing_store_list.bsl_queue);
210 PSL_LOCK_INIT();
211
212 VS_ASYNC_LOCK_INIT();
213 #if VS_ASYNC_REUSE
214 vs_async_free_list = NULL;
215 #endif /* VS_ASYNC_REUSE */
216
217 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
218 clustered_writes[i] = 0;
219 clustered_reads[i] = 0;
220 }
221
222 }
223
224 /*
225 * When things do not quite work out...
226 */
227 void bs_no_paging_space(boolean_t); /* forward */
228
229 void
230 bs_no_paging_space(
231 boolean_t out_of_memory)
232 {
233
234 if (out_of_memory)
235 dprintf(("*** OUT OF MEMORY ***\n"));
236 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
237 }
238
239 void bs_more_space(int); /* forward */
240 void bs_commit(int); /* forward */
241
242 boolean_t user_warned = FALSE;
243 unsigned int clusters_committed = 0;
244 unsigned int clusters_available = 0;
245 unsigned int clusters_committed_peak = 0;
246
247 void
248 bs_more_space(
249 int nclusters)
250 {
251 BSL_LOCK();
252 /*
253 * Account for new paging space.
254 */
255 clusters_available += nclusters;
256
257 if (clusters_available >= clusters_committed) {
258 if (verbose && user_warned) {
259 printf("%s%s - %d excess clusters now.\n",
260 my_name,
261 "paging space is OK now",
262 clusters_available - clusters_committed);
263 user_warned = FALSE;
264 clusters_committed_peak = 0;
265 }
266 } else {
267 if (verbose && user_warned) {
268 printf("%s%s - still short of %d clusters.\n",
269 my_name,
270 "WARNING: paging space over-committed",
271 clusters_committed - clusters_available);
272 clusters_committed_peak -= nclusters;
273 }
274 }
275 BSL_UNLOCK();
276
277 return;
278 }
279
280 void
281 bs_commit(
282 int nclusters)
283 {
284 BSL_LOCK();
285 clusters_committed += nclusters;
286 if (clusters_committed > clusters_available) {
287 if (verbose && !user_warned) {
288 user_warned = TRUE;
289 printf("%s%s - short of %d clusters.\n",
290 my_name,
291 "WARNING: paging space over-committed",
292 clusters_committed - clusters_available);
293 }
294 if (clusters_committed > clusters_committed_peak) {
295 clusters_committed_peak = clusters_committed;
296 }
297 } else {
298 if (verbose && user_warned) {
299 printf("%s%s - was short of up to %d clusters.\n",
300 my_name,
301 "paging space is OK now",
302 clusters_committed_peak - clusters_available);
303 user_warned = FALSE;
304 clusters_committed_peak = 0;
305 }
306 }
307 BSL_UNLOCK();
308
309 return;
310 }
311
312 int default_pager_info_verbose = 1;
313
314 void
315 bs_global_info(
316 vm_size_t *totalp,
317 vm_size_t *freep)
318 {
319 vm_size_t pages_total, pages_free;
320 paging_segment_t ps;
321 int i;
322
323 PSL_LOCK();
324 pages_total = pages_free = 0;
325 for (i = 0; i <= paging_segment_max; i++) {
326 ps = paging_segments[i];
327 if (ps == PAGING_SEGMENT_NULL)
328 continue;
329
330 /*
331 * no need to lock: by the time this data
332 * gets back to any remote requestor it
333 * will be obsolete anyways
334 */
335 pages_total += ps->ps_pgnum;
336 pages_free += ps->ps_clcount << ps->ps_clshift;
337 DEBUG(DEBUG_BS_INTERNAL,
338 ("segment #%d: %d total, %d free\n",
339 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
340 }
341 *totalp = pages_total;
342 *freep = pages_free;
343 if (verbose && user_warned && default_pager_info_verbose) {
344 if (clusters_available < clusters_committed) {
345 printf("%s %d clusters committed, %d available.\n",
346 my_name,
347 clusters_committed,
348 clusters_available);
349 }
350 }
351 PSL_UNLOCK();
352 }
353
354 backing_store_t backing_store_alloc(void); /* forward */
355
356 backing_store_t
357 backing_store_alloc(void)
358 {
359 backing_store_t bs;
360
361 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
362 if (bs == BACKING_STORE_NULL)
363 panic("backing_store_alloc: no memory");
364
365 BS_LOCK_INIT(bs);
366 bs->bs_port = MACH_PORT_NULL;
367 bs->bs_priority = 0;
368 bs->bs_clsize = 0;
369 bs->bs_pages_total = 0;
370 bs->bs_pages_in = 0;
371 bs->bs_pages_in_fail = 0;
372 bs->bs_pages_out = 0;
373 bs->bs_pages_out_fail = 0;
374
375 return bs;
376 }
377
378 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
379
380 /* In both the component space and external versions of this pager, */
381 /* backing_store_lookup will be called from tasks in the application space */
382 backing_store_t
383 backing_store_lookup(
384 MACH_PORT_FACE port)
385 {
386 backing_store_t bs;
387
388 /*
389 The port is currently backed with a vs structure in the alias field.
390 We could create an ISBS alias and a port_is_bs call, but frankly
391 I see no reason for the test; the bs->port == port check below
392 will work properly on junk entries.
393
394 if ((port == MACH_PORT_NULL) || port_is_vs(port))
395 */
396 if ((port == MACH_PORT_NULL))
397 return BACKING_STORE_NULL;
398
399 BSL_LOCK();
400 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
401 bs_links) {
402 BS_LOCK(bs);
403 if (bs->bs_port == port) {
404 BSL_UNLOCK();
405 /* Success, return it locked. */
406 return bs;
407 }
408 BS_UNLOCK(bs);
409 }
410 BSL_UNLOCK();
411 return BACKING_STORE_NULL;
412 }
413
414 void backing_store_add(backing_store_t); /* forward */
415
416 void
417 backing_store_add(
418 backing_store_t bs)
419 {
420 MACH_PORT_FACE port = bs->bs_port;
421 MACH_PORT_FACE pset = default_pager_default_set;
422 kern_return_t kr = KERN_SUCCESS;
423
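/*
 * Note: the port-set insertion that presumably once set "kr" appears to
 * have been removed, so kr is always KERN_SUCCESS here and the panic
 * below cannot fire; port and pset remain only for that vestigial code.
 */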
424 if (kr != KERN_SUCCESS)
425 panic("backing_store_add: add to set");
426
427 }
428
429 /*
430 * Set up default page shift, but only if not already
431 * set and argument is within range.
432 */
433 boolean_t
434 bs_set_default_clsize(unsigned int npages)
435 {
436 switch(npages){
437 case 1:
438 case 2:
439 case 4:
440 case 8:
441 if (default_pager_clsize == 0) /* if not yet set */
442 vstruct_def_clshift = local_log2(npages);
443 return(TRUE);
444 }
445 return(FALSE);
446 }
447
448 int bs_get_global_clsize(int clsize); /* forward */
449
450 int
451 bs_get_global_clsize(
452 int clsize)
453 {
454 int i;
455 memory_object_default_t dmm;
456 kern_return_t kr;
457
458 /*
459 * Only allow setting of cluster size once. If called
460 * with no cluster size (default), we use the compiled-in default
461 * for the duration. The same cluster size is used for all
462 * paging segments.
463 */
464 if (default_pager_clsize == 0) {
465 /*
466 * Keep cluster size in bit shift because it's quicker
467 * arithmetic, and easier to keep at a power of 2.
468 */
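/*
 * The loop below rounds clsize up to a power of two by finding the
 * smallest shift i with (1 << i) >= clsize; e.g. clsize 3 gives i == 2,
 * i.e. 4 pages per cluster, capped at MAX_CLUSTER_SHIFT.
 */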
469 if (clsize != NO_CLSIZE) {
470 for (i = 0; (1 << i) < clsize; i++);
471 if (i > MAX_CLUSTER_SHIFT)
472 i = MAX_CLUSTER_SHIFT;
473 vstruct_def_clshift = i;
474 }
475 default_pager_clsize = (1 << vstruct_def_clshift);
476
477 /*
478 * Let the user know the new (and definitive) cluster size.
479 */
480 if (verbose)
481 printf("%scluster size = %d page%s\n",
482 my_name, default_pager_clsize,
483 (default_pager_clsize == 1) ? "" : "s");
484
485 /*
486 * Let the kernel know too, in case it hasn't used the
487 * default value provided in main() yet.
488 */
489 dmm = default_pager_object;
490 clsize = default_pager_clsize * vm_page_size; /* in bytes */
491 kr = host_default_memory_manager(host_priv_self(),
492 &dmm,
493 clsize);
494 memory_object_default_deallocate(dmm);
495
496 if (kr != KERN_SUCCESS) {
497 panic("bs_get_global_cl_size:host_default_memory_manager");
498 }
499 if (dmm != default_pager_object) {
500 panic("bs_get_global_cl_size:there is another default pager");
501 }
502 }
503 ASSERT(default_pager_clsize > 0 &&
504 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
505
506 return default_pager_clsize;
507 }
508
509 kern_return_t
510 default_pager_backing_store_create(
511 memory_object_default_t pager,
512 int priority,
513 int clsize, /* in bytes */
514 MACH_PORT_FACE *backing_store)
515 {
516 backing_store_t bs;
517 MACH_PORT_FACE port;
518 kern_return_t kr;
519 struct vstruct_alias *alias_struct;
520
521 if (pager != default_pager_object)
522 return KERN_INVALID_ARGUMENT;
523
524 bs = backing_store_alloc();
525 port = ipc_port_alloc_kernel();
526 ipc_port_make_send(port);
527 assert (port != IP_NULL);
528
529 DEBUG(DEBUG_BS_EXTERNAL,
530 ("priority=%d clsize=%d bs_port=0x%x\n",
531 priority, clsize, (int) backing_store));
532
533 alias_struct = (struct vstruct_alias *)
534 kalloc(sizeof (struct vstruct_alias));
535 if(alias_struct != NULL) {
536 alias_struct->vs = (struct vstruct *)bs;
537 alias_struct->name = ISVS;
538 port->alias = (int) alias_struct;
539 }
540 else {
541 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
542 kfree((vm_offset_t)bs, sizeof (struct backing_store));
543 return KERN_RESOURCE_SHORTAGE;
544 }
545
546 bs->bs_port = port;
547 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
548 priority = BS_MAXPRI;
549 else if (priority == BS_NOPRI)
550 priority = BS_MAXPRI;
551 else
552 priority = BS_MINPRI;
553 bs->bs_priority = priority;
554
555 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
556
557 BSL_LOCK();
558 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
559 bs_links);
560 BSL_UNLOCK();
561
562 backing_store_add(bs);
563
564 *backing_store = port;
565 return KERN_SUCCESS;
566 }
567
568 kern_return_t
569 default_pager_backing_store_info(
570 MACH_PORT_FACE backing_store,
571 backing_store_flavor_t flavour,
572 backing_store_info_t info,
573 mach_msg_type_number_t *size)
574 {
575 backing_store_t bs;
576 backing_store_basic_info_t basic;
577 int i;
578 paging_segment_t ps;
579
580 if (flavour != BACKING_STORE_BASIC_INFO ||
581 *size < BACKING_STORE_BASIC_INFO_COUNT)
582 return KERN_INVALID_ARGUMENT;
583
584 basic = (backing_store_basic_info_t)info;
585 *size = BACKING_STORE_BASIC_INFO_COUNT;
586
587 VSTATS_LOCK(&global_stats.gs_lock);
588 basic->pageout_calls = global_stats.gs_pageout_calls;
589 basic->pagein_calls = global_stats.gs_pagein_calls;
590 basic->pages_in = global_stats.gs_pages_in;
591 basic->pages_out = global_stats.gs_pages_out;
592 basic->pages_unavail = global_stats.gs_pages_unavail;
593 basic->pages_init = global_stats.gs_pages_init;
594 basic->pages_init_writes= global_stats.gs_pages_init_writes;
595 VSTATS_UNLOCK(&global_stats.gs_lock);
596
597 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
598 return KERN_INVALID_ARGUMENT;
599
600 basic->bs_pages_total = bs->bs_pages_total;
601 PSL_LOCK();
602 bs->bs_pages_free = 0;
603 for (i = 0; i <= paging_segment_max; i++) {
604 ps = paging_segments[i];
605 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
606 PS_LOCK(ps);
607 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
608 PS_UNLOCK(ps);
609 }
610 }
611 PSL_UNLOCK();
612 basic->bs_pages_free = bs->bs_pages_free;
613 basic->bs_pages_in = bs->bs_pages_in;
614 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
615 basic->bs_pages_out = bs->bs_pages_out;
616 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
617
618 basic->bs_priority = bs->bs_priority;
619 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
620
621 BS_UNLOCK(bs);
622
623 return KERN_SUCCESS;
624 }
625
626 int ps_delete(paging_segment_t); /* forward */
627
628 int
629 ps_delete(
630 paging_segment_t ps)
631 {
632 vstruct_t vs;
633 kern_return_t error = KERN_SUCCESS;
634 int vs_count;
635
636 VSL_LOCK(); /* get the lock on the list of vs's */
637
638 /* The lock relationship and sequence are fairly complicated */
639 /* this code looks at a live list, locking and unlocking the list */
640 /* as it traverses it. It depends on the locking behavior of */
641 /* default_pager_no_senders. no_senders always locks the vstruct */
642 /* targeted for removal before locking the vstruct list. However */
643 /* it will remove that member of the list without locking its */
644 /* neighbors. We can be sure when we hold a lock on a vstruct */
645 /* it cannot be removed from the list but we must hold the list */
646 /* lock to be sure that its pointers to its neighbors are valid. */
647 /* Also, we can hold off destruction of a vstruct when the list */
648 /* lock and the vs locks are not being held by bumping the */
649 /* vs_async_pending count. */
650
651
652 while(backing_store_release_trigger_disable != 0) {
653 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
654 }
655
656 /* we will choose instead to hold a send right */
657 vs_count = vstruct_list.vsl_count;
658 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
659 if(vs == (vstruct_t)&vstruct_list) {
660 VSL_UNLOCK();
661 return KERN_SUCCESS;
662 }
663 VS_LOCK(vs);
664 vs_async_wait(vs); /* wait for any pending async writes */
665 if ((vs_count != 0) && (vs != NULL))
666 vs->vs_async_pending += 1; /* hold parties calling */
667 /* vs_async_wait */
668 VS_UNLOCK(vs);
669 VSL_UNLOCK();
670 while((vs_count != 0) && (vs != NULL)) {
671 /* We take the count of AMO's before beginning the */
672 /* transfer of the target segment. */
673 /* We are guaranteed that the target segment cannot get */
674 /* more users. We also know that queue entries are */
675 /* made at the back of the list. If some of the entries */
676 /* we would check disappear while we are traversing the */
677 /* list then we will either check new entries which */
678 /* do not have any backing store in the target segment */
679 /* or re-check old entries. This might not be optimal */
680 /* but it will always be correct. The alternative is to */
681 /* take a snapshot of the list. */
682 vstruct_t next_vs;
683
684 if(dp_pages_free < cluster_transfer_minimum)
685 error = KERN_FAILURE;
686 else {
687 vm_object_t transfer_object;
688 int count;
689 upl_t upl;
690
691 transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
692 count = 0;
693 error = vm_object_upl_request(transfer_object,
694 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
695 &upl, NULL, &count,
696 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
697 | UPL_SET_INTERNAL);
698 if(error == KERN_SUCCESS) {
699 error = ps_vstruct_transfer_from_segment(
700 vs, ps, upl);
701 upl_commit(upl, NULL);
702 upl_deallocate(upl);
703 } else {
704 error = KERN_FAILURE;
705 }
706 vm_object_deallocate(transfer_object);
707 }
708 if(error) {
709 VS_LOCK(vs);
710 vs->vs_async_pending -= 1; /* release vs_async_wait */
711 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
712 vs->vs_waiting_async = FALSE;
713 VS_UNLOCK(vs);
714 thread_wakeup(&vs->vs_async_pending);
715 } else {
716 VS_UNLOCK(vs);
717 }
718 return KERN_FAILURE;
719 }
720
721 VSL_LOCK();
722
723 while(backing_store_release_trigger_disable != 0) {
724 VSL_SLEEP(&backing_store_release_trigger_disable,
725 THREAD_UNINT);
726 }
727
728 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
729 if((next_vs != (vstruct_t)&vstruct_list) &&
730 (vs != next_vs) && (vs_count != 1)) {
731 VS_LOCK(next_vs);
732 vs_async_wait(next_vs); /* wait for any */
733 /* pending async writes */
734 next_vs->vs_async_pending += 1; /* hold parties */
735 /* calling vs_async_wait */
736 VS_UNLOCK(next_vs);
737 }
738 VSL_UNLOCK();
739 VS_LOCK(vs);
740 vs->vs_async_pending -= 1;
741 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
742 vs->vs_waiting_async = FALSE;
743 VS_UNLOCK(vs);
744 thread_wakeup(&vs->vs_async_pending);
745 } else {
746 VS_UNLOCK(vs);
747 }
748 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
749 vs = NULL;
750 else
751 vs = next_vs;
752 vs_count--;
753 }
754 return KERN_SUCCESS;
755 }
756
757
758 kern_return_t
759 default_pager_backing_store_delete(
760 MACH_PORT_FACE backing_store)
761 {
762 backing_store_t bs;
763 int i;
764 paging_segment_t ps;
765 int error;
766 int interim_pages_removed = 0;
767 kern_return_t kr;
768
769 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
770 return KERN_INVALID_ARGUMENT;
771
772 #if 0
773 /* not implemented */
774 BS_UNLOCK(bs);
775 return KERN_FAILURE;
776 #endif
777
778 restart:
779 PSL_LOCK();
780 error = KERN_SUCCESS;
781 for (i = 0; i <= paging_segment_max; i++) {
782 ps = paging_segments[i];
783 if (ps != PAGING_SEGMENT_NULL &&
784 ps->ps_bs == bs &&
785 ! ps->ps_going_away) {
786 PS_LOCK(ps);
787 /* disable access to this segment */
788 ps->ps_going_away = TRUE;
789 PS_UNLOCK(ps);
790 /*
791 * The "ps" segment is "off-line" now,
792 * we can try and delete it...
793 */
794 if(dp_pages_free < (cluster_transfer_minimum
795 + ps->ps_pgcount)) {
796 error = KERN_FAILURE;
797 PSL_UNLOCK();
798 }
799 else {
800 /* remove all pages associated with the */
801 /* segment from the list of free pages */
802 /* when transfer is through, all target */
803 /* segment pages will appear to be free */
804
805 dp_pages_free -= ps->ps_pgcount;
806 interim_pages_removed += ps->ps_pgcount;
807 PSL_UNLOCK();
808 error = ps_delete(ps);
809 }
810 if (error != KERN_SUCCESS) {
811 /*
812 * We couldn't delete the segment,
813 * probably because there's not enough
814 * virtual memory left.
815 * Re-enable all the segments.
816 */
817 PSL_LOCK();
818 break;
819 }
820 goto restart;
821 }
822 }
823
824 if (error != KERN_SUCCESS) {
825 for (i = 0; i <= paging_segment_max; i++) {
826 ps = paging_segments[i];
827 if (ps != PAGING_SEGMENT_NULL &&
828 ps->ps_bs == bs &&
829 ps->ps_going_away) {
830 PS_LOCK(ps);
831 /* re-enable access to this segment */
832 ps->ps_going_away = FALSE;
833 PS_UNLOCK(ps);
834 }
835 }
836 dp_pages_free += interim_pages_removed;
837 PSL_UNLOCK();
838 BS_UNLOCK(bs);
839 return error;
840 }
841
842 for (i = 0; i <= paging_segment_max; i++) {
843 ps = paging_segments[i];
844 if (ps != PAGING_SEGMENT_NULL &&
845 ps->ps_bs == bs) {
846 if(ps->ps_going_away) {
847 paging_segments[i] = PAGING_SEGMENT_NULL;
848 paging_segment_count--;
849 PS_LOCK(ps);
850 kfree((vm_offset_t)ps->ps_bmap,
851 RMAPSIZE(ps->ps_ncls));
852 kfree((vm_offset_t)ps, sizeof *ps);
853 }
854 }
855 }
856
857 /* Scan the entire ps array separately to make certain we find the */
858 /* proper paging_segment_max */
859 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
860 if(paging_segments[i] != PAGING_SEGMENT_NULL)
861 paging_segment_max = i;
862 }
863
864 PSL_UNLOCK();
865
866 /*
867 * All the segments have been deleted.
868 * We can remove the backing store.
869 */
870
871 /*
872 * Disable lookups of this backing store.
873 */
874 if((void *)bs->bs_port->alias != NULL)
875 kfree((vm_offset_t) bs->bs_port->alias,
876 sizeof (struct vstruct_alias));
877 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
878 bs->bs_port = MACH_PORT_NULL;
879 BS_UNLOCK(bs);
880
881 /*
882 * Remove backing store from backing_store list.
883 */
884 BSL_LOCK();
885 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
886 bs_links);
887 BSL_UNLOCK();
888
889 /*
890 * Free the backing store structure.
891 */
892 kfree((vm_offset_t)bs, sizeof *bs);
893
894 return KERN_SUCCESS;
895 }
896
897 int ps_enter(paging_segment_t); /* forward */
898
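/*
 * Insert a new paging segment into the first free slot of the global
 * paging_segments[] array, updating paging_segment_max and
 * paging_segment_count and re-arming ps_select_array for the segment's
 * priority.  Returns 0 on success or KERN_RESOURCE_SHORTAGE if the
 * array is full.
 */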
899 int
900 ps_enter(
901 paging_segment_t ps)
902 {
903 int i;
904
905 PSL_LOCK();
906
907 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
908 if (paging_segments[i] == PAGING_SEGMENT_NULL)
909 break;
910 }
911
912 if (i < MAX_NUM_PAGING_SEGMENTS) {
913 paging_segments[i] = ps;
914 if (i > paging_segment_max)
915 paging_segment_max = i;
916 paging_segment_count++;
917 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
918 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
919 ps_select_array[ps->ps_bs->bs_priority] = 0;
920 i = 0;
921 } else {
922 PSL_UNLOCK();
923 return KERN_RESOURCE_SHORTAGE;
924 }
925
926 PSL_UNLOCK();
927 return i;
928 }
929
930 #ifdef DEVICE_PAGING
931 kern_return_t
932 default_pager_add_segment(
933 MACH_PORT_FACE backing_store,
934 MACH_PORT_FACE device,
935 recnum_t offset,
936 recnum_t count,
937 int record_size)
938 {
939 backing_store_t bs;
940 paging_segment_t ps;
941 int i;
942 int error;
943
944 if ((bs = backing_store_lookup(backing_store))
945 == BACKING_STORE_NULL)
946 return KERN_INVALID_ARGUMENT;
947
948 PSL_LOCK();
949 for (i = 0; i <= paging_segment_max; i++) {
950 ps = paging_segments[i];
951 if (ps == PAGING_SEGMENT_NULL)
952 continue;
953
954 /*
955 * Check for overlap on same device.
956 */
957 if (!(ps->ps_device != device
958 || offset >= ps->ps_offset + ps->ps_recnum
959 || offset + count <= ps->ps_offset)) {
960 PSL_UNLOCK();
961 BS_UNLOCK(bs);
962 return KERN_INVALID_ARGUMENT;
963 }
964 }
965 PSL_UNLOCK();
966
967 /*
968 * Set up the paging segment
969 */
970 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
971 if (ps == PAGING_SEGMENT_NULL) {
972 BS_UNLOCK(bs);
973 return KERN_RESOURCE_SHORTAGE;
974 }
975
976 ps->ps_segtype = PS_PARTITION;
977 ps->ps_device = device;
978 ps->ps_offset = offset;
979 ps->ps_record_shift = local_log2(vm_page_size / record_size);
980 ps->ps_recnum = count;
981 ps->ps_pgnum = count >> ps->ps_record_shift;
982
983 ps->ps_pgcount = ps->ps_pgnum;
984 ps->ps_clshift = local_log2(bs->bs_clsize);
985 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
986 ps->ps_hint = 0;
987
988 PS_LOCK_INIT(ps);
989 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
990 if (!ps->ps_bmap) {
991 kfree((vm_offset_t)ps, sizeof *ps);
992 BS_UNLOCK(bs);
993 return KERN_RESOURCE_SHORTAGE;
994 }
995 for (i = 0; i < ps->ps_ncls; i++) {
996 clrbit(ps->ps_bmap, i);
997 }
998
999 ps->ps_going_away = FALSE;
1000 ps->ps_bs = bs;
1001
1002 if ((error = ps_enter(ps)) != 0) {
1003 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1004 kfree((vm_offset_t)ps, sizeof *ps);
1005 BS_UNLOCK(bs);
1006 return KERN_RESOURCE_SHORTAGE;
1007 }
1008
1009 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1010 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1011 BS_UNLOCK(bs);
1012
1013 PSL_LOCK();
1014 dp_pages_free += ps->ps_pgcount;
1015 PSL_UNLOCK();
1016
1017 bs_more_space(ps->ps_clcount);
1018
1019 DEBUG(DEBUG_BS_INTERNAL,
1020 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1021 device, offset, count, record_size,
1022 ps->ps_record_shift, ps->ps_pgnum));
1023
1024 return KERN_SUCCESS;
1025 }
1026
1027 boolean_t
1028 bs_add_device(
1029 char *dev_name,
1030 MACH_PORT_FACE master)
1031 {
1032 security_token_t null_security_token = {
1033 { 0, 0 }
1034 };
1035 MACH_PORT_FACE device;
1036 int info[DEV_GET_SIZE_COUNT];
1037 mach_msg_type_number_t info_count;
1038 MACH_PORT_FACE bs = MACH_PORT_NULL;
1039 unsigned int rec_size;
1040 recnum_t count;
1041 int clsize;
1042 MACH_PORT_FACE reply_port;
1043
1044 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1045 null_security_token, dev_name, &device))
1046 return FALSE;
1047
1048 info_count = DEV_GET_SIZE_COUNT;
1049 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1050 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1051 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1052 clsize = bs_get_global_clsize(0);
1053 if (!default_pager_backing_store_create(
1054 default_pager_object,
1055 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1056 (clsize * vm_page_size),
1057 &bs)) {
1058 if (!default_pager_add_segment(bs, device,
1059 0, count, rec_size)) {
1060 return TRUE;
1061 }
1062 ipc_port_release_receive(bs);
1063 }
1064 }
1065
1066 ipc_port_release_send(device);
1067 return FALSE;
1068 }
1069 #endif /* DEVICE_PAGING */
1070
1071 #if VS_ASYNC_REUSE
1072
1073 struct vs_async *
1074 vs_alloc_async(void)
1075 {
1076 struct vs_async *vsa;
1077 MACH_PORT_FACE reply_port;
1078 kern_return_t kr;
1079
1080 VS_ASYNC_LOCK();
1081 if (vs_async_free_list == NULL) {
1082 VS_ASYNC_UNLOCK();
1083 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1084 if (vsa != NULL) {
1085 /*
1086 * Try allocating a reply port named after the
1087 * address of the vs_async structure.
1088 */
1089 struct vstruct_alias *alias_struct;
1090
1091 reply_port = ipc_port_alloc_kernel();
1092 alias_struct = (struct vstruct_alias *)
1093 kalloc(sizeof (struct vstruct_alias));
1094 if(alias_struct != NULL) {
1095 alias_struct->vs = (struct vstruct *)vsa;
1096 alias_struct->name = ISVS;
1097 reply_port->alias = (int) alias_struct;
1098 vsa->reply_port = reply_port;
1099 vs_alloc_async_count++;
1100 }
1101 else {
1102 vs_alloc_async_failed++;
1103 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1104 (reply_port));
1105 kfree((vm_offset_t)vsa,
1106 sizeof (struct vs_async));
1107 vsa = NULL;
1108 }
1109 }
1110 } else {
1111 vsa = vs_async_free_list;
1112 vs_async_free_list = vs_async_free_list->vsa_next;
1113 VS_ASYNC_UNLOCK();
1114 }
1115
1116 return vsa;
1117 }
1118
1119 void
1120 vs_free_async(
1121 struct vs_async *vsa)
1122 {
1123 VS_ASYNC_LOCK();
1124 vsa->vsa_next = vs_async_free_list;
1125 vs_async_free_list = vsa;
1126 VS_ASYNC_UNLOCK();
1127 }
1128
1129 #else /* VS_ASYNC_REUSE */
1130
1131 struct vs_async *
1132 vs_alloc_async(void)
1133 {
1134 struct vs_async *vsa;
1135 MACH_PORT_FACE reply_port;
1136 kern_return_t kr;
1137
1138 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1139 if (vsa != NULL) {
1140 /*
1141 * Try allocating a reply port named after the
1142 * address of the vs_async structure.
1143 */
struct vstruct_alias *alias_struct;
1144 reply_port = ipc_port_alloc_kernel();
1145 alias_struct = (struct vstruct_alias *)
1146 kalloc(sizeof (struct vstruct_alias));
1147 if(alias_struct != NULL) {
1148 alias_struct->vs = (struct vstruct *)vsa;
1149 alias_struct->name = ISVS;
1150 reply_port->alias = (int) alias_struct;
1151 vsa->reply_port = reply_port;
1152 vs_alloc_async_count++;
1153 }
1154 else {
1155 vs_alloc_async_failed++;
1156 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1157 (reply_port));
1158 kfree((vm_offset_t) vsa,
1159 sizeof (struct vs_async));
1160 vsa = NULL;
1161 }
1162 }
1163
1164 return vsa;
1165 }
1166
1167 void
1168 vs_free_async(
1169 struct vs_async *vsa)
1170 {
1171 MACH_PORT_FACE reply_port;
1172 kern_return_t kr;
1173
1174 reply_port = vsa->reply_port;
1175 kfree((vm_offset_t) reply_port->alias, sizeof (struct vstruct_alias));
1176 kfree((vm_offset_t) vsa, sizeof (struct vs_async));
1177 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1178 #if 0
1179 VS_ASYNC_LOCK();
1180 vs_alloc_async_count--;
1181 VS_ASYNC_UNLOCK();
1182 #endif
1183 }
1184
1185 #endif /* VS_ASYNC_REUSE */
1186
1187 zone_t vstruct_zone;
1188
1189 vstruct_t
1190 ps_vstruct_create(
1191 vm_size_t size)
1192 {
1193 vstruct_t vs;
1194 int i;
1195
1196 vs = (vstruct_t) zalloc(vstruct_zone);
1197 if (vs == VSTRUCT_NULL) {
1198 return VSTRUCT_NULL;
1199 }
1200
1201 VS_LOCK_INIT(vs);
1202
1203 /*
1204 * The following fields will be provided later.
1205 */
1206 vs->vs_mem_obj = NULL;
1207 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1208 vs->vs_references = 1;
1209 vs->vs_seqno = 0;
1210
1211 #ifdef MACH_KERNEL
1212 vs->vs_waiting_seqno = FALSE;
1213 vs->vs_waiting_read = FALSE;
1214 vs->vs_waiting_write = FALSE;
1215 vs->vs_waiting_async = FALSE;
1216 #else
1217 mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
1218 mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
1219 mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
1220 mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
1221 mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
1222 #endif
1223
1224 vs->vs_readers = 0;
1225 vs->vs_writers = 0;
1226
1227 vs->vs_errors = 0;
1228
1229 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1230 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
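/*
 * Illustrative example: for a 1MB object on a 4K-page system with the
 * default 4-page clusters (vs_clshift == 2), atop_32() yields 256 pages
 * and vs_size becomes ((256 - 1) >> 2) + 1 == 64 clusters.
 */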
1231 vs->vs_async_pending = 0;
1232
1233 /*
1234 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1235 * bytes, depending on the size of the memory object.
1236 */
1237 if (INDIRECT_CLMAP(vs->vs_size)) {
1238 vs->vs_imap = (struct vs_map **)
1239 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1240 vs->vs_indirect = TRUE;
1241 } else {
1242 vs->vs_dmap = (struct vs_map *)
1243 kalloc(CLMAP_SIZE(vs->vs_size));
1244 vs->vs_indirect = FALSE;
1245 }
1246 vs->vs_xfer_pending = FALSE;
1247 DEBUG(DEBUG_VS_INTERNAL,
1248 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1249
1250 /*
1251 * Check to see that we got the space.
1252 */
1253 if (!vs->vs_dmap) {
1254 kfree((vm_offset_t)vs, sizeof *vs);
1255 return VSTRUCT_NULL;
1256 }
1257
1258 /*
1259 * Zero the indirect pointers, or clear the direct pointers.
1260 */
1261 if (vs->vs_indirect)
1262 memset(vs->vs_imap, 0,
1263 INDIRECT_CLMAP_SIZE(vs->vs_size));
1264 else
1265 for (i = 0; i < vs->vs_size; i++)
1266 VSM_CLR(vs->vs_dmap[i]);
1267
1268 VS_MAP_LOCK_INIT(vs);
1269
1270 bs_commit(vs->vs_size);
1271
1272 return vs;
1273 }
1274
1275 paging_segment_t ps_select_segment(int, int *); /* forward */
1276
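/*
 * Choose a paging segment from which to allocate a cluster: scan the
 * priority bands from BS_MAXPRI down, skipping bands marked BS_NOPRI or
 * BS_FULLPRI, and rotate among a band's segments once ALLOC_STRIDE bytes
 * worth of clusters have gone to the current one.  On success the cluster
 * is reserved here (ps_clcount and dp_pages_free are decremented) and the
 * low-space trigger may be fired.
 */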
1277 paging_segment_t
1278 ps_select_segment(
1279 int shift,
1280 int *psindex)
1281 {
1282 paging_segment_t ps;
1283 int i;
1284 int j;
1285
1286 /*
1287 * Optimize case where there's only one segment.
1288 * paging_segment_max will index the one and only segment.
1289 */
1290
1291 PSL_LOCK();
1292 if (paging_segment_count == 1) {
1293 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1294 ipc_port_t trigger = IP_NULL;
1295
1296 ps = paging_segments[paging_segment_max];
1297 *psindex = paging_segment_max;
1298 PS_LOCK(ps);
1299 if (ps->ps_going_away) {
1300 /* this segment is being turned off */
1301 lps = PAGING_SEGMENT_NULL;
1302 } else {
1303 ASSERT(ps->ps_clshift >= shift);
1304 if (ps->ps_clcount) {
1305 ps->ps_clcount--;
1306 dp_pages_free -= 1 << ps->ps_clshift;
1307 if(min_pages_trigger_port &&
1308 (dp_pages_free < minimum_pages_remaining)) {
1309 trigger = min_pages_trigger_port;
1310 min_pages_trigger_port = NULL;
1311 bs_low = TRUE;
1312 }
1313 lps = ps;
1314 } else
1315 lps = PAGING_SEGMENT_NULL;
1316 }
1317 PS_UNLOCK(ps);
1318 PSL_UNLOCK();
1319
1320 if (trigger != IP_NULL) {
1321 default_pager_space_alert(trigger, HI_WAT_ALERT);
1322 ipc_port_release_send(trigger);
1323 }
1324 return lps;
1325 }
1326
1327 if (paging_segment_count == 0) {
1328 PSL_UNLOCK();
1329 return PAGING_SEGMENT_NULL;
1330 }
1331
1332 for (i = BS_MAXPRI;
1333 i >= BS_MINPRI; i--) {
1334 int start_index;
1335
1336 if ((ps_select_array[i] == BS_NOPRI) ||
1337 (ps_select_array[i] == BS_FULLPRI))
1338 continue;
1339 start_index = ps_select_array[i];
1340
1341 if(!(paging_segments[start_index])) {
1342 j = start_index+1;
1343 physical_transfer_cluster_count = 0;
1344 }
1345 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1346 (((paging_segments[start_index])->ps_clshift)
1347 + vm_page_shift))) {
1348 physical_transfer_cluster_count = 0;
1349 j = start_index + 1;
1350 } else {
1351 physical_transfer_cluster_count+=1;
1352 j = start_index;
1353 if(start_index == 0)
1354 start_index = paging_segment_max;
1355 else
1356 start_index = start_index - 1;
1357 }
1358
1359 while (1) {
1360 if (j > paging_segment_max)
1361 j = 0;
1362 if ((ps = paging_segments[j]) &&
1363 (ps->ps_bs->bs_priority == i)) {
1364 /*
1365 * Force the ps cluster size to be
1366 * >= that of the vstruct.
1367 */
1368 PS_LOCK(ps);
1369 if (ps->ps_going_away) {
1370 /* this segment is being turned off */
1371 } else if ((ps->ps_clcount) &&
1372 (ps->ps_clshift >= shift)) {
1373 ipc_port_t trigger = IP_NULL;
1374
1375 ps->ps_clcount--;
1376 dp_pages_free -= 1 << ps->ps_clshift;
1377 if(min_pages_trigger_port &&
1378 (dp_pages_free <
1379 minimum_pages_remaining)) {
1380 trigger = min_pages_trigger_port;
1381 min_pages_trigger_port = NULL;
1382 }
1383 PS_UNLOCK(ps);
1384 /*
1385 * found one, quit looking.
1386 */
1387 ps_select_array[i] = j;
1388 PSL_UNLOCK();
1389
1390 if (trigger != IP_NULL) {
1391 default_pager_space_alert(
1392 trigger,
1393 HI_WAT_ALERT);
1394 ipc_port_release_send(trigger);
1395 }
1396 *psindex = j;
1397 return ps;
1398 }
1399 PS_UNLOCK(ps);
1400 }
1401 if (j == start_index) {
1402 /*
1403 * none at this priority -- mark it full
1404 */
1405 ps_select_array[i] = BS_FULLPRI;
1406 break;
1407 }
1408 j++;
1409 }
1410 }
1411 PSL_UNLOCK();
1412 return PAGING_SEGMENT_NULL;
1413 }
1414
1415 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1416
1417 vm_offset_t
1418 ps_allocate_cluster(
1419 vstruct_t vs,
1420 int *psindex,
1421 paging_segment_t use_ps)
1422 {
1423 int byte_num;
1424 int bit_num = 0;
1425 paging_segment_t ps;
1426 vm_offset_t cluster;
1427 ipc_port_t trigger = IP_NULL;
1428
1429 /*
1430 * Find best paging segment.
1431 * ps_select_segment will decrement cluster count on ps.
1432 * Must pass cluster shift to find the most appropriate segment.
1433 */
1434 /* NOTE: The addition of paging segment delete capability threatened
1435 * to seriously complicate the treatment of paging segments in this
1436 * module and the ones that call it (notably ps_clmap), because of the
1437 * difficulty in assuring that the paging segment would continue to
1438 * exist between being unlocked and locked. This was
1439 * avoided because all calls to this module are based in either
1440 * dp_memory_object calls which rely on the vs lock, or by
1441 * the transfer function which is part of the segment delete path.
1442 * The transfer function which is part of paging segment delete is
1443 * protected from multiple callers by the backing store lock.
1444 * The paging segment delete function treats mappings to a paging
1445 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1446 * while data is transferred to the remaining segments. This is in
1447 * line with the view that incomplete or in-transition mappings between
1448 * data, a vstruct, and backing store are protected by the vs lock.
1449 * This and the ordering of the paging segment "going_away" bit setting
1450 * protects us.
1451 */
1452 if (use_ps != PAGING_SEGMENT_NULL) {
1453 ps = use_ps;
1454 PSL_LOCK();
1455 PS_LOCK(ps);
1456 ps->ps_clcount--;
1457 dp_pages_free -= 1 << ps->ps_clshift;
1458 if(min_pages_trigger_port &&
1459 (dp_pages_free < minimum_pages_remaining)) {
1460 trigger = min_pages_trigger_port;
1461 min_pages_trigger_port = NULL;
1462 }
1463 PSL_UNLOCK();
1464 PS_UNLOCK(ps);
1465 if (trigger != IP_NULL) {
1466 default_pager_space_alert(trigger, HI_WAT_ALERT);
1467 ipc_port_release_send(trigger);
1468 }
1469
1470 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1471 PAGING_SEGMENT_NULL) {
1472 #if 0
1473 bs_no_paging_space(TRUE);
1474 #endif
1475 #if 0
1476 if (verbose)
1477 #endif
1478 dprintf(("no space in available paging segments; "
1479 "swapon suggested\n"));
1480 /* the count may have drifted; reset it to zero */
1481 PSL_LOCK();
1482 dp_pages_free = 0;
1483 if(min_pages_trigger_port) {
1484 trigger = min_pages_trigger_port;
1485 min_pages_trigger_port = NULL;
1486 bs_low = TRUE;
1487 }
1488 PSL_UNLOCK();
1489 if (trigger != IP_NULL) {
1490 default_pager_space_alert(trigger, HI_WAT_ALERT);
1491 ipc_port_release_send(trigger);
1492 }
1493 return (vm_offset_t) -1;
1494 }
1495 ASSERT(ps->ps_clcount != 0);
1496
1497 /*
1498 * Look for an available cluster. At the end of the loop,
1499 * byte_num is the byte offset and bit_num is the bit offset of the
1500 * first zero bit in the paging segment bitmap.
1501 */
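/*
 * Worked example (illustrative, assuming the usual setbit/isclr bit
 * ordering): if ps_hint is 3 and byte 3 of ps_bmap has only its low four
 * bits set, the scan stops at byte_num 3, bit_num 4, and the allocated
 * cluster is 3 * NBBY + 4 == 28.
 */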
1502 PS_LOCK(ps);
1503 byte_num = ps->ps_hint;
1504 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1505 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1506 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1507 if (isclr((ps->ps_bmap + byte_num), bit_num))
1508 break;
1509 }
1510 ASSERT(bit_num != NBBY);
1511 break;
1512 }
1513 }
1514 ps->ps_hint = byte_num;
1515 cluster = (byte_num*NBBY) + bit_num;
1516
1517 /* Space was reserved, so this must be true */
1518 ASSERT(cluster < ps->ps_ncls);
1519
1520 setbit(ps->ps_bmap, cluster);
1521 PS_UNLOCK(ps);
1522
1523 return cluster;
1524 }
1525
1526 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1527
1528 void
1529 ps_deallocate_cluster(
1530 paging_segment_t ps,
1531 vm_offset_t cluster)
1532 {
1533 ipc_port_t trigger = IP_NULL;
1534
1535 if (cluster >= (vm_offset_t) ps->ps_ncls)
1536 panic("ps_deallocate_cluster: Invalid cluster number");
1537
1538 /*
1539 * Lock the paging segment, clear the cluster's bit in the bitmap and
1540 * increment the number of free clusters.
1541 */
1542 PSL_LOCK();
1543 PS_LOCK(ps);
1544 clrbit(ps->ps_bmap, cluster);
1545 ++ps->ps_clcount;
1546 dp_pages_free += 1 << ps->ps_clshift;
1547 if(max_pages_trigger_port
1548 && (backing_store_release_trigger_disable == 0)
1549 && (dp_pages_free > maximum_pages_free)) {
1550 trigger = max_pages_trigger_port;
1551 max_pages_trigger_port = NULL;
1552 }
1553 PSL_UNLOCK();
1554
1555 /*
1556 * Move the hint down to the freed cluster if it is
1557 * less than the current hint.
1558 */
1559 if ((cluster/NBBY) < ps->ps_hint) {
1560 ps->ps_hint = (cluster/NBBY);
1561 }
1562
1563 PS_UNLOCK(ps);
1564
1565 /*
1566 * If we're freeing space on a full priority, reset the array.
1567 */
1568 PSL_LOCK();
1569 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1570 ps_select_array[ps->ps_bs->bs_priority] = 0;
1571 PSL_UNLOCK();
1572
1573 if (trigger != IP_NULL) {
1574 VSL_LOCK();
1575 if(backing_store_release_trigger_disable != 0) {
1576 assert_wait((event_t)
1577 &backing_store_release_trigger_disable,
1578 THREAD_UNINT);
1579 VSL_UNLOCK();
1580 thread_block(THREAD_CONTINUE_NULL);
1581 } else {
1582 VSL_UNLOCK();
1583 }
1584 default_pager_space_alert(trigger, LO_WAT_ALERT);
1585 ipc_port_release_send(trigger);
1586 }
1587
1588 return;
1589 }
1590
1591 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1592
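/*
 * Walk a cluster map of "size" entries and return every allocated,
 * non-error cluster to its paging segment.
 */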
1593 void
1594 ps_dealloc_vsmap(
1595 struct vs_map *vsmap,
1596 vm_size_t size)
1597 {
1598 int i;
1599 for (i = 0; i < size; i++)
1600 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1601 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1602 VSM_CLOFF(vsmap[i]));
1603 }
1604
1605 void
1606 ps_vstruct_dealloc(
1607 vstruct_t vs)
1608 {
1609 int i;
1610 spl_t s;
1611
1612 VS_MAP_LOCK(vs);
1613
1614 /*
1615 * If this is an indirect structure, then we walk through the valid
1616 * (non-zero) indirect pointers and deallocate the clusters
1617 * associated with each used map entry (via ps_dealloc_vsmap).
1618 * When all of the clusters in an indirect block have been
1619 * freed, we deallocate the block. When all of the indirect
1620 * blocks have been deallocated we deallocate the memory
1621 * holding the indirect pointers.
1622 */
1623 if (vs->vs_indirect) {
1624 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1625 if (vs->vs_imap[i] != NULL) {
1626 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1627 kfree((vm_offset_t)vs->vs_imap[i],
1628 CLMAP_THRESHOLD);
1629 }
1630 }
1631 kfree((vm_offset_t)vs->vs_imap,
1632 INDIRECT_CLMAP_SIZE(vs->vs_size));
1633 } else {
1634 /*
1635 * Direct map. Free used clusters, then memory.
1636 */
1637 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1638 kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1639 }
1640 VS_MAP_UNLOCK(vs);
1641
1642 bs_commit(- vs->vs_size);
1643
1644 zfree(vstruct_zone, (vm_offset_t)vs);
1645 }
1646
1647 int ps_map_extend(vstruct_t, int); /* forward */
1648
1649 int ps_map_extend(
1650 vstruct_t vs,
1651 int new_size)
1652 {
1653 struct vs_map **new_imap;
1654 struct vs_map *new_dmap = NULL;
1655 int newdsize;
1656 int i;
1657 void *old_map = NULL;
1658 int old_map_size = 0;
1659
1660 if (vs->vs_size >= new_size) {
1661 /*
1662 * Someone has already done the work.
1663 */
1664 return 0;
1665 }
1666
1667 /*
1668 * If the new size extends into the indirect range, then we have one
1669 * of two cases: we are going from indirect to indirect, or we are
1670 * going from direct to indirect. If we are going from indirect to
1671 * indirect, then it is possible that the new size will fit in the old
1672 * indirect map. If this is the case, then just reset the size of the
1673 * vstruct map and we are done. If the new size will not
1674 * fit into the old indirect map, then we have to allocate a new
1675 * indirect map and copy the old map pointers into this new map.
1676 *
1677 * If we are going from direct to indirect, then we have to allocate a
1678 * new indirect map and copy the old direct pages into the first
1679 * indirect page of the new map.
1680 * NOTE: allocating memory here is dangerous, as we're in the
1681 * pageout path.
1682 */
1683 if (INDIRECT_CLMAP(new_size)) {
1684 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1685
1686 /*
1687 * Get a new indirect map and zero it.
1688 */
1689 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1690 if (vs->vs_indirect &&
1691 (new_map_size == old_map_size)) {
1692 bs_commit(new_size - vs->vs_size);
1693 vs->vs_size = new_size;
1694 return 0;
1695 }
1696
1697 new_imap = (struct vs_map **)kalloc(new_map_size);
1698 if (new_imap == NULL) {
1699 return -1;
1700 }
1701 memset(new_imap, 0, new_map_size);
1702
1703 if (vs->vs_indirect) {
1704 /* Copy old entries into new map */
1705 memcpy(new_imap, vs->vs_imap, old_map_size);
1706 /* Arrange to free the old map */
1707 old_map = (void *) vs->vs_imap;
1708 newdsize = 0;
1709 } else { /* Old map was a direct map */
1710 /* Allocate an indirect page */
1711 if ((new_imap[0] = (struct vs_map *)
1712 kalloc(CLMAP_THRESHOLD)) == NULL) {
1713 kfree((vm_offset_t)new_imap, new_map_size);
1714 return -1;
1715 }
1716 new_dmap = new_imap[0];
1717 newdsize = CLMAP_ENTRIES;
1718 }
1719 } else {
1720 new_imap = NULL;
1721 newdsize = new_size;
1722 /*
1723 * If the new map is a direct map, then the old map must
1724 * also have been a direct map. All we have to do is
1725 * to allocate a new direct map, copy the old entries
1726 * into it and free the old map.
1727 */
1728 if ((new_dmap = (struct vs_map *)
1729 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1730 return -1;
1731 }
1732 }
1733 if (newdsize) {
1734
1735 /* Free the old map */
1736 old_map = (void *) vs->vs_dmap;
1737 old_map_size = CLMAP_SIZE(vs->vs_size);
1738
1739 /* Copy info from the old map into the new map */
1740 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1741
1742 /* Initialize the rest of the new map */
1743 for (i = vs->vs_size; i < newdsize; i++)
1744 VSM_CLR(new_dmap[i]);
1745 }
1746 if (new_imap) {
1747 vs->vs_imap = new_imap;
1748 vs->vs_indirect = TRUE;
1749 } else
1750 vs->vs_dmap = new_dmap;
1751 bs_commit(new_size - vs->vs_size);
1752 vs->vs_size = new_size;
1753 if (old_map)
1754 kfree((vm_offset_t)old_map, old_map_size);
1755 return 0;
1756 }
1757
1758 vm_offset_t
1759 ps_clmap(
1760 vstruct_t vs,
1761 vm_offset_t offset,
1762 struct clmap *clmap,
1763 int flag,
1764 vm_size_t size,
1765 int error)
1766 {
1767 vm_offset_t cluster; /* The cluster of offset. */
1768 vm_offset_t newcl; /* The new cluster allocated. */
1769 vm_offset_t newoff;
1770 int i;
1771 struct vs_map *vsmap;
1772
1773 VS_MAP_LOCK(vs);
1774
1775 ASSERT(vs->vs_dmap);
1776 cluster = atop_32(offset) >> vs->vs_clshift;
1777
1778 /*
1779 * Initialize cluster error value
1780 */
1781 clmap->cl_error = 0;
1782
1783 /*
1784 * If the object has grown, extend the page map.
1785 */
1786 if (cluster >= vs->vs_size) {
1787 if (flag == CL_FIND) {
1788 /* Do not allocate if just doing a lookup */
1789 VS_MAP_UNLOCK(vs);
1790 return (vm_offset_t) -1;
1791 }
1792 if (ps_map_extend(vs, cluster + 1)) {
1793 VS_MAP_UNLOCK(vs);
1794 return (vm_offset_t) -1;
1795 }
1796 }
1797
1798 /*
1799 * Look for the desired cluster. If the map is indirect, then we
1800 * have a two level lookup. First find the indirect block, then
1801 * find the actual cluster. If the indirect block has not yet
1802 * been allocated, then do so. If the cluster has not yet been
1803 * allocated, then do so.
1804 *
1805 * If any of the allocations fail, then return an error.
1806 * Don't allocate if just doing a lookup.
1807 */
1808 if (vs->vs_indirect) {
1809 long ind_block = cluster/CLMAP_ENTRIES;
1810
1811 /* Is the indirect block allocated? */
1812 vsmap = vs->vs_imap[ind_block];
1813 if (vsmap == NULL) {
1814 if (flag == CL_FIND) {
1815 VS_MAP_UNLOCK(vs);
1816 return (vm_offset_t) -1;
1817 }
1818
1819 /* Allocate the indirect block */
1820 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1821 if (vsmap == NULL) {
1822 VS_MAP_UNLOCK(vs);
1823 return (vm_offset_t) -1;
1824 }
1825 /* Initialize the cluster offsets */
1826 for (i = 0; i < CLMAP_ENTRIES; i++)
1827 VSM_CLR(vsmap[i]);
1828 vs->vs_imap[ind_block] = vsmap;
1829 }
1830 } else
1831 vsmap = vs->vs_dmap;
1832
1833 ASSERT(vsmap);
1834 vsmap += cluster%CLMAP_ENTRIES;
1835
1836 /*
1837 * At this point, vsmap points to the struct vs_map desired.
1838 *
1839 * Look in the map for the cluster, if there was an error on a
1840 * previous write, flag it and return. If it is not yet
1841 * allocated, then allocate it, if we're writing; if we're
1842 * doing a lookup and the cluster's not allocated, return error.
1843 */
1844 if (VSM_ISERR(*vsmap)) {
1845 clmap->cl_error = VSM_GETERR(*vsmap);
1846 VS_MAP_UNLOCK(vs);
1847 return (vm_offset_t) -1;
1848 } else if (VSM_ISCLR(*vsmap)) {
1849 int psindex;
1850
1851 if (flag == CL_FIND) {
1852 /*
1853 * If there's an error and the entry is clear, then
1854 * we've run out of swap space. Record the error
1855 * here and return.
1856 */
1857 if (error) {
1858 VSM_SETERR(*vsmap, error);
1859 }
1860 VS_MAP_UNLOCK(vs);
1861 return (vm_offset_t) -1;
1862 } else {
1863 /*
1864 * Attempt to allocate a cluster from the paging segment
1865 */
1866 newcl = ps_allocate_cluster(vs, &psindex,
1867 PAGING_SEGMENT_NULL);
1868 if (newcl == -1) {
1869 VS_MAP_UNLOCK(vs);
1870 return (vm_offset_t) -1;
1871 }
1872 VSM_CLR(*vsmap);
1873 VSM_SETCLOFF(*vsmap, newcl);
1874 VSM_SETPS(*vsmap, psindex);
1875 }
1876 } else
1877 newcl = VSM_CLOFF(*vsmap);
1878
1879 /*
1880 * Fill in pertinent fields of the clmap
1881 */
1882 clmap->cl_ps = VSM_PS(*vsmap);
1883 clmap->cl_numpages = VSCLSIZE(vs);
1884 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1885
1886 /*
1887 * Byte offset in paging segment is byte offset to cluster plus
1888 * byte offset within cluster. It looks ugly, but should be
1889 * relatively quick.
1890 */
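/*
 * Illustrative example: with 4K pages and vs_clshift == 2 a cluster covers
 * 16KB.  If the map says the data lives in segment cluster 5 and the object
 * offset is 0xD000, then newcl becomes ptoa_32(5) << 2 == 0x14000, newoff
 * becomes 0xD000 & 0x3FFF == 0x1000, and the returned segment offset is
 * 0x15000.
 */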
1891 ASSERT(trunc_page(offset) == offset);
1892 newcl = ptoa_32(newcl) << vs->vs_clshift;
1893 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1894 if (flag == CL_ALLOC) {
1895 /*
1896 * set bits in the allocation bitmap according to which
1897 * pages were requested. size is in bytes.
1898 */
1899 i = atop_32(newoff);
1900 while ((size > 0) && (i < VSCLSIZE(vs))) {
1901 VSM_SETALLOC(*vsmap, i);
1902 i++;
1903 size -= vm_page_size;
1904 }
1905 }
1906 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1907 if (newoff) {
1908 /*
1909 * Offset is not cluster aligned, so number of pages
1910 * and bitmaps must be adjusted
1911 */
1912 clmap->cl_numpages -= atop_32(newoff);
1913 CLMAP_SHIFT(clmap, vs);
1914 CLMAP_SHIFTALLOC(clmap, vs);
1915 }
1916
1917 /*
1918 *
1919 * The setting of valid bits and handling of write errors
1920 * must be done here, while we hold the lock on the map.
1921 * It logically should be done in ps_vs_write_complete().
1922 * The size and error information has been passed from
1923 * ps_vs_write_complete(). If the size parameter is non-zero,
1924 * then there is work to be done. If error is also non-zero,
1925 * then the error number is recorded in the cluster and the
1926 * entire cluster is in error.
1927 */
1928 if (size && flag == CL_FIND) {
1929 vm_offset_t off = (vm_offset_t) 0;
1930
1931 if (!error) {
1932 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1933 i++) {
1934 VSM_SETPG(*vsmap, i);
1935 size -= vm_page_size;
1936 }
1937 ASSERT(i <= VSCLSIZE(vs));
1938 } else {
1939 BS_STAT(clmap->cl_ps->ps_bs,
1940 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1941 atop_32(size));
1942 off = VSM_CLOFF(*vsmap);
1943 VSM_SETERR(*vsmap, error);
1944 }
1945 /*
1946 * Deallocate cluster if error, and no valid pages
1947 * already present.
1948 */
1949 if (off != (vm_offset_t) 0)
1950 ps_deallocate_cluster(clmap->cl_ps, off);
1951 VS_MAP_UNLOCK(vs);
1952 return (vm_offset_t) 0;
1953 } else
1954 VS_MAP_UNLOCK(vs);
1955
1956 DEBUG(DEBUG_VS_INTERNAL,
1957 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1958 newcl+newoff, (int) vs, (int) vsmap, flag));
1959 DEBUG(DEBUG_VS_INTERNAL,
1960 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1961 (int) clmap->cl_ps, clmap->cl_numpages,
1962 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1963
1964 return (newcl + newoff);
1965 }
1966
1967 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1968
1969 void
1970 ps_clunmap(
1971 vstruct_t vs,
1972 vm_offset_t offset,
1973 vm_size_t length)
1974 {
1975 vm_offset_t cluster; /* The cluster number of offset */
1976 struct vs_map *vsmap;
1977
1978 VS_MAP_LOCK(vs);
1979
1980 /*
1981 * Loop through all clusters in this range, freeing paging segment
1982 * clusters and map entries as encountered.
1983 */
1984 while (length > 0) {
1985 vm_offset_t newoff;
1986 int i;
1987
1988 cluster = atop_32(offset) >> vs->vs_clshift;
1989 if (vs->vs_indirect) /* indirect map */
1990 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
1991 else
1992 vsmap = vs->vs_dmap;
1993 if (vsmap == NULL) {
1994 VS_MAP_UNLOCK(vs);
1995 return;
1996 }
1997 vsmap += cluster%CLMAP_ENTRIES;
1998 if (VSM_ISCLR(*vsmap)) {
1999 length -= vm_page_size;
2000 offset += vm_page_size;
2001 continue;
2002 }
2003 /*
2004 * We've got a valid mapping. Clear it and deallocate
2005 * paging segment cluster pages.
2006 * Optimize for entire cluster clearing.
2007 */
2008 if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
2009 /*
2010 * Not cluster aligned.
2011 */
2012 ASSERT(trunc_page(newoff) == newoff);
2013 i = atop_32(newoff);
2014 } else
2015 i = 0;
2016 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2017 VSM_CLRPG(*vsmap, i);
2018 VSM_CLRALLOC(*vsmap, i);
2019 length -= vm_page_size;
2020 offset += vm_page_size;
2021 i++;
2022 }
2023
2024 /*
2025 * If map entry is empty, clear and deallocate cluster.
2026 */
2027 if (!VSM_ALLOC(*vsmap)) {
2028 ps_deallocate_cluster(VSM_PS(*vsmap),
2029 VSM_CLOFF(*vsmap));
2030 VSM_CLR(*vsmap);
2031 }
2032 }
2033
2034 VS_MAP_UNLOCK(vs);
2035 }
2036
2037 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2038
2039 void
2040 ps_vs_write_complete(
2041 vstruct_t vs,
2042 vm_offset_t offset,
2043 vm_size_t size,
2044 int error)
2045 {
2046 struct clmap clmap;
2047
2048 /*
2049 * Get the struct vsmap for this cluster.
2050 * Use CL_FIND, even though it was written, because the
2051 * cluster MUST be present, unless there was an error
2052 * in the original ps_clmap (e.g. no space), in which
2053 * case, nothing happens.
2054 *
2055 * Must pass enough information to ps_clmap to allow it
2056 * to set the vs_map structure bitmap under lock.
2057 */
2058 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2059 }
2060
2061 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2062
2063 void
2064 vs_cl_write_complete(
2065 vstruct_t vs,
2066 paging_segment_t ps,
2067 vm_offset_t offset,
2068 vm_offset_t addr,
2069 vm_size_t size,
2070 boolean_t async,
2071 int error)
2072 {
2073 kern_return_t kr;
2074
2075 if (error) {
2076 /*
2077 * For internal objects, the error is recorded on a
2078 * per-cluster basis by ps_clmap() which is called
2079 * by ps_vs_write_complete() below.
2080 */
2081 dprintf(("write failed error = 0x%x\n", error));
2082 /* add upl_abort code here */
2083 } else
2084 GSTAT(global_stats.gs_pages_out += atop_32(size));
2085 /*
2086 * Notify the vstruct mapping code, so it can do its accounting.
2087 */
2088 ps_vs_write_complete(vs, offset, size, error);
2089
2090 if (async) {
2091 VS_LOCK(vs);
2092 ASSERT(vs->vs_async_pending > 0);
2093 vs->vs_async_pending -= size;
2094 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2095 vs->vs_waiting_async = FALSE;
2096 VS_UNLOCK(vs);
2097 /* mutex_unlock(&vs->vs_waiting_async); */
2098 thread_wakeup(&vs->vs_async_pending);
2099 } else {
2100 VS_UNLOCK(vs);
2101 }
2102 }
2103 }
2104
2105 #ifdef DEVICE_PAGING
2106 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2107
2108 kern_return_t
2109 device_write_reply(
2110 MACH_PORT_FACE reply_port,
2111 kern_return_t device_code,
2112 io_buf_len_t bytes_written)
2113 {
2114 struct vs_async *vsa;
2115
2116 vsa = (struct vs_async *)
2117 ((struct vstruct_alias *)(reply_port->alias))->vs;
2118
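	/*
	 * A short write -- the device reported success but transferred
	 * fewer bytes than requested -- is treated as a hard failure so
	 * the error is propagated through the completion path below.
	 */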
2119 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2120 device_code = KERN_FAILURE;
2121 }
2122
2123 vsa->vsa_error = device_code;
2124
2125
2126 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2127 if(vsa->vsa_flags & VSA_TRANSFER) {
2128 /* revisit when async disk segments redone */
2129 if(vsa->vsa_error) {
2130 /* need to consider error condition. re-write data or */
2131 /* throw it away here. */
2132 vm_offset_t ioaddr;
2133 if(vm_map_copyout(kernel_map, &ioaddr,
2134 (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
2135 panic("vs_cluster_write: unable to copy source list\n");
2136 vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
2137 }
2138 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2139 vsa->vsa_size, vsa->vsa_error);
2140 } else {
2141 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2142 vsa->vsa_addr, vsa->vsa_size, TRUE,
2143 vsa->vsa_error);
2144 }
2145 VS_FREE_ASYNC(vsa);
2146
2147 return KERN_SUCCESS;
2148 }
2149
2150 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2151 kern_return_t
2152 device_write_reply_inband(
2153 MACH_PORT_FACE reply_port,
2154 kern_return_t return_code,
2155 io_buf_len_t bytes_written)
2156 {
2157 panic("device_write_reply_inband: illegal");
2158 return KERN_SUCCESS;
2159 }
2160
2161 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2162 kern_return_t
2163 device_read_reply(
2164 MACH_PORT_FACE reply_port,
2165 kern_return_t return_code,
2166 io_buf_ptr_t data,
2167 mach_msg_type_number_t dataCnt)
2168 {
2169 struct vs_async *vsa;
2170 vsa = (struct vs_async *)
2171 ((struct vstruct_alias *)(reply_port->alias))->vs;
2172 vsa->vsa_addr = (vm_offset_t)data;
2173 vsa->vsa_size = (vm_size_t)dataCnt;
2174 vsa->vsa_error = return_code;
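	/*
	 * The requesting thread is parked in ps_read_device() on an
	 * assert_wait against &vsa->vsa_lock after seeing MIG_NO_REPLY;
	 * now that the result has been recorded, wake it up.
	 */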
2175 thread_wakeup(&vsa->vsa_lock);
2176 return KERN_SUCCESS;
2177 }
2178
2179 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2180 kern_return_t
2181 device_read_reply_inband(
2182 MACH_PORT_FACE reply_port,
2183 kern_return_t return_code,
2184 io_buf_ptr_inband_t data,
2185 mach_msg_type_number_t dataCnt)
2186 {
2187 panic("device_read_reply_inband: illegal");
2188 return KERN_SUCCESS;
2189 }
2190
2191 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2192 kern_return_t
2193 device_read_reply_overwrite(
2194 MACH_PORT_FACE reply_port,
2195 kern_return_t return_code,
2196 io_buf_len_t bytes_read)
2197 {
2198 panic("device_read_reply_overwrite: illegal\n");
2199 return KERN_SUCCESS;
2200 }
2201
2202 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2203 kern_return_t
2204 device_open_reply(
2205 MACH_PORT_FACE reply_port,
2206 kern_return_t return_code,
2207 MACH_PORT_FACE device_port)
2208 {
2209 panic("device_open_reply: illegal\n");
2210 return KERN_SUCCESS;
2211 }
2212
2213 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2214
2215 kern_return_t
2216 ps_read_device(
2217 paging_segment_t ps,
2218 vm_offset_t offset,
2219 vm_offset_t *bufferp,
2220 unsigned int size,
2221 unsigned int *residualp,
2222 int flags)
2223 {
2224 kern_return_t kr;
2225 recnum_t dev_offset;
2226 unsigned int bytes_wanted;
2227 unsigned int bytes_read;
2228 unsigned int total_read;
2229 vm_offset_t dev_buffer;
2230 vm_offset_t buf_ptr;
2231 unsigned int records_read;
2232 struct vs_async *vsa;
2233 mutex_t vs_waiting_read_reply;
2234
2235 device_t device;
2236 vm_map_copy_t device_data = NULL;
2237 default_pager_thread_t *dpt = NULL;
2238
2239 device = dev_port_lookup(ps->ps_device);
2240 clustered_reads[atop_32(size)]++;
2241
2242 dev_offset = (ps->ps_offset +
2243 (offset >> (vm_page_shift - ps->ps_record_shift)));
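	/*
	 * dev_offset is expressed in device records.  As an assumed
	 * example (not necessarily this configuration): with 4 KB pages
	 * (vm_page_shift == 12) and 512-byte records (ps_record_shift ==
	 * 3), offset >> (12 - 3) == offset / 512, i.e. the byte offset
	 * converted to a record number, added to the segment's base.
	 */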
2244 bytes_wanted = size;
2245 total_read = 0;
2246 *bufferp = (vm_offset_t)NULL;
2247
2248 do {
2249 vsa = VS_ALLOC_ASYNC();
2250 if (vsa) {
2251 vsa->vsa_vs = NULL;
2252 vsa->vsa_addr = 0;
2253 vsa->vsa_offset = 0;
2254 vsa->vsa_size = 0;
2255 vsa->vsa_ps = NULL;
2256 }
2257 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2258 ip_lock(vsa->reply_port);
2259 vsa->reply_port->ip_sorights++;
2260 ip_reference(vsa->reply_port);
2261 ip_unlock(vsa->reply_port);
2262 kr = ds_device_read_common(device,
2263 vsa->reply_port,
2264 (mach_msg_type_name_t)
2265 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2266 (dev_mode_t) 0,
2267 dev_offset,
2268 bytes_wanted,
2269 (IO_READ | IO_CALL),
2270 (io_buf_ptr_t *) &dev_buffer,
2271 (mach_msg_type_number_t *) &bytes_read);
2272 if(kr == MIG_NO_REPLY) {
2273 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2274 thread_block(THREAD_CONTINUE_NULL);
2275
2276 dev_buffer = vsa->vsa_addr;
2277 bytes_read = (unsigned int)vsa->vsa_size;
2278 kr = vsa->vsa_error;
2279 }
2280 VS_FREE_ASYNC(vsa);
2281 if (kr != KERN_SUCCESS || bytes_read == 0) {
2282 break;
2283 }
2284 total_read += bytes_read;
2285
2286 /*
2287 * If we got the entire range, use the returned dev_buffer.
2288 */
2289 if (bytes_read == size) {
2290 *bufferp = (vm_offset_t)dev_buffer;
2291 break;
2292 }
2293
2294 #if 1
2295 dprintf(("read only %d bytes out of %d\n",
2296 bytes_read, bytes_wanted));
2297 #endif
2298 if(dpt == NULL) {
2299 dpt = get_read_buffer();
2300 buf_ptr = dpt->dpt_buffer;
2301 *bufferp = (vm_offset_t)buf_ptr;
2302 }
2303 /*
2304 * Otherwise, copy the data into the provided buffer (*bufferp)
2305 * and append the rest of the range as it comes in.
2306 */
2307 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2308 buf_ptr += bytes_read;
2309 bytes_wanted -= bytes_read;
2310 records_read = (bytes_read >>
2311 (vm_page_shift - ps->ps_record_shift));
2312 dev_offset += records_read;
2313 DEBUG(DEBUG_VS_INTERNAL,
2314 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2315 dev_buffer, bytes_read));
2316 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2317 != KERN_SUCCESS)
2318 Panic("dealloc buf");
2319 } while (bytes_wanted);
2320
2321 *residualp = size - total_read;
2322 if((dev_buffer != *bufferp) && (total_read != 0)) {
2323 vm_offset_t temp_buffer;
2324 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2325 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2326 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2327 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2328 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2329 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2330 (vm_map_copy_t *)&device_data, FALSE))
2331 panic("ps_read_device: cannot copyin locally provided buffer\n");
2332 }
2333 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2334 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2335 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2336 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2337 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2338 (vm_map_copy_t *)&device_data, FALSE))
2339 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2340 }
2341 else {
2342 device_data = NULL;
2343 }
2344 *bufferp = (vm_offset_t)device_data;
2345
2346 if(dpt != NULL) {
2347 /* Free the receive buffer */
2348 dpt->checked_out = 0;
2349 thread_wakeup(&dpt_array);
2350 }
2351 return KERN_SUCCESS;
2352 }
2353
2354 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2355
2356 kern_return_t
2357 ps_write_device(
2358 paging_segment_t ps,
2359 vm_offset_t offset,
2360 vm_offset_t addr,
2361 unsigned int size,
2362 struct vs_async *vsa)
2363 {
2364 recnum_t dev_offset;
2365 io_buf_len_t bytes_to_write, bytes_written;
2366 recnum_t records_written;
2367 kern_return_t kr;
2368 MACH_PORT_FACE reply_port;
2369
2370
2371
2372 clustered_writes[atop_32(size)]++;
2373
2374 dev_offset = (ps->ps_offset +
2375 (offset >> (vm_page_shift - ps->ps_record_shift)));
2376 bytes_to_write = size;
2377
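	/*
	 * Two paths follow: if a vs_async was supplied, the write is
	 * issued asynchronously and completion (including error
	 * accounting) is handled later in device_write_reply();
	 * otherwise we loop issuing synchronous writes until the whole
	 * range has been pushed to the device.
	 */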
2378 if (vsa) {
2379 /*
2380 * Asynchronous write.
2381 */
2382 reply_port = vsa->reply_port;
2383 ip_lock(reply_port);
2384 reply_port->ip_sorights++;
2385 ip_reference(reply_port);
2386 ip_unlock(reply_port);
2387 {
2388 device_t device;
2389 device = dev_port_lookup(ps->ps_device);
2390
2391 vsa->vsa_addr = addr;
2392 kr=ds_device_write_common(device,
2393 reply_port,
2394 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2395 (dev_mode_t) 0,
2396 dev_offset,
2397 (io_buf_ptr_t) addr,
2398 size,
2399 (IO_WRITE | IO_CALL),
2400 &bytes_written);
2401 }
2402 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2403 if (verbose)
2404 dprintf(("%s0x%x, addr=0x%x,"
2405 "size=0x%x,offset=0x%x\n",
2406 "device_write_request returned ",
2407 kr, addr, size, offset));
2408 BS_STAT(ps->ps_bs,
2409 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2410 /* do the completion notification to free resources */
2411 device_write_reply(reply_port, kr, 0);
2412 return PAGER_ERROR;
2413 }
2414 } else do {
2415 /*
2416 * Synchronous write.
2417 */
2418 {
2419 device_t device;
2420 device = dev_port_lookup(ps->ps_device);
2421 kr=ds_device_write_common(device,
2422 IP_NULL, 0,
2423 (dev_mode_t) 0,
2424 dev_offset,
2425 (io_buf_ptr_t) addr,
2426 size,
2427 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2428 &bytes_written);
2429 }
2430 if (kr != KERN_SUCCESS) {
2431 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2432 "device_write returned ",
2433 kr, addr, size, offset));
2434 BS_STAT(ps->ps_bs,
2435 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2436 return PAGER_ERROR;
2437 }
2438 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2439 Panic("fragmented write");
2440 records_written = (bytes_written >>
2441 (vm_page_shift - ps->ps_record_shift));
2442 dev_offset += records_written;
2443 #if 1
2444 if (bytes_written != bytes_to_write) {
2445 dprintf(("wrote only %d bytes out of %d\n",
2446 bytes_written, bytes_to_write));
2447 }
2448 #endif
2449 bytes_to_write -= bytes_written;
2450 addr += bytes_written;
2451 } while (bytes_to_write > 0);
2452
2453 return PAGER_SUCCESS;
2454 }
2455
2456
2457 #else /* !DEVICE_PAGING */
2458
2459 kern_return_t
2460 ps_read_device(
2461 paging_segment_t ps,
2462 vm_offset_t offset,
2463 vm_offset_t *bufferp,
2464 unsigned int size,
2465 unsigned int *residualp,
2466 int flags)
2467 {
2468 panic("ps_read_device not supported");
2469 }
2470
2471 kern_return_t ps_write_device(
2472 paging_segment_t ps,
2473 vm_offset_t offset,
2474 vm_offset_t addr,
2475 unsigned int size,
2476 struct vs_async *vsa)
2477 {
2478 panic("ps_write_device not supported");
2479 }
2480
2481 #endif /* DEVICE_PAGING */
2482 void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
2483
2484 void
2485 pvs_object_data_provided(
2486 vstruct_t vs,
2487 upl_t upl,
2488 vm_offset_t offset,
2489 vm_size_t size)
2490 {
2491
2492 DEBUG(DEBUG_VS_INTERNAL,
2493 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2494 upl, offset, size));
2495
2496 ASSERT(size > 0);
2497 GSTAT(global_stats.gs_pages_in += atop_32(size));
2498
2499
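	/*
	 * With USE_PRECIOUS the pages were requested as precious, so once
	 * the data has been supplied the VM holds the only copy and the
	 * backing store copy can be released here; the pages will be
	 * handed back to the pager before being discarded.
	 */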
2500 #if USE_PRECIOUS
2501 ps_clunmap(vs, offset, size);
2502 #endif /* USE_PRECIOUS */
2503
2504 }
2505
2506 kern_return_t
2507 pvs_cluster_read(
2508 vstruct_t vs,
2509 vm_offset_t vs_offset,
2510 vm_size_t cnt)
2511 {
2512 upl_t upl;
2513 kern_return_t error = KERN_SUCCESS;
2514 int size;
2515 unsigned int residual;
2516 unsigned int request_flags;
2517 int seg_index;
2518 int pages_in_cl;
2519 int cl_size;
2520 int cl_mask;
2521 int cl_index;
2522 int xfer_size;
2523 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2524 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2525 struct clmap clmap;
2526
2527 pages_in_cl = 1 << vs->vs_clshift;
2528 cl_size = pages_in_cl * vm_page_size;
2529 cl_mask = cl_size - 1;
2530
2531 /*
2532 * This loop will be executed multiple times until the entire
2533 * request has been satisfied... if the request spans cluster
2534 * boundaries, the clusters will be checked for logical continuity;
2535 * if contiguous, the I/O request will span multiple clusters, otherwise
2536 * it will be broken up into the minimal set of I/O's
2537 *
2538 * If there are holes in a request (either unallocated pages in a paging
2539 * segment or an unallocated paging segment), we stop
2540 * reading at the hole, inform the VM of any data read, inform
2541 * the VM of an unavailable range, then loop again, hoping to
2542 * find valid pages later in the requested range. This continues until
2543 * the entire range has been examined, and read, if present.
2544 */
2545
2546 #if USE_PRECIOUS
2547 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2548 #else
2549 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2550 #endif
2551 while (cnt && (error == KERN_SUCCESS)) {
2552 int ps_info_valid;
2553 int page_list_count;
2554
2555 if((vs_offset & cl_mask) &&
2556 (cnt > (VM_SUPER_CLUSTER -
2557 (vs_offset & cl_mask)))) {
2558 size = VM_SUPER_CLUSTER;
2559 size -= vs_offset & cl_mask;
2560 } else if (cnt > VM_SUPER_CLUSTER) {
2561 size = VM_SUPER_CLUSTER;
2562 } else {
2563 size = cnt;
2564 }
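	/*
	 * Each pass is capped at VM_SUPER_CLUSTER (256 KB); when the
	 * starting offset is not cluster aligned, the first pass is
	 * trimmed so that later passes begin on a cluster boundary.
	 * Example with assumed values (4 KB pages, vs_clshift == 2, so
	 * 16 KB clusters): vs_offset 8 KB into a cluster and cnt of 1 MB
	 * gives a first pass of 256 KB - 8 KB = 248 KB, after which
	 * vs_offset is cluster aligned.
	 */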
2565 cnt -= size;
2566
2567 ps_info_valid = 0;
2568 seg_index = 0;
2569
2570 while (size > 0 && error == KERN_SUCCESS) {
2571 int abort_size;
2572 int failed_size;
2573 int beg_pseg;
2574 int beg_indx;
2575 vm_offset_t cur_offset;
2576
2577
2578 if ( !ps_info_valid) {
2579 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2580 psp[seg_index] = CLMAP_PS(clmap);
2581 ps_info_valid = 1;
2582 }
2583 /*
2584 * skip over unallocated physical segments
2585 */
2586 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2587 abort_size = cl_size - (vs_offset & cl_mask);
2588 abort_size = MIN(abort_size, size);
2589
2590 page_list_count = 0;
2591 memory_object_super_upl_request(
2592 vs->vs_control,
2593 (memory_object_offset_t)vs_offset,
2594 abort_size, abort_size,
2595 &upl, NULL, &page_list_count,
2596 request_flags);
2597
2598 if (clmap.cl_error) {
2599 upl_abort(upl, UPL_ABORT_ERROR);
2600 } else {
2601 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2602 }
2603 upl_deallocate(upl);
2604
2605 size -= abort_size;
2606 vs_offset += abort_size;
2607
2608 seg_index++;
2609 ps_info_valid = 0;
2610 continue;
2611 }
2612 cl_index = (vs_offset & cl_mask) / vm_page_size;
2613
2614 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2615 /*
2616 * skip over unallocated pages
2617 */
2618 if (CLMAP_ISSET(clmap, cl_index))
2619 break;
2620 abort_size += vm_page_size;
2621 }
2622 if (abort_size) {
2623 /*
2624 * Let VM system know about holes in clusters.
2625 */
2626 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2627
2628 page_list_count = 0;
2629 memory_object_super_upl_request(
2630 vs->vs_control,
2631 (memory_object_offset_t)vs_offset,
2632 abort_size, abort_size,
2633 &upl, NULL, &page_list_count,
2634 request_flags);
2635
2636 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2637 upl_deallocate(upl);
2638
2639 size -= abort_size;
2640 vs_offset += abort_size;
2641
2642 if (cl_index == pages_in_cl) {
2643 /*
2644 * if we're at the end of this physical cluster
2645 * then bump to the next one and continue looking
2646 */
2647 seg_index++;
2648 ps_info_valid = 0;
2649 continue;
2650 }
2651 if (size == 0)
2652 break;
2653 }
2654 /*
2655 * remember the starting point of the first allocated page
2656 * for the I/O we're about to issue
2657 */
2658 beg_pseg = seg_index;
2659 beg_indx = cl_index;
2660 cur_offset = vs_offset;
2661
2662 /*
2663 * calculate the size of the I/O that we can do...
2664 * this may span multiple physical segments if
2665 * they are contiguous
2666 */
2667 for (xfer_size = 0; xfer_size < size; ) {
2668
2669 while (cl_index < pages_in_cl
2670 && xfer_size < size) {
2671 /*
2672 * accumulate allocated pages within
2673 * a physical segment
2674 */
2675 if (CLMAP_ISSET(clmap, cl_index)) {
2676 xfer_size += vm_page_size;
2677 cur_offset += vm_page_size;
2678 cl_index++;
2679
2680 BS_STAT(psp[seg_index]->ps_bs,
2681 psp[seg_index]->ps_bs->bs_pages_in++);
2682 } else
2683 break;
2684 }
2685 if (cl_index < pages_in_cl
2686 || xfer_size >= size) {
2687 /*
2688 * we've hit an unallocated page or
2689 * the end of this request... go fire
2690 * the I/O
2691 */
2692 break;
2693 }
2694 /*
2695 * we've hit the end of the current physical
2696 * segment and there's more to do, so try
2697 * moving to the next one
2698 */
2699 seg_index++;
2700
2701 ps_offset[seg_index] =
2702 ps_clmap(vs,
2703 cur_offset & ~cl_mask,
2704 &clmap, CL_FIND, 0, 0);
2705 psp[seg_index] = CLMAP_PS(clmap);
2706 ps_info_valid = 1;
2707
2708 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2709 /*
2710 * if the physical segment we're about
2711 * to step into is not contiguous to
2712 * the one we're currently in, or it's
2713 * in a different paging file, or
2714 * it hasn't been allocated....
2715 * we stop here and generate the I/O
2716 */
2717 break;
2718 }
2719 /*
2720 * start with first page of the next physical
2721 * segment
2722 */
2723 cl_index = 0;
2724 }
2725 if (xfer_size) {
2726 /*
2727 * we have a contiguous range of allocated pages
2728 * to read from
2729 */
2730 page_list_count = 0;
2731 memory_object_super_upl_request(vs->vs_control,
2732 (memory_object_offset_t)vs_offset,
2733 xfer_size, xfer_size,
2734 &upl, NULL, &page_list_count,
2735 request_flags | UPL_SET_INTERNAL);
2736
2737 error = ps_read_file(psp[beg_pseg],
2738 upl, (vm_offset_t) 0,
2739 ps_offset[beg_pseg] +
2740 (beg_indx * vm_page_size),
2741 xfer_size, &residual, 0);
2742 } else
2743 continue;
2744
2745 failed_size = 0;
2746
2747 /*
2748 * Adjust counts and send response to VM. Optimize
2749 * for the common case, i.e. no error and/or partial
2750 * data. If there was an error, then we need to error
2751 * the entire range, even if some data was successfully
2752 * read. If there was a partial read we may supply some
2753 * data and may error some as well. In all cases the
2754 * VM must receive some notification for every page in the
2755 * range.
2756 */
2757 if ((error == KERN_SUCCESS) && (residual == 0)) {
2758 /*
2759 * Got everything we asked for, supply the data
2760 * to the VM. Note that as a side effect of
2761 * supplying the data, the buffer holding the
2762 * supplied data is deallocated from the pager's
2763 * address space.
2764 */
2765 pvs_object_data_provided(
2766 vs, upl, vs_offset, xfer_size);
2767 } else {
2768 failed_size = xfer_size;
2769
2770 if (error == KERN_SUCCESS) {
2771 if (residual == xfer_size) {
2772 /*
2773 * If a read operation returns no error
2774 * and no data moved, we turn it into
2775 * an error, assuming we're reading at
2776 * or beyond EOF.
2777 * Fall through and error the entire
2778 * range.
2779 */
2780 error = KERN_FAILURE;
2781 } else {
2782 /*
2783 * Otherwise, we have a partial read. If
2784 * the part read is an integral number
2785 * of pages supply it. Otherwise round
2786 * it up to a page boundary, zero fill
2787 * the unread part, and supply it.
2788 * Fall through and error the remainder
2789 * of the range, if any.
2790 */
2791 int fill, lsize;
2792
2793 fill = residual
2794 & ~vm_page_size;
2795 lsize = (xfer_size - residual)
2796 + fill;
2797 pvs_object_data_provided(
2798 vs, upl,
2799 vs_offset, lsize);
2800
2801 if (lsize < xfer_size) {
2802 failed_size =
2803 xfer_size - lsize;
2804 error = KERN_FAILURE;
2805 }
2806 }
2807 }
2808 }
2809 /*
2810 * If there was an error in any part of the range, tell
2811 * the VM. Note that error is explicitly checked again
2812 * since it can be modified above.
2813 */
2814 if (error != KERN_SUCCESS) {
2815 BS_STAT(psp[beg_pseg]->ps_bs,
2816 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2817 += atop_32(failed_size));
2818 }
2819 size -= xfer_size;
2820 vs_offset += xfer_size;
2821 }
2822
2823 } /* END while (cnt && (error == 0)) */
2824 return error;
2825 }
2826
2827 int vs_do_async_write = 1;
2828
2829 kern_return_t
2830 vs_cluster_write(
2831 vstruct_t vs,
2832 upl_t internal_upl,
2833 vm_offset_t offset,
2834 vm_size_t cnt,
2835 boolean_t dp_internal,
2836 int flags)
2837 {
2838 vm_offset_t size;
2839 vm_offset_t transfer_size;
2840 int error = 0;
2841 struct clmap clmap;
2842
2843 vm_offset_t actual_offset; /* Offset within paging segment */
2844 paging_segment_t ps;
2845 vm_offset_t subx_size;
2846 vm_offset_t mobj_base_addr;
2847 vm_offset_t mobj_target_addr;
2848 int mobj_size;
2849
2850 struct vs_async *vsa;
2851 vm_map_copy_t copy;
2852
2853 upl_t upl;
2854 upl_page_info_t *pl;
2855 int page_index;
2856 int list_size;
2857 int cl_size;
2858
2859 if (!dp_internal) {
2860 int page_list_count;
2861 int request_flags;
2862 int super_size;
2863 int first_dirty;
2864 int num_dirty;
2865 int num_of_pages;
2866 int seg_index;
2867 int pages_in_cl;
2868 int must_abort;
2869 vm_offset_t upl_offset;
2870 vm_offset_t seg_offset;
2871 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2872 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2873
2874
2875 pages_in_cl = 1 << vs->vs_clshift;
2876 cl_size = pages_in_cl * vm_page_size;
2877
2878 if (bs_low) {
2879 super_size = cl_size;
2880
2881 request_flags = UPL_NOBLOCK |
2882 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2883 UPL_NO_SYNC | UPL_SET_INTERNAL;
2884 } else {
2885 super_size = VM_SUPER_CLUSTER;
2886
2887 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2888 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2889 UPL_NO_SYNC | UPL_SET_INTERNAL;
2890 }
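		/*
		 * When backing store is running low (bs_low), each pageout
		 * request is limited to a single cluster and the pages are
		 * not cleaned in place; otherwise up to a full super-cluster
		 * of dirty pages is gathered per request.
		 */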
2891
2892 page_list_count = 0;
2893 memory_object_super_upl_request(vs->vs_control,
2894 (memory_object_offset_t)offset,
2895 cnt, super_size,
2896 &upl, NULL, &page_list_count,
2897 request_flags | UPL_FOR_PAGEOUT);
2898
2899 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2900
2901 for (seg_index = 0, transfer_size = upl->size;
2902 transfer_size > 0; ) {
2903
2904 ps_offset[seg_index] =
2905 ps_clmap(vs, upl->offset + (seg_index * cl_size),
2906 &clmap, CL_ALLOC,
2907 transfer_size < cl_size ?
2908 transfer_size : cl_size, 0);
2909
2910 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2911 upl_abort(upl, 0);
2912 upl_deallocate(upl);
2913
2914 return KERN_FAILURE;
2915
2916 }
2917 psp[seg_index] = CLMAP_PS(clmap);
2918
2919 if (transfer_size > cl_size) {
2920 transfer_size -= cl_size;
2921 seg_index++;
2922 } else
2923 transfer_size = 0;
2924 }
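		/*
		 * At this point backing store clusters have been allocated
		 * (CL_ALLOC) up front for every cluster the UPL covers; a
		 * failed allocation above aborts the whole UPL and returns
		 * KERN_FAILURE rather than writing a partial set.
		 */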
2925 for (page_index = 0,
2926 num_of_pages = upl->size / vm_page_size;
2927 page_index < num_of_pages; ) {
2928 /*
2929 * skip over non-dirty pages
2930 */
2931 for ( ; page_index < num_of_pages; page_index++) {
2932 if (UPL_DIRTY_PAGE(pl, page_index)
2933 || UPL_PRECIOUS_PAGE(pl, page_index))
2934 /*
2935 * this is a page we need to write
2936 * go see if we can buddy it up with
2937 * others that are contiguous to it
2938 */
2939 break;
2940 /*
2941 * if the page is not dirty, but present, we
2942 * need to commit it... This is an unusual
2943 * case since we only asked for dirty pages
2944 */
2945 if (UPL_PAGE_PRESENT(pl, page_index)) {
2946 boolean_t empty = FALSE;
2947 upl_commit_range(upl,
2948 page_index * vm_page_size,
2949 vm_page_size,
2950 UPL_COMMIT_NOTIFY_EMPTY,
2951 pl,
2952 page_list_count,
2953 &empty);
2954 if (empty)
2955 upl_deallocate(upl);
2956 }
2957 }
2958 if (page_index == num_of_pages)
2959 /*
2960 * no more pages to look at, we're out of here
2961 */
2962 break;
2963
2964 /*
2965 * gather up contiguous dirty pages... we have at
2966 * least 1 otherwise we would have bailed above
2967 * make sure that each physical segment that we step
2968 * into is contiguous to the one we're currently in
2969 * if it's not, we have to stop and write what we have
2970 */
2971 for (first_dirty = page_index;
2972 page_index < num_of_pages; ) {
2973 if ( !UPL_DIRTY_PAGE(pl, page_index)
2974 && !UPL_PRECIOUS_PAGE(pl, page_index))
2975 break;
2976 page_index++;
2977 /*
2978 * if we just looked at the last page in the UPL
2979 * we don't need to check for physical segment
2980 * continuity
2981 */
2982 if (page_index < num_of_pages) {
2983 int cur_seg;
2984 int nxt_seg;
2985
2986 cur_seg =
2987 (page_index - 1) / pages_in_cl;
2988 nxt_seg = page_index / pages_in_cl;
2989
2990 if (cur_seg != nxt_seg) {
2991 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
2992 /*
2993 * if the segment we're about
2994 * to step into is not
2995 * contiguous to the one we're
2996 * currently in, or it's in a
2997 * different paging file....
2998 * we stop here and generate
2999 * the I/O
3000 */
3001 break;
3002 }
3003 }
3004 }
3005 num_dirty = page_index - first_dirty;
3006 must_abort = 1;
3007
3008 if (num_dirty) {
3009 upl_offset = first_dirty * vm_page_size;
3010 seg_index = first_dirty / pages_in_cl;
3011 seg_offset = upl_offset - (seg_index * cl_size);
3012 transfer_size = num_dirty * vm_page_size;
3013
3014
3015 while (transfer_size) {
3016 int seg_size;
3017
3018 if ((seg_size = cl_size -
3019 (upl_offset % cl_size))
3020 > transfer_size)
3021 seg_size = transfer_size;
3022
3023 ps_vs_write_complete(vs,
3024 upl->offset + upl_offset,
3025 seg_size, error);
3026
3027 transfer_size -= seg_size;
3028 upl_offset += seg_size;
3029 }
3030 upl_offset = first_dirty * vm_page_size;
3031 transfer_size = num_dirty * vm_page_size;
3032 error = ps_write_file(psp[seg_index],
3033 upl, upl_offset,
3034 ps_offset[seg_index]
3035 + seg_offset,
3036 transfer_size, flags);
3037 must_abort = 0;
3038 }
3039 if (must_abort) {
3040 boolean_t empty = FALSE;
3041 upl_abort_range(upl,
3042 first_dirty * vm_page_size,
3043 num_dirty * vm_page_size,
3044 UPL_ABORT_NOTIFY_EMPTY,
3045 &empty);
3046 if (empty)
3047 upl_deallocate(upl);
3048 }
3049 }
3050
3051 } else {
3052 assert(cnt <= (vm_page_size << vs->vs_clshift));
3053 list_size = cnt;
3054
3055 page_index = 0;
3056 /* The caller provides a mapped_data which is derived */
3057 /* from a temporary object. The targeted pages are */
3058 /* guaranteed to be set at offset 0 in the mapped_data */
3059 /* The actual offset however must still be derived */
3060 /* from the offset in the vs in question */
3061 mobj_base_addr = offset;
3062 mobj_target_addr = mobj_base_addr;
3063
3064 for (transfer_size = list_size; transfer_size != 0;) {
3065 actual_offset = ps_clmap(vs, mobj_target_addr,
3066 &clmap, CL_ALLOC,
3067 transfer_size < cl_size ?
3068 transfer_size : cl_size, 0);
3069 if(actual_offset == (vm_offset_t) -1) {
3070 error = 1;
3071 break;
3072 }
3073 cnt = MIN(transfer_size,
3074 CLMAP_NPGS(clmap) * vm_page_size);
3075 ps = CLMAP_PS(clmap);
3076 /* Assume that the caller has given us contiguous */
3077 /* pages */
3078 if(cnt) {
3079 ps_vs_write_complete(vs, mobj_target_addr,
3080 cnt, error);
3081 error = ps_write_file(ps, internal_upl,
3082 0, actual_offset,
3083 cnt, flags);
3084 if (error)
3085 break;
3086 }
3087 if (error)
3088 break;
3089 actual_offset += cnt;
3090 mobj_target_addr += cnt;
3091 transfer_size -= cnt;
3092 cnt = 0;
3093
3094 if (error)
3095 break;
3096 }
3097 }
3098 if(error)
3099 return KERN_FAILURE;
3100 else
3101 return KERN_SUCCESS;
3102 }
3103
3104 vm_size_t
3105 ps_vstruct_allocated_size(
3106 vstruct_t vs)
3107 {
3108 int num_pages;
3109 struct vs_map *vsmap;
3110 int i, j, k;
3111
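	/*
	 * Walk every cluster map entry (indirect or direct) and count the
	 * pages whose bit is set in the cluster's page bitmap; the total
	 * is returned in bytes via ptoa_32() below.
	 */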
3112 num_pages = 0;
3113 if (vs->vs_indirect) {
3114 /* loop on indirect maps */
3115 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3116 vsmap = vs->vs_imap[i];
3117 if (vsmap == NULL)
3118 continue;
3119 /* loop on clusters in this indirect map */
3120 for (j = 0; j < CLMAP_ENTRIES; j++) {
3121 if (VSM_ISCLR(vsmap[j]) ||
3122 VSM_ISERR(vsmap[j]))
3123 continue;
3124 /* loop on pages in this cluster */
3125 for (k = 0; k < VSCLSIZE(vs); k++) {
3126 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3127 num_pages++;
3128 }
3129 }
3130 }
3131 } else {
3132 vsmap = vs->vs_dmap;
3133 if (vsmap == NULL)
3134 return 0;
3135 /* loop on clusters in the direct map */
3136 for (j = 0; j < CLMAP_ENTRIES; j++) {
3137 if (VSM_ISCLR(vsmap[j]) ||
3138 VSM_ISERR(vsmap[j]))
3139 continue;
3140 /* loop on pages in this cluster */
3141 for (k = 0; k < VSCLSIZE(vs); k++) {
3142 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3143 num_pages++;
3144 }
3145 }
3146 }
3147
3148 return ptoa_32(num_pages);
3149 }
3150
3151 size_t
3152 ps_vstruct_allocated_pages(
3153 vstruct_t vs,
3154 default_pager_page_t *pages,
3155 size_t pages_size)
3156 {
3157 int num_pages;
3158 struct vs_map *vsmap;
3159 vm_offset_t offset;
3160 int i, j, k;
3161
3162 num_pages = 0;
3163 offset = 0;
3164 if (vs->vs_indirect) {
3165 /* loop on indirect maps */
3166 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3167 vsmap = vs->vs_imap[i];
3168 if (vsmap == NULL) {
3169 offset += (vm_page_size * CLMAP_ENTRIES *
3170 VSCLSIZE(vs));
3171 continue;
3172 }
3173 /* loop on clusters in this indirect map */
3174 for (j = 0; j < CLMAP_ENTRIES; j++) {
3175 if (VSM_ISCLR(vsmap[j]) ||
3176 VSM_ISERR(vsmap[j])) {
3177 offset += vm_page_size * VSCLSIZE(vs);
3178 continue;
3179 }
3180 /* loop on pages in this cluster */
3181 for (k = 0; k < VSCLSIZE(vs); k++) {
3182 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3183 num_pages++;
3184 if (num_pages < pages_size)
3185 pages++->dpp_offset =
3186 offset;
3187 }
3188 offset += vm_page_size;
3189 }
3190 }
3191 }
3192 } else {
3193 vsmap = vs->vs_dmap;
3194 if (vsmap == NULL)
3195 return 0;
3196 /* loop on clusters in the direct map */
3197 for (j = 0; j < CLMAP_ENTRIES; j++) {
3198 if (VSM_ISCLR(vsmap[j]) ||
3199 VSM_ISERR(vsmap[j])) {
3200 offset += vm_page_size * VSCLSIZE(vs);
3201 continue;
3202 }
3203 /* loop on pages in this cluster */
3204 for (k = 0; k < VSCLSIZE(vs); k++) {
3205 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3206 num_pages++;
3207 if (num_pages < pages_size)
3208 pages++->dpp_offset = offset;
3209 }
3210 offset += vm_page_size;
3211 }
3212 }
3213 }
3214
3215 return num_pages;
3216 }
3217
3218
3219 kern_return_t
3220 ps_vstruct_transfer_from_segment(
3221 vstruct_t vs,
3222 paging_segment_t segment,
3223 upl_t upl)
3224 {
3225 struct vs_map *vsmap;
3226 struct vs_map old_vsmap;
3227 struct vs_map new_vsmap;
3228 int i, j, k;
3229
3230 VS_LOCK(vs); /* block all work on this vstruct */
3231 /* can't allow the normal multiple write */
3232 /* semantic because writes may conflict */
3233 vs->vs_xfer_pending = TRUE;
3234 vs_wait_for_sync_writers(vs);
3235 vs_start_write(vs);
3236 vs_wait_for_readers(vs);
3237 /* we will unlock the vs to allow other writes while transferring */
3238 /* and will be guaranteed the persistence of the vs struct */
3239 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3240 /* vs_async_pending */
3241 /* OK we now have guaranteed no other parties are accessing this */
3242 /* vs. Now that we are also supporting simple lock versions of */
3243 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3244 /* our purpose in holding it before was the multiple write case */
3245 /* we now use the boolean xfer_pending to do that. We can use */
3246 /* a boolean instead of a count because we have guaranteed single */
3247 /* file access to this code in its caller */
3248 VS_UNLOCK(vs);
3249 vs_changed:
3250 if (vs->vs_indirect) {
3251 int vsmap_size;
3252 int clmap_off;
3253 /* loop on indirect maps */
3254 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3255 vsmap = vs->vs_imap[i];
3256 if (vsmap == NULL)
3257 continue;
3258 /* loop on clusters in this indirect map */
3259 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3260 VSCLSIZE(vs) * i);
3261 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3262 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3263 else
3264 vsmap_size = CLMAP_ENTRIES;
3265 for (j = 0; j < vsmap_size; j++) {
3266 if (VSM_ISCLR(vsmap[j]) ||
3267 VSM_ISERR(vsmap[j]) ||
3268 (VSM_PS(vsmap[j]) != segment))
3269 continue;
3270 if(vs_cluster_transfer(vs,
3271 (vm_page_size * (j << vs->vs_clshift))
3272 + clmap_off,
3273 vm_page_size << vs->vs_clshift,
3274 upl)
3275 != KERN_SUCCESS) {
3276 VS_LOCK(vs);
3277 vs->vs_xfer_pending = FALSE;
3278 VS_UNLOCK(vs);
3279 vs_finish_write(vs);
3280 return KERN_FAILURE;
3281 }
3282 /* allow other readers/writers during transfer*/
3283 VS_LOCK(vs);
3284 vs->vs_xfer_pending = FALSE;
3285 VS_UNLOCK(vs);
3286 vs_finish_write(vs);
3287 VS_LOCK(vs);
3288 vs->vs_xfer_pending = TRUE;
3289 vs_wait_for_sync_writers(vs);
3290 vs_start_write(vs);
3291 vs_wait_for_readers(vs);
3292 VS_UNLOCK(vs);
3293 if (!(vs->vs_indirect)) {
3294 goto vs_changed;
3295 }
3296 }
3297 }
3298 } else {
3299 vsmap = vs->vs_dmap;
3300 if (vsmap == NULL) {
3301 VS_LOCK(vs);
3302 vs->vs_xfer_pending = FALSE;
3303 VS_UNLOCK(vs);
3304 vs_finish_write(vs);
3305 return KERN_SUCCESS;
3306 }
3307 /* loop on clusters in the direct map */
3308 for (j = 0; j < vs->vs_size; j++) {
3309 if (VSM_ISCLR(vsmap[j]) ||
3310 VSM_ISERR(vsmap[j]) ||
3311 (VSM_PS(vsmap[j]) != segment))
3312 continue;
3313 if(vs_cluster_transfer(vs,
3314 vm_page_size * (j << vs->vs_clshift),
3315 vm_page_size << vs->vs_clshift,
3316 upl) != KERN_SUCCESS) {
3317 VS_LOCK(vs);
3318 vs->vs_xfer_pending = FALSE;
3319 VS_UNLOCK(vs);
3320 vs_finish_write(vs);
3321 return KERN_FAILURE;
3322 }
3323 /* allow other readers/writers during transfer*/
3324 VS_LOCK(vs);
3325 vs->vs_xfer_pending = FALSE;
3326 VS_UNLOCK(vs);
3327 vs_finish_write(vs);
3328 VS_LOCK(vs);
3329 vs->vs_xfer_pending = TRUE;
3330 VS_UNLOCK(vs);
3331 vs_wait_for_sync_writers(vs);
3332 vs_start_write(vs);
3333 vs_wait_for_readers(vs);
3334 if (vs->vs_indirect) {
3335 goto vs_changed;
3336 }
3337 }
3338 }
3339
3340 VS_LOCK(vs);
3341 vs->vs_xfer_pending = FALSE;
3342 VS_UNLOCK(vs);
3343 vs_finish_write(vs);
3344 return KERN_SUCCESS;
3345 }
3346
3347
3348
3349 vs_map_t
3350 vs_get_map_entry(
3351 vstruct_t vs,
3352 vm_offset_t offset)
3353 {
3354 struct vs_map *vsmap;
3355 vm_offset_t cluster;
3356
3357 cluster = atop_32(offset) >> vs->vs_clshift;
3358 if (vs->vs_indirect) {
3359 long ind_block = cluster/CLMAP_ENTRIES;
3360
3361 /* Is the indirect block allocated? */
3362 vsmap = vs->vs_imap[ind_block];
3363 if(vsmap == (vs_map_t) NULL)
3364 return vsmap;
3365 } else
3366 vsmap = vs->vs_dmap;
3367 vsmap += cluster%CLMAP_ENTRIES;
3368 return vsmap;
3369 }
3370
3371 kern_return_t
3372 vs_cluster_transfer(
3373 vstruct_t vs,
3374 vm_offset_t offset,
3375 vm_size_t cnt,
3376 upl_t upl)
3377 {
3378 vm_offset_t actual_offset;
3379 paging_segment_t ps;
3380 struct clmap clmap;
3381 kern_return_t error = KERN_SUCCESS;
3382 int size, size_wanted, i;
3383 unsigned int residual;
3384 int unavail_size;
3385 default_pager_thread_t *dpt;
3386 boolean_t dealloc;
3387 struct vs_map *vsmap_ptr;
3388 struct vs_map read_vsmap;
3389 struct vs_map original_read_vsmap;
3390 struct vs_map write_vsmap;
3391 upl_t sync_upl;
3392 vm_offset_t ioaddr;
3393
3394 /* vs_cluster_transfer reads in the pages of a cluster and
3395 * then writes these pages back to new backing store. The
3396 * segment the pages are being read from is assumed to have
3397 * been taken off-line and is no longer considered for new
3398 * space requests.
3399 */
3400
3401 /*
3402 * This loop will be executed once per cluster referenced.
3403 * Typically this means once, since it's unlikely that the
3404 * VM system will ask for anything spanning cluster boundaries.
3405 *
3406 * If there are holes in a cluster (in a paging segment), we stop
3407 * reading at the hole, then loop again, hoping to
3408 * find valid pages later in the cluster. This continues until
3409 * the entire range has been examined, and read, if present. The
3410 * pages are written as they are read. If a failure occurs after
3411 * some pages are written the unmap call at the bottom of the loop
3412 * recovers the backing store and the old backing store remains
3413 * in effect.
3414 */
3415
3416 VSM_CLR(write_vsmap);
3417 VSM_CLR(original_read_vsmap);
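	/*
	 * Rough bookkeeping (as used below): original_read_vsmap is a
	 * snapshot of the map entry taken when a cluster is first read,
	 * so it can be restored if a write to the new segment fails;
	 * read_vsmap is the entry just before each write; write_vsmap
	 * accumulates the new mapping until the whole cluster has been
	 * transferred, at which point it is committed and the old
	 * cluster is unmapped.
	 */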
3418 /* grab the actual object's pages to sync with I/O */
3419 while (cnt && (error == KERN_SUCCESS)) {
3420 vsmap_ptr = vs_get_map_entry(vs, offset);
3421 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3422
3423 if (actual_offset == (vm_offset_t) -1) {
3424
3425 /*
3426 * Nothing left to write in this cluster; at least
3427 * set write cluster information for any previous
3428 * write, and clear it for the next cluster, if there is one.
3429 */
3430 unsigned int local_size, clmask, clsize;
3431
3432 clsize = vm_page_size << vs->vs_clshift;
3433 clmask = clsize - 1;
3434 local_size = clsize - (offset & clmask);
3435 ASSERT(local_size);
3436 local_size = MIN(local_size, cnt);
3437
3438 /* This cluster has no data in it beyond what may */
3439 /* have been found on a previous iteration through */
3440 /* the loop "write_vsmap" */
3441 *vsmap_ptr = write_vsmap;
3442 VSM_CLR(write_vsmap);
3443 VSM_CLR(original_read_vsmap);
3444
3445 cnt -= local_size;
3446 offset += local_size;
3447 continue;
3448 }
3449
3450 /*
3451 * Count up contiguous available or unavailable
3452 * pages.
3453 */
3454 ps = CLMAP_PS(clmap);
3455 ASSERT(ps);
3456 size = 0;
3457 unavail_size = 0;
3458 for (i = 0;
3459 (size < cnt) && (unavail_size < cnt) &&
3460 (i < CLMAP_NPGS(clmap)); i++) {
3461 if (CLMAP_ISSET(clmap, i)) {
3462 if (unavail_size != 0)
3463 break;
3464 size += vm_page_size;
3465 BS_STAT(ps->ps_bs,
3466 ps->ps_bs->bs_pages_in++);
3467 } else {
3468 if (size != 0)
3469 break;
3470 unavail_size += vm_page_size;
3471 }
3472 }
3473
3474 if (size == 0) {
3475 ASSERT(unavail_size);
3476 cnt -= unavail_size;
3477 offset += unavail_size;
3478 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3479 == 0) {
3480 /* There is no more to transfer in this
3481 cluster
3482 */
3483 *vsmap_ptr = write_vsmap;
3484 VSM_CLR(write_vsmap);
3485 VSM_CLR(original_read_vsmap);
3486 }
3487 continue;
3488 }
3489
3490 if(VSM_ISCLR(original_read_vsmap))
3491 original_read_vsmap = *vsmap_ptr;
3492
3493 if(ps->ps_segtype == PS_PARTITION) {
3494 /*
3495 NEED TO ISSUE WITH SYNC & NO COMMIT
3496 error = ps_read_device(ps, actual_offset, &buffer,
3497 size, &residual, flags);
3498 */
3499 } else {
3500 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3501 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
3502 size, &residual,
3503 (UPL_IOSYNC | UPL_NOCOMMIT));
3504 }
3505
3506 read_vsmap = *vsmap_ptr;
3507
3508
3509 /*
3510 * Adjust counts and put data in new BS. Optimize for the
3511 * common case, i.e. no error and/or partial data.
3512 * If there was an error, then we need to error the entire
3513 * range, even if some data was successfully read.
3514 *
3515 */
3516 if ((error == KERN_SUCCESS) && (residual == 0)) {
3517 int page_list_count = 0;
3518
3519 /*
3520 * Got everything we asked for, supply the data to
3521 * the new BS. Note that as a side effect of supplying
3522 * the data, the buffer holding the supplied data is
3523 * deallocated from the pager's address space unless
3524 * the write is unsuccessful.
3525 */
3526
3527 /* note: the buffer will be cleaned up in all cases, either by */
3528 /* internal_cluster_write or, if there is an error on write, by */
3529 /* the vm_map_copy_page_discard call */
3530 *vsmap_ptr = write_vsmap;
3531
3532 if(vs_cluster_write(vs, upl, offset,
3533 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3534 error = KERN_FAILURE;
3535 if(!(VSM_ISCLR(*vsmap_ptr))) {
3536 /* unmap the new backing store object */
3537 ps_clunmap(vs, offset, size);
3538 }
3539 /* original vsmap */
3540 *vsmap_ptr = original_read_vsmap;
3541 VSM_CLR(write_vsmap);
3542 } else {
3543 if((offset + size) &
3544 ((vm_page_size << vs->vs_clshift)
3545 - 1)) {
3546 /* There is more to transfer in this
3547 cluster
3548 */
3549 write_vsmap = *vsmap_ptr;
3550 *vsmap_ptr = read_vsmap;
3551 } else {
3552 /* discard the old backing object */
3553 write_vsmap = *vsmap_ptr;
3554 *vsmap_ptr = read_vsmap;
3555 ps_clunmap(vs, offset, size);
3556 *vsmap_ptr = write_vsmap;
3557 VSM_CLR(write_vsmap);
3558 VSM_CLR(original_read_vsmap);
3559 }
3560 }
3561 } else {
3562 size_wanted = size;
3563 if (error == KERN_SUCCESS) {
3564 if (residual == size) {
3565 /*
3566 * If a read operation returns no error
3567 * and no data moved, we turn it into
3568 * an error, assuming we're reading at
3569 * or beyond EOF.
3570 * Fall through and error the entire
3571 * range.
3572 */
3573 error = KERN_FAILURE;
3574 *vsmap_ptr = write_vsmap;
3575 if(!(VSM_ISCLR(*vsmap_ptr))) {
3576 /* unmap the new backing store object */
3577 ps_clunmap(vs, offset, size);
3578 }
3579 *vsmap_ptr = original_read_vsmap;
3580 VSM_CLR(write_vsmap);
3581 continue;
3582 } else {
3583 /*
3584 * Otherwise, we have a partial read.
3585 * This is also considered an error
3586 * for the purposes of cluster transfer
3587 */
3588 error = KERN_FAILURE;
3589 *vsmap_ptr = write_vsmap;
3590 if(!(VSM_ISCLR(*vsmap_ptr))) {
3591 /* unmap the new backing store object */
3592 ps_clunmap(vs, offset, size);
3593 }
3594 *vsmap_ptr = original_read_vsmap;
3595 VSM_CLR(write_vsmap);
3596 continue;
3597 }
3598 }
3599
3600 }
3601 cnt -= size;
3602 offset += size;
3603
3604 } /* END while (cnt && (error == 0)) */
3605 if(!VSM_ISCLR(write_vsmap))
3606 *vsmap_ptr = write_vsmap;
3607
3608 return error;
3609 }
3610
3611 kern_return_t
3612 default_pager_add_file(MACH_PORT_FACE backing_store,
3613 int *vp,
3614 int record_size,
3615 long size)
3616 {
3617 backing_store_t bs;
3618 paging_segment_t ps;
3619 int i;
3620 int error;
3621
3622 if ((bs = backing_store_lookup(backing_store))
3623 == BACKING_STORE_NULL)
3624 return KERN_INVALID_ARGUMENT;
3625
3626 PSL_LOCK();
3627 for (i = 0; i <= paging_segment_max; i++) {
3628 ps = paging_segments[i];
3629 if (ps == PAGING_SEGMENT_NULL)
3630 continue;
3631 if (ps->ps_segtype != PS_FILE)
3632 continue;
3633
3634 /*
3635 * Check for overlap on same device.
3636 */
3637 if (ps->ps_vnode == (struct vnode *)vp) {
3638 PSL_UNLOCK();
3639 BS_UNLOCK(bs);
3640 return KERN_INVALID_ARGUMENT;
3641 }
3642 }
3643 PSL_UNLOCK();
3644
3645 /*
3646 * Set up the paging segment
3647 */
3648 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3649 if (ps == PAGING_SEGMENT_NULL) {
3650 BS_UNLOCK(bs);
3651 return KERN_RESOURCE_SHORTAGE;
3652 }
3653
3654 ps->ps_segtype = PS_FILE;
3655 ps->ps_vnode = (struct vnode *)vp;
3656 ps->ps_offset = 0;
3657 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3658 ps->ps_recnum = size;
3659 ps->ps_pgnum = size >> ps->ps_record_shift;
3660
3661 ps->ps_pgcount = ps->ps_pgnum;
3662 ps->ps_clshift = local_log2(bs->bs_clsize);
3663 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3664 ps->ps_hint = 0;
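	/*
	 * Assumed example values for the arithmetic above (not
	 * necessarily this configuration): with 4 KB pages and a
	 * 512-byte record_size, ps_record_shift == local_log2(8) == 3,
	 * so a file of 'size' records holds size >> 3 pages; with
	 * bs_clsize == 4 pages per cluster, ps_clshift == 2 and
	 * ps_ncls == ps_pgcount >> 2.
	 */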
3665
3666 PS_LOCK_INIT(ps);
3667 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3668 if (!ps->ps_bmap) {
3669 kfree((vm_offset_t)ps, sizeof *ps);
3670 BS_UNLOCK(bs);
3671 return KERN_RESOURCE_SHORTAGE;
3672 }
3673 for (i = 0; i < ps->ps_ncls; i++) {
3674 clrbit(ps->ps_bmap, i);
3675 }
3676
3677 ps->ps_going_away = FALSE;
3678 ps->ps_bs = bs;
3679
3680 if ((error = ps_enter(ps)) != 0) {
3681 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3682 kfree((vm_offset_t)ps, sizeof *ps);
3683 BS_UNLOCK(bs);
3684 return KERN_RESOURCE_SHORTAGE;
3685 }
3686
3687 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3688 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3689 PSL_LOCK();
3690 dp_pages_free += ps->ps_pgcount;
3691 PSL_UNLOCK();
3692
3693 BS_UNLOCK(bs);
3694
3695 bs_more_space(ps->ps_clcount);
3696
3697 DEBUG(DEBUG_BS_INTERNAL,
3698 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3699 device, offset, size, record_size,
3700 ps->ps_record_shift, ps->ps_pgnum));
3701
3702 return KERN_SUCCESS;
3703 }
3704
3705
3706
3707 kern_return_t
3708 ps_read_file(
3709 paging_segment_t ps,
3710 upl_t upl,
3711 vm_offset_t upl_offset,
3712 vm_offset_t offset,
3713 unsigned int size,
3714 unsigned int *residualp,
3715 int flags)
3716 {
3717 vm_object_offset_t f_offset;
3718 int error = 0;
3719 int result;
3720
3721
3722 clustered_reads[atop_32(size)]++;
3723
3724 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3725
3726 /* for transfer case we need to pass uploffset and flags */
3727 error = vnode_pagein(ps->ps_vnode,
3728 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3729
3730 /* The vnode_pagein semantic is somewhat at odds with the existing */
3731 /* device_read semantic. Partial reads are not experienced at this */
3732 /* level. It is up to the bit map code and cluster read code to */
3733 /* check that requested data locations are actually backed, and the */
3734 /* pagein code to either read all of the requested data or return an */
3735 /* error. */
3736
3737 if (error)
3738 result = KERN_FAILURE;
3739 else {
3740 *residualp = 0;
3741 result = KERN_SUCCESS;
3742 }
3743 return result;
3744 }
3745
3746 kern_return_t
3747 ps_write_file(
3748 paging_segment_t ps,
3749 upl_t upl,
3750 vm_offset_t upl_offset,
3751 vm_offset_t offset,
3752 unsigned int size,
3753 int flags)
3754 {
3755 vm_object_offset_t f_offset;
3756 kern_return_t result;
3757
3758 int error = 0;
3759
3760 clustered_writes[atop_32(size)]++;
3761 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3762
3763 if (vnode_pageout(ps->ps_vnode,
3764 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3765 result = KERN_FAILURE;
3766 else
3767 result = KERN_SUCCESS;
3768
3769 return result;
3770 }
3771
3772 kern_return_t
3773 default_pager_triggers(MACH_PORT_FACE default_pager,
3774 int hi_wat,
3775 int lo_wat,
3776 int flags,
3777 MACH_PORT_FACE trigger_port)
3778 {
3779 MACH_PORT_FACE release;
3780 kern_return_t kr;
3781
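	/*
	 * The watermarks arrive in bytes and are stored as page counts
	 * (hi_wat / vm_page_size, lo_wat / vm_page_size).  Each alert
	 * type keeps its own trigger port; any port previously
	 * registered for that alert is released once the lock is
	 * dropped.
	 */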
3782 PSL_LOCK();
3783 if (flags == HI_WAT_ALERT) {
3784 release = min_pages_trigger_port;
3785 min_pages_trigger_port = trigger_port;
3786 minimum_pages_remaining = hi_wat/vm_page_size;
3787 bs_low = FALSE;
3788 kr = KERN_SUCCESS;
3789 } else if (flags == LO_WAT_ALERT) {
3790 release = max_pages_trigger_port;
3791 max_pages_trigger_port = trigger_port;
3792 maximum_pages_free = lo_wat/vm_page_size;
3793 kr = KERN_SUCCESS;
3794 } else {
3795 release = trigger_port;
3796 kr = KERN_INVALID_ARGUMENT;
3797 }
3798 PSL_UNLOCK();
3799
3800 if (IP_VALID(release))
3801 ipc_port_release_send(release);
3802
3803 return kr;
3804 }