]> git.saurik.com Git - apple/xnu.git/blob - osfmk/default_pager/dp_backing_store.c
xnu-201.42.3.tar.gz
[apple/xnu.git] / osfmk / default_pager / dp_backing_store.c
1
2 /*
3 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 *
5 * @APPLE_LICENSE_HEADER_START@
6 *
7 * The contents of this file constitute Original Code as defined in and
8 * are subject to the Apple Public Source License Version 1.1 (the
9 * "License"). You may not use this file except in compliance with the
10 * License. Please obtain a copy of the License at
11 * http://www.apple.com/publicsource and read it before using this file.
12 *
13 * This Original Code and all software distributed under the License are
14 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
18 * License for the specific language governing rights and limitations
19 * under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 * @OSF_COPYRIGHT@
25 */
26 /*
27 * Mach Operating System
28 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
29 * All Rights Reserved.
30 *
31 * Permission to use, copy, modify and distribute this software and its
32 * documentation is hereby granted, provided that both the copyright
33 * notice and this permission notice appear in all copies of the
34 * software, derivative works or modified versions, and any portions
35 * thereof, and that both notices appear in supporting documentation.
36 *
37 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
38 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
39 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
40 *
41 * Carnegie Mellon requests users of this software to return to
42 *
43 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
44 * School of Computer Science
45 * Carnegie Mellon University
46 * Pittsburgh PA 15213-3890
47 *
48 * any improvements or extensions that they make and grant Carnegie Mellon
49 * the rights to redistribute these changes.
50 */
51
52 /*
53 * Default Pager.
54 * Paging File Management.
55 */
56
57 #include <mach/memory_object_control.h>
58 #include <mach/memory_object_server.h>
59 #include "default_pager_internal.h"
60 #include <default_pager/default_pager_alerts.h>
61 #include <ipc/ipc_port.h>
62 #include <ipc/ipc_space.h>
63 #include <kern/queue.h>
64 #include <kern/counters.h>
65 #include <kern/sched_prim.h>
66 #include <vm/vm_kern.h>
67 #include <vm/vm_pageout.h>
68 /* CDY CDY */
69 #include <vm/vm_map.h>
70
/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, than this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future
 */

#define ALLOC_STRIDE  (1024 * 1024 * 1024)
int physical_transfer_cluster_count = 0;

/* size in bytes, and in pages, of the "super cluster" transfer unit */
#define VM_SUPER_CLUSTER	0x20000
#define VM_SUPER_PAGES          32

/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 */
#define VSTRUCT_DEF_CLSHIFT	2
int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
int default_pager_clsize = 0;	/* pages/cluster; set once by bs_get_global_clsize() */

/* statistics: histograms of pages per clustered write/read */
unsigned int clustered_writes[VM_SUPER_PAGES+1];
unsigned int clustered_reads[VM_SUPER_PAGES+1];

/*
 * Globals used for asynchronous paging operations:
 *	vs_async_list:	head of list of to-be-completed I/O ops
 *	async_num_queued:	number of pages completed, but not yet
 *		processed by async thread.
 *	async_requests_out:	number of pages of requests not completed.
 */

#if 0
struct vs_async *vs_async_list;
int	async_num_queued;
int	async_requests_out;
#endif


#define VS_ASYNC_REUSE 1
/* free list of reusable vs_async structures (see vs_alloc_async) */
struct vs_async *vs_async_free_list;

mutex_t default_pager_async_lock;	/* Protects globals above */


int vs_alloc_async_failed = 0;	/* statistics */
int vs_alloc_async_count = 0;	/* statistics */
struct vs_async *vs_alloc_async(void);	/* forward */
void vs_free_async(struct vs_async *vsa);	/* forward */


#define VS_ALLOC_ASYNC()	vs_alloc_async()
#define VS_FREE_ASYNC(vsa)	vs_free_async(vsa)

#define VS_ASYNC_LOCK()		mutex_lock(&default_pager_async_lock)
#define VS_ASYNC_UNLOCK()	mutex_unlock(&default_pager_async_lock)
#define VS_ASYNC_LOCK_INIT()	mutex_init(&default_pager_async_lock, \
					   ETAP_IO_DEV_PAGEH)
#define VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)
/*
 * Paging Space Hysteresis triggers and the target notification port
 *
 */

unsigned int	minimum_pages_remaining = 0;
unsigned int	maximum_pages_free = 0;
ipc_port_t	min_pages_trigger_port = NULL;
ipc_port_t	max_pages_trigger_port = NULL;

boolean_t	bs_low = FALSE;		/* TRUE when paging space is scarce */
int		backing_store_release_trigger_disable = 0;



/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */

/*
 * List of all backing store and segments.
 */
struct backing_store_list_head	backing_store_list;
paging_segment_t	paging_segments[MAX_NUM_PAGING_SEGMENTS];
mutex_t			paging_segments_lock;
int			paging_segment_max = 0;	/* highest occupied index */
int			paging_segment_count = 0;	/* segments registered */
int			ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };


/*
 * Total pages free in system
 * This differs from clusters committed/avail which is a measure of the
 * over commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 */
unsigned int	dp_pages_free = 0;
unsigned int	cluster_transfer_minimum = 100;

kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int);	/* forward */
kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int);	/* forward */
178
179
180 default_pager_thread_t *
181 get_read_buffer()
182 {
183 int i;
184
185 DPT_LOCK(dpt_lock);
186 while(TRUE) {
187 for (i=0; i<default_pager_internal_count; i++) {
188 if(dpt_array[i]->checked_out == FALSE) {
189 dpt_array[i]->checked_out = TRUE;
190 DPT_UNLOCK(dpt_lock);
191 return dpt_array[i];
192 }
193 }
194 assert_wait(&dpt_array, THREAD_UNINT);
195 DPT_UNLOCK(dpt_lock);
196 thread_block((void(*)(void))0);
197 }
198 }
199
200 void
201 bs_initialize(void)
202 {
203 int i;
204
205 /*
206 * List of all backing store.
207 */
208 BSL_LOCK_INIT();
209 queue_init(&backing_store_list.bsl_queue);
210 PSL_LOCK_INIT();
211
212 VS_ASYNC_LOCK_INIT();
213 #if VS_ASYNC_REUSE
214 vs_async_free_list = NULL;
215 #endif /* VS_ASYNC_REUSE */
216
217 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
218 clustered_writes[i] = 0;
219 clustered_reads[i] = 0;
220 }
221
222 }
223
224 /*
225 * When things do not quite workout...
226 */
227 void bs_no_paging_space(boolean_t); /* forward */
228
229 void
230 bs_no_paging_space(
231 boolean_t out_of_memory)
232 {
233
234 if (out_of_memory)
235 dprintf(("*** OUT OF MEMORY ***\n"));
236 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
237 }
238
void bs_more_space(int);	/* forward */
void bs_commit(int);	/* forward */

/* warning/accounting state shared by bs_more_space() and bs_commit() */
boolean_t user_warned = FALSE;
unsigned int clusters_committed = 0;
unsigned int clusters_available = 0;
unsigned int clusters_committed_peak = 0;

/*
 * bs_more_space: account for nclusters of newly added paging space
 * and, if we previously warned about over-commitment, report whether
 * the shortage has been resolved.  All accounting is done under the
 * backing-store list lock.
 */
void
bs_more_space(
	int	nclusters)
{
	BSL_LOCK();
	/*
	 * Account for new paging space.
	 */
	clusters_available += nclusters;

	if (clusters_available >= clusters_committed) {
		if (verbose && user_warned) {
			printf("%s%s - %d excess clusters now.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_available - clusters_committed);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - still short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
			/* NOTE(review): peak is only adjusted when verbose
			   && user_warned — presumably intentional since the
			   peak is only used for reporting; confirm. */
			clusters_committed_peak -= nclusters;
		}
	}
	BSL_UNLOCK();

	return;
}
279
/*
 * bs_commit: account for nclusters of newly committed (negative to
 * release) backing store, warning once when commitments first exceed
 * the available paging space and once when they drop back under.
 */
void
bs_commit(
	int	nclusters)
{
	BSL_LOCK();
	clusters_committed += nclusters;
	if (clusters_committed > clusters_available) {
		/* over-committed: warn only on the transition */
		if (verbose && !user_warned) {
			user_warned = TRUE;
			printf("%s%s - short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
		}
		/* track the worst over-commitment seen for later reporting */
		if (clusters_committed > clusters_committed_peak) {
			clusters_committed_peak = clusters_committed;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - was short of up to %d clusters.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_committed_peak - clusters_available);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	}
	BSL_UNLOCK();

	return;
}
311
int default_pager_info_verbose = 1;	/* also print commit stats in bs_global_info */

/*
 * bs_global_info: return the total and free page counts summed over
 * every registered paging segment.  Values are advisory snapshots;
 * individual segments are not locked during the scan.
 */
void
bs_global_info(
	vm_size_t *totalp,
	vm_size_t *freep)
{
	vm_size_t	pages_total, pages_free;
	paging_segment_t	ps;
	int	i;

	PSL_LOCK();
	pages_total = pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;

		/*
		 * no need to lock: by the time this data
		 * gets back to any remote requestor it
		 * will be obsolete anyways
		 */
		pages_total += ps->ps_pgnum;
		pages_free += ps->ps_clcount << ps->ps_clshift;
		DEBUG(DEBUG_BS_INTERNAL,
		      ("segment #%d: %d total, %d free\n",
		       i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
	}
	*totalp = pages_total;
	*freep = pages_free;
	if (verbose && user_warned && default_pager_info_verbose) {
		if (clusters_available < clusters_committed) {
			printf("%s %d clusters committed, %d available.\n",
			       my_name,
			       clusters_committed,
			       clusters_available);
		}
	}
	PSL_UNLOCK();
}
353
354 backing_store_t backing_store_alloc(void); /* forward */
355
356 backing_store_t
357 backing_store_alloc(void)
358 {
359 backing_store_t bs;
360
361 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
362 if (bs == BACKING_STORE_NULL)
363 panic("backing_store_alloc: no memory");
364
365 BS_LOCK_INIT(bs);
366 bs->bs_port = MACH_PORT_NULL;
367 bs->bs_priority = 0;
368 bs->bs_clsize = 0;
369 bs->bs_pages_total = 0;
370 bs->bs_pages_in = 0;
371 bs->bs_pages_in_fail = 0;
372 bs->bs_pages_out = 0;
373 bs->bs_pages_out_fail = 0;
374
375 return bs;
376 }
377
backing_store_t backing_store_lookup(MACH_PORT_FACE);	/* forward */

/* Even in both the component space and external versions of this pager, */
/* backing_store_lookup will be called from tasks in the application space */
/*
 * backing_store_lookup: find the backing store whose bs_port matches
 * "port".  On success the backing store is returned LOCKED; the
 * caller must BS_UNLOCK() it.  Returns BACKING_STORE_NULL when port
 * is null or no entry matches.
 */
backing_store_t
backing_store_lookup(
	MACH_PORT_FACE port)
{
	backing_store_t	bs;

/*
	port is currently backed with a vs structure in the alias field
	we could create an ISBS alias and a port_is_bs call but frankly
	I see no reason for the test, the bs->port == port check below
	will work properly on junk entries.

	if ((port == MACH_PORT_NULL) || port_is_vs(port))
*/
	if ((port == MACH_PORT_NULL))
		return BACKING_STORE_NULL;

	BSL_LOCK();
	queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
		      bs_links) {
		/* lock each candidate before comparing its port */
		BS_LOCK(bs);
		if (bs->bs_port == port) {
			BSL_UNLOCK();
			/* Success, return it locked. */
			return bs;
		}
		BS_UNLOCK(bs);
	}
	BSL_UNLOCK();
	return BACKING_STORE_NULL;
}
413
414 void backing_store_add(backing_store_t); /* forward */
415
416 void
417 backing_store_add(
418 backing_store_t bs)
419 {
420 MACH_PORT_FACE port = bs->bs_port;
421 MACH_PORT_FACE pset = default_pager_default_set;
422 kern_return_t kr = KERN_SUCCESS;
423
424 if (kr != KERN_SUCCESS)
425 panic("backing_store_add: add to set");
426
427 }
428
429 /*
430 * Set up default page shift, but only if not already
431 * set and argument is within range.
432 */
433 boolean_t
434 bs_set_default_clsize(unsigned int npages)
435 {
436 switch(npages){
437 case 1:
438 case 2:
439 case 4:
440 case 8:
441 if (default_pager_clsize == 0) /* if not yet set */
442 vstruct_def_clshift = local_log2(npages);
443 return(TRUE);
444 }
445 return(FALSE);
446 }
447
int bs_get_global_clsize(int clsize);	/* forward */

/*
 * bs_get_global_clsize: establish (on first call) and return the
 * global cluster size in pages.  "clsize" is the requested size in
 * pages, or NO_CLSIZE to accept the compiled-in default.  The value
 * is fixed on first use for all paging segments; the kernel is told
 * via host_default_memory_manager().
 */
int
bs_get_global_clsize(
	int clsize)
{
	int	i;
	memory_object_default_t	dmm;
	kern_return_t	kr;

	/*
	 * Only allow setting of cluster size once. If called
	 * with no cluster size (default), we use the compiled-in default
	 * for the duration. The same cluster size is used for all
	 * paging segments.
	 */
	if (default_pager_clsize == 0) {
		/*
		 * Keep cluster size in bit shift because it's quicker
		 * arithmetic, and easier to keep at a power of 2.
		 */
		if (clsize != NO_CLSIZE) {
			/* round up to the next power of two, capped */
			for (i = 0; (1 << i) < clsize; i++);
			if (i > MAX_CLUSTER_SHIFT)
				i = MAX_CLUSTER_SHIFT;
			vstruct_def_clshift = i;
		}
		default_pager_clsize = (1 << vstruct_def_clshift);

		/*
		 * Let the user know the new (and definitive) cluster size.
		 */
		if (verbose)
			printf("%scluster size = %d page%s\n",
			       my_name, default_pager_clsize,
			       (default_pager_clsize == 1) ? "" : "s");

		/*
		 * Let the kernel know too, in case it hasn't used the
		 * default value provided in main() yet.
		 */
		dmm = default_pager_object;
		clsize = default_pager_clsize * vm_page_size;	/* in bytes */
		kr = host_default_memory_manager(host_priv_self(),
						 &dmm,
						 clsize);
		memory_object_default_deallocate(dmm);

		if (kr != KERN_SUCCESS) {
			panic("bs_get_global_cl_size:host_default_memory_manager");
		}
		if (dmm != default_pager_object) {
			panic("bs_get_global_cl_size:there is another default pager");
		}
	}
	/* invariant: cluster size is a positive power of two */
	ASSERT(default_pager_clsize > 0 &&
	       (default_pager_clsize & (default_pager_clsize - 1)) == 0);

	return default_pager_clsize;
}
508
/*
 * default_pager_backing_store_create: create a backing store with the
 * given priority and cluster size (in bytes) and return a send right
 * to its representative kernel port in *backing_store.
 *
 * Returns KERN_INVALID_ARGUMENT when "pager" is not this default
 * pager, KERN_RESOURCE_SHORTAGE when the port alias record cannot be
 * allocated, KERN_SUCCESS otherwise.
 */
kern_return_t
default_pager_backing_store_create(
	memory_object_default_t pager,
	int			priority,
	int			clsize,		/* in bytes */
	MACH_PORT_FACE		*backing_store)
{
	backing_store_t	bs;
	MACH_PORT_FACE	port;
	kern_return_t	kr;
	struct vstruct_alias	*alias_struct;

	if (pager != default_pager_object)
		return KERN_INVALID_ARGUMENT;

	bs = backing_store_alloc();
	port = ipc_port_alloc_kernel();
	ipc_port_make_send(port);
	/* NOTE(review): only an assert guards port allocation failure;
	   if it could fail, "bs" would leak here — confirm. */
	assert (port != IP_NULL);

	DEBUG(DEBUG_BS_EXTERNAL,
	      ("priority=%d clsize=%d bs_port=0x%x\n",
	       priority, clsize, (int) backing_store));

	/* tag the port so lookups can map it back to this store */
	alias_struct = (struct vstruct_alias *)
		kalloc(sizeof (struct vstruct_alias));
	if(alias_struct != NULL) {
		alias_struct->vs = (struct vstruct *)bs;
		alias_struct->name = ISVS;
		port->alias = (int) alias_struct;
	}
	else {
		/* no memory for the alias record: undo port and bs */
		ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
		kfree((vm_offset_t)bs, sizeof (struct backing_store));
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_port = port;
	/* clamp the requested priority into the supported range */
	if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
		priority = BS_MAXPRI;
	else if (priority == BS_NOPRI)
		priority = BS_MAXPRI;
	else
		priority = BS_MINPRI;
	bs->bs_priority = priority;

	bs->bs_clsize = bs_get_global_clsize(atop(clsize));

	BSL_LOCK();
	queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
		    bs_links);
	BSL_UNLOCK();

	backing_store_add(bs);

	*backing_store = port;
	return KERN_SUCCESS;
}
567
/*
 * default_pager_backing_store_info: fill in a BACKING_STORE_BASIC_INFO
 * record for the named backing store, combining pager-wide counters
 * from global_stats with this store's own statistics.  The free-page
 * count is recomputed from the store's segments under PSL/PS locks.
 */
kern_return_t
default_pager_backing_store_info(
	MACH_PORT_FACE		backing_store,
	backing_store_flavor_t	flavour,
	backing_store_info_t	info,
	mach_msg_type_number_t	*size)
{
	backing_store_t		bs;
	backing_store_basic_info_t	basic;
	int			i;
	paging_segment_t	ps;

	if (flavour != BACKING_STORE_BASIC_INFO ||
	    *size < BACKING_STORE_BASIC_INFO_COUNT)
		return KERN_INVALID_ARGUMENT;

	basic = (backing_store_basic_info_t)info;
	*size = BACKING_STORE_BASIC_INFO_COUNT;

	/* pager-wide counters first */
	VSTATS_LOCK(&global_stats.gs_lock);
	basic->pageout_calls	= global_stats.gs_pageout_calls;
	basic->pagein_calls	= global_stats.gs_pagein_calls;
	basic->pages_in		= global_stats.gs_pages_in;
	basic->pages_out	= global_stats.gs_pages_out;
	basic->pages_unavail	= global_stats.gs_pages_unavail;
	basic->pages_init	= global_stats.gs_pages_init;
	basic->pages_init_writes= global_stats.gs_pages_init_writes;
	VSTATS_UNLOCK(&global_stats.gs_lock);

	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	/* bs is returned LOCKED by backing_store_lookup() */
	basic->bs_pages_total	= bs->bs_pages_total;
	PSL_LOCK();
	/* recompute free pages from this store's segments */
	bs->bs_pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
			PS_LOCK(ps);
			bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
			PS_UNLOCK(ps);
		}
	}
	PSL_UNLOCK();
	basic->bs_pages_free	= bs->bs_pages_free;
	basic->bs_pages_in	= bs->bs_pages_in;
	basic->bs_pages_in_fail	= bs->bs_pages_in_fail;
	basic->bs_pages_out	= bs->bs_pages_out;
	basic->bs_pages_out_fail= bs->bs_pages_out_fail;

	basic->bs_priority	= bs->bs_priority;
	basic->bs_clsize	= ptoa(bs->bs_clsize);	/* in bytes */

	BS_UNLOCK(bs);

	return KERN_SUCCESS;
}
625
int ps_delete(paging_segment_t);	/* forward */

/*
 * ps_delete: drain paging segment "ps" by walking the live vstruct
 * list and transferring each vstruct's clusters out of the target
 * segment into other segments.  The caller has already marked the
 * segment off-line, so it cannot gain new users while we work.
 *
 * Returns KERN_SUCCESS when the segment has been fully drained, or
 * KERN_FAILURE if a transfer fails (typically not enough free paging
 * space to absorb the segment's contents).
 */
int
ps_delete(
	paging_segment_t ps)
{
	vstruct_t	vs;
	kern_return_t	error = KERN_SUCCESS;
	int		vs_count;

	VSL_LOCK();	/* get the lock on the list of vs's */

	/* The lock relationship and sequence is farily complicated */
	/* this code looks at a live list, locking and unlocking the list */
	/* as it traverses it.  It depends on the locking behavior of */
	/* default_pager_no_senders.  no_senders always locks the vstruct */
	/* targeted for removal before locking the vstruct list.  However */
	/* it will remove that member of the list without locking its */
	/* neighbors.  We can be sure when we hold a lock on a vstruct */
	/* it cannot be removed from the list but we must hold the list */
	/* lock to be sure that its pointers to its neighbors are valid. */
	/* Also, we can hold off destruction of a vstruct when the list */
	/* lock and the vs locks are not being held by bumping the */
	/* vs_async_pending count. */


	/* wait out anyone who has disabled the release trigger */
	while(backing_store_release_trigger_disable != 0) {
		assert_wait((event_t)
			    &backing_store_release_trigger_disable,
			    THREAD_UNINT);
		VSL_UNLOCK();
		thread_block((void (*)(void)) 0);
		VSL_LOCK();
	}

	/* we will choose instead to hold a send right */
	vs_count = vstruct_list.vsl_count;
	vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
	if(vs == (vstruct_t)&vstruct_list) {
		/* empty list: nothing to transfer */
		VSL_UNLOCK();
		return KERN_SUCCESS;
	}
	VS_LOCK(vs);
	vs_async_wait(vs);	/* wait for any pending async writes */
	if ((vs_count != 0) && (vs != NULL))
		vs->vs_async_pending += 1;  /* hold parties calling  */
					    /* vs_async_wait */
	VS_UNLOCK(vs);
	VSL_UNLOCK();
	while((vs_count != 0) && (vs != NULL)) {
		/* We take the count of AMO's before beginning the */
		/* transfer of of the target segment. */
		/* We are guaranteed that the target segment cannot get */
		/* more users.  We also know that queue entries are */
		/* made at the back of the list.  If some of the entries */
		/* we would check disappear while we are traversing the */
		/* list then we will either check new entries which */
		/* do not have any backing store in the target segment */
		/* or re-check old entries.  This might not be optimal */
		/* but it will always be correct. The alternative is to */
		/* take a snapshot of the list. */
		vstruct_t	next_vs;

		if(dp_pages_free < cluster_transfer_minimum)
			error = KERN_FAILURE;
		else {
			vm_object_t	transfer_object;
			int		count;
			upl_t		upl;

			/* stage the move through a temporary VM object */
			transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
			count = 0;
			error = vm_object_upl_request(transfer_object,
				(vm_object_offset_t)0, VM_SUPER_CLUSTER,
				&upl, NULL, &count,
				UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
					    | UPL_SET_INTERNAL);
			if(error == KERN_SUCCESS) {
#ifndef ubc_sync_working
				upl_commit(upl, NULL);
				upl_deallocate(upl);
				error = ps_vstruct_transfer_from_segment(
					vs, ps, transfer_object);
#else
				error = ps_vstruct_transfer_from_segment(
					vs, ps, upl);
				upl_commit(upl, NULL);
				upl_deallocate(upl);
#endif
				vm_object_deallocate(transfer_object);
			} else {
				vm_object_deallocate(transfer_object);
				error = KERN_FAILURE;
			}
		}
		if(error) {
			/* transfer failed: drop our hold and bail out */
			VS_LOCK(vs);
			vs->vs_async_pending -= 1;  /* release vs_async_wait */
			if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
				vs->vs_waiting_async = FALSE;
				VS_UNLOCK(vs);
				thread_wakeup(&vs->vs_async_pending);
			} else {
				VS_UNLOCK(vs);
			}
			return KERN_FAILURE;
		}

		VSL_LOCK();

		while(backing_store_release_trigger_disable != 0) {
			assert_wait((event_t)
				    &backing_store_release_trigger_disable,
				    THREAD_UNINT);
			VSL_UNLOCK();
			thread_block((void (*)(void)) 0);
			VSL_LOCK();
		}

		/* pin the next vstruct before releasing the current one */
		next_vs = (vstruct_t) queue_next(&(vs->vs_links));
		if((next_vs != (vstruct_t)&vstruct_list) &&
		   (vs != next_vs) && (vs_count != 1)) {
			VS_LOCK(next_vs);
			vs_async_wait(next_vs);	/* wait for any */
						/* pending async writes */
			next_vs->vs_async_pending += 1;	/* hold parties */
							/* calling vs_async_wait */
			VS_UNLOCK(next_vs);
		}
		VSL_UNLOCK();
		VS_LOCK(vs);
		vs->vs_async_pending -= 1;
		if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
			vs->vs_waiting_async = FALSE;
			VS_UNLOCK(vs);
			thread_wakeup(&vs->vs_async_pending);
		} else {
			VS_UNLOCK(vs);
		}
		if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
			vs = NULL;
		else
			vs = next_vs;
		vs_count--;
	}
	return KERN_SUCCESS;
}
773
774
/*
 * default_pager_backing_store_delete: delete the backing store named
 * by "backing_store", first draining every paging segment it owns via
 * ps_delete().  If any segment cannot be drained, all of this store's
 * off-lined segments are re-enabled and the error is returned.
 */
kern_return_t
default_pager_backing_store_delete(
	MACH_PORT_FACE backing_store)
{
	backing_store_t	bs;
	int		i;
	paging_segment_t	ps;
	int		error;
	int		interim_pages_removed = 0;
	kern_return_t	kr;

	/* bs is returned LOCKED by backing_store_lookup() */
	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

#if 0
	/* not implemented */
	BS_UNLOCK(bs);
	return KERN_FAILURE;
#endif

    restart:
	PSL_LOCK();
	error = KERN_SUCCESS;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL &&
		    ps->ps_bs == bs &&
		    ! ps->ps_going_away) {
			PS_LOCK(ps);
			/* disable access to this segment */
			ps->ps_going_away = TRUE;
			PS_UNLOCK(ps);
			/*
			 * The "ps" segment is "off-line" now,
			 * we can try and delete it...
			 */
			if(dp_pages_free < (cluster_transfer_minimum
					    + ps->ps_pgcount)) {
				/* not enough free pages elsewhere to
				   absorb this segment's contents */
				error = KERN_FAILURE;
				PSL_UNLOCK();
			}
			else {
				/* remove all pages associated with the */
				/* segment from the list of free pages */
				/* when transfer is through, all target */
				/* segment pages will appear to be free */

				dp_pages_free -=  ps->ps_pgcount;
				interim_pages_removed += ps->ps_pgcount;
				PSL_UNLOCK();
				error = ps_delete(ps);
			}
			if (error != KERN_SUCCESS) {
				/*
				 * We couldn't delete the segment,
				 * probably because there's not enough
				 * virtual memory left.
				 * Re-enable all the segments.
				 */
				PSL_LOCK();
				break;
			}
			/* the scan indices are stale after dropping
			   PSL_LOCK; start the sweep over */
			goto restart;
		}
	}

	if (error != KERN_SUCCESS) {
		/* undo: bring every off-lined segment back on-line */
		for (i = 0; i <= paging_segment_max; i++) {
			ps = paging_segments[i];
			if (ps != PAGING_SEGMENT_NULL &&
			    ps->ps_bs == bs &&
			    ps->ps_going_away) {
				PS_LOCK(ps);
				/* re-enable access to this segment */
				ps->ps_going_away = FALSE;
				PS_UNLOCK(ps);
			}
		}
		dp_pages_free += interim_pages_removed;
		PSL_UNLOCK();
		BS_UNLOCK(bs);
		return error;
	}

	/* all drained: reclaim the segments' metadata */
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL &&
		    ps->ps_bs == bs) {
			if(ps->ps_going_away) {
				paging_segments[i] = PAGING_SEGMENT_NULL;
				paging_segment_count--;
				PS_LOCK(ps);
				kfree((vm_offset_t)ps->ps_bmap,
				      RMAPSIZE(ps->ps_ncls));
				kfree((vm_offset_t)ps, sizeof *ps);
			}
		}
	}

	/* Scan the entire ps array separately to make certain we find the */
	/* proper paging_segment_max */
	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
		if(paging_segments[i] != PAGING_SEGMENT_NULL)
			paging_segment_max = i;
	}

	PSL_UNLOCK();

	/*
	 * All the segments have been deleted.
	 * We can remove the backing store.
	 */

	/*
	 * Disable lookups of this backing store.
	 */
	if((void *)bs->bs_port->alias != NULL)
		kfree((vm_offset_t) bs->bs_port->alias,
		      sizeof (struct vstruct_alias));
	ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
	bs->bs_port = MACH_PORT_NULL;
	BS_UNLOCK(bs);

	/*
	 * Remove backing store from backing_store list.
	 */
	BSL_LOCK();
	queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
		     bs_links);
	BSL_UNLOCK();

	/*
	 * Free the backing store structure.
	 */
	kfree((vm_offset_t)bs, sizeof *bs);

	return KERN_SUCCESS;
}
913
914 int ps_enter(paging_segment_t); /* forward */
915
916 int
917 ps_enter(
918 paging_segment_t ps)
919 {
920 int i;
921
922 PSL_LOCK();
923
924 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
925 if (paging_segments[i] == PAGING_SEGMENT_NULL)
926 break;
927 }
928
929 if (i < MAX_NUM_PAGING_SEGMENTS) {
930 paging_segments[i] = ps;
931 if (i > paging_segment_max)
932 paging_segment_max = i;
933 paging_segment_count++;
934 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
935 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
936 ps_select_array[ps->ps_bs->bs_priority] = 0;
937 i = 0;
938 } else {
939 PSL_UNLOCK();
940 return KERN_RESOURCE_SHORTAGE;
941 }
942
943 PSL_UNLOCK();
944 return i;
945 }
946
#ifdef DEVICE_PAGING
/*
 * default_pager_add_segment: carve a paging segment out of the raw
 * device range [offset, offset+count) — counted in records of
 * "record_size" bytes — and attach it to the given backing store.
 * Rejects ranges that overlap an existing segment on the same device.
 */
kern_return_t
default_pager_add_segment(
	MACH_PORT_FACE	backing_store,
	MACH_PORT_FACE	device,
	recnum_t	offset,
	recnum_t	count,
	int		record_size)
{
	backing_store_t		bs;
	paging_segment_t	ps;
	int			i;
	int			error;

	/* bs is returned LOCKED by backing_store_lookup() */
	if ((bs = backing_store_lookup(backing_store))
	    == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	PSL_LOCK();
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;

		/*
		 * Check for overlap on same device.
		 */
		if (!(ps->ps_device != device
		      || offset >= ps->ps_offset + ps->ps_recnum
		      || offset + count <= ps->ps_offset)) {
			PSL_UNLOCK();
			BS_UNLOCK(bs);
			return KERN_INVALID_ARGUMENT;
		}
	}
	PSL_UNLOCK();

	/*
	 * Set up the paging segment
	 */
	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	if (ps == PAGING_SEGMENT_NULL) {
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}

	ps->ps_segtype = PS_PARTITION;
	ps->ps_device = device;
	ps->ps_offset = offset;
	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	ps->ps_recnum = count;
	ps->ps_pgnum = count >> ps->ps_record_shift;

	ps->ps_pgcount = ps->ps_pgnum;
	ps->ps_clshift = local_log2(bs->bs_clsize);
	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
	ps->ps_hint = 0;

	PS_LOCK_INIT(ps);
	/* allocate and clear the cluster-allocation bitmap */
	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	if (!ps->ps_bmap) {
		kfree((vm_offset_t)ps, sizeof *ps);
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}
	for (i = 0; i < ps->ps_ncls; i++) {
		clrbit(ps->ps_bmap, i);
	}

	ps->ps_going_away = FALSE;
	ps->ps_bs = bs;

	if ((error = ps_enter(ps)) != 0) {
		/* segment table full: undo the allocations */
		kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
		kfree((vm_offset_t)ps, sizeof *ps);
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
	BS_UNLOCK(bs);

	PSL_LOCK();
	dp_pages_free += ps->ps_pgcount;
	PSL_UNLOCK();

	bs_more_space(ps->ps_clcount);

	DEBUG(DEBUG_BS_INTERNAL,
	      ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
	       device, offset, count, record_size,
	       ps->ps_record_shift, ps->ps_pgnum));

	return KERN_SUCCESS;
}
1043
/*
 * bs_add_device: open the named device via the master device port,
 * query its size, create a backing store covering the whole device
 * and add it as a single paging segment.  Returns TRUE on success,
 * FALSE on any failure.
 *
 * NOTE(review): on the success path the send right for "device" is
 * not released and "reply_port" is never used — presumably the
 * device right is retained by the new segment; confirm.
 */
boolean_t
bs_add_device(
	char		*dev_name,
	MACH_PORT_FACE	master)
{
	security_token_t	null_security_token = {
		{ 0, 0 }
	};
	MACH_PORT_FACE	device;
	int		info[DEV_GET_SIZE_COUNT];
	mach_msg_type_number_t	info_count;
	MACH_PORT_FACE	bs = MACH_PORT_NULL;
	unsigned int	rec_size;
	recnum_t	count;
	int		clsize;
	MACH_PORT_FACE	reply_port;

	if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
				null_security_token, dev_name, &device))
		return FALSE;

	info_count = DEV_GET_SIZE_COUNT;
	if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
		/* device size is reported in records; convert as needed */
		rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
		count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
		clsize = bs_get_global_clsize(0);
		if (!default_pager_backing_store_create(
				default_pager_object,
				DEFAULT_PAGER_BACKING_STORE_MAXPRI,
				(clsize * vm_page_size),
				&bs)) {
			if (!default_pager_add_segment(bs, device,
						       0, count, rec_size)) {
				return TRUE;
			}
			ipc_port_release_receive(bs);
		}
	}

	/* failure: drop the right obtained by the device open */
	ipc_port_release_send(device);
	return FALSE;
}
#endif	/* DEVICE_PAGING */
1087
#if VS_ASYNC_REUSE

/*
 * vs_alloc_async: pop a vs_async structure off the free list, or —
 * when the list is empty — allocate a fresh one along with a kernel
 * reply port tagged with a vstruct_alias record.  Returns NULL if
 * memory for the structure or its alias record cannot be obtained.
 */
struct vs_async *
vs_alloc_async(void)
{
	struct vs_async	*vsa;
	MACH_PORT_FACE	reply_port;
	kern_return_t	kr;

	VS_ASYNC_LOCK();
	if (vs_async_free_list == NULL) {
		VS_ASYNC_UNLOCK();
		vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
		if (vsa != NULL) {
			/*
			 * Try allocating a reply port named after the
			 * address of the vs_async structure.
			 */
			struct vstruct_alias	*alias_struct;

			/* NOTE(review): reply_port is not checked against
			   IP_NULL before its alias field is written —
			   confirm allocation cannot fail here. */
			reply_port = ipc_port_alloc_kernel();
			alias_struct = (struct vstruct_alias *)
				kalloc(sizeof (struct vstruct_alias));
			if(alias_struct != NULL) {
				alias_struct->vs = (struct vstruct *)vsa;
				alias_struct->name = ISVS;
				reply_port->alias = (int) alias_struct;
				vsa->reply_port = reply_port;
				vs_alloc_async_count++;
			}
			else {
				/* no memory for the alias: undo everything */
				vs_alloc_async_failed++;
				ipc_port_dealloc_kernel((MACH_PORT_FACE)
							(reply_port));
				kfree((vm_offset_t)vsa,
				      sizeof (struct vs_async));
				vsa = NULL;
			}
		}
	} else {
		/* reuse a previously freed structure (and its port) */
		vsa = vs_async_free_list;
		vs_async_free_list = vs_async_free_list->vsa_next;
		VS_ASYNC_UNLOCK();
	}

	return vsa;
}
1135
/*
 * vs_free_async: return a vs_async structure (with its reply port
 * still attached) to the free list for reuse by vs_alloc_async().
 */
void
vs_free_async(
	struct vs_async *vsa)
{
	VS_ASYNC_LOCK();
	vsa->vsa_next = vs_async_free_list;
	vs_async_free_list = vsa;
	VS_ASYNC_UNLOCK();
}
1145
1146 #else /* VS_ASYNC_REUSE */
1147
1148 struct vs_async *
1149 vs_alloc_async(void)
1150 {
1151 struct vs_async *vsa;
1152 MACH_PORT_FACE reply_port;
1153 kern_return_t kr;
1154
1155 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1156 if (vsa != NULL) {
1157 /*
1158 * Try allocating a reply port named after the
1159 * address of the vs_async structure.
1160 */
1161 reply_port = ipc_port_alloc_kernel();
1162 alias_struct = (vstruct_alias *)
1163 kalloc(sizeof (struct vstruct_alias));
1164 if(alias_struct != NULL) {
1165 alias_struct->vs = reply_port;
1166 alias_struct->name = ISVS;
1167 reply_port->alias = (int) vsa;
1168 vsa->reply_port = reply_port;
1169 vs_alloc_async_count++;
1170 }
1171 else {
1172 vs_alloc_async_failed++;
1173 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1174 (reply_port));
1175 kfree((vm_offset_t) vsa,
1176 sizeof (struct vs_async));
1177 vsa = NULL;
1178 }
1179 }
1180
1181 return vsa;
1182 }
1183
1184 void
1185 vs_free_async(
1186 struct vs_async *vsa)
1187 {
1188 MACH_PORT_FACE reply_port;
1189 kern_return_t kr;
1190
1191 reply_port = vsa->reply_port;
1192 kfree((vm_offset_t) reply_port->alias, sizeof (struct vstuct_alias));
1193 kfree((vm_offset_t) vsa, sizeof (struct vs_async));
1194 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1195 #if 0
1196 VS_ASYNC_LOCK();
1197 vs_alloc_async_count--;
1198 VS_ASYNC_UNLOCK();
1199 #endif
1200 }
1201
1202 #endif /* VS_ASYNC_REUSE */
1203
1204 zone_t vstruct_zone;
1205
1206 vstruct_t
1207 ps_vstruct_create(
1208 vm_size_t size)
1209 {
1210 vstruct_t vs;
1211 int i;
1212
1213 vs = (vstruct_t) zalloc(vstruct_zone);
1214 if (vs == VSTRUCT_NULL) {
1215 return VSTRUCT_NULL;
1216 }
1217
1218 VS_LOCK_INIT(vs);
1219
1220 /*
1221 * The following fields will be provided later.
1222 */
1223 vs->vs_mem_obj = NULL;
1224 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1225 vs->vs_references = 1;
1226 vs->vs_seqno = 0;
1227
1228 #ifdef MACH_KERNEL
1229 vs->vs_waiting_seqno = FALSE;
1230 vs->vs_waiting_read = FALSE;
1231 vs->vs_waiting_write = FALSE;
1232 vs->vs_waiting_async = FALSE;
1233 #else
1234 mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
1235 mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
1236 mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
1237 mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
1238 mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
1239 #endif
1240
1241 vs->vs_readers = 0;
1242 vs->vs_writers = 0;
1243
1244 vs->vs_errors = 0;
1245
1246 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1247 vs->vs_size = ((atop(round_page(size)) - 1) >> vs->vs_clshift) + 1;
1248 vs->vs_async_pending = 0;
1249
1250 /*
1251 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1252 * depending on the size of the memory object.
1253 */
1254 if (INDIRECT_CLMAP(vs->vs_size)) {
1255 vs->vs_imap = (struct vs_map **)
1256 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1257 vs->vs_indirect = TRUE;
1258 } else {
1259 vs->vs_dmap = (struct vs_map *)
1260 kalloc(CLMAP_SIZE(vs->vs_size));
1261 vs->vs_indirect = FALSE;
1262 }
1263 vs->vs_xfer_pending = FALSE;
1264 DEBUG(DEBUG_VS_INTERNAL,
1265 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1266
1267 /*
1268 * Check to see that we got the space.
1269 */
1270 if (!vs->vs_dmap) {
1271 kfree((vm_offset_t)vs, sizeof *vs);
1272 return VSTRUCT_NULL;
1273 }
1274
1275 /*
1276 * Zero the indirect pointers, or clear the direct pointers.
1277 */
1278 if (vs->vs_indirect)
1279 memset(vs->vs_imap, 0,
1280 INDIRECT_CLMAP_SIZE(vs->vs_size));
1281 else
1282 for (i = 0; i < vs->vs_size; i++)
1283 VSM_CLR(vs->vs_dmap[i]);
1284
1285 VS_MAP_LOCK_INIT(vs);
1286
1287 bs_commit(vs->vs_size);
1288
1289 return vs;
1290 }
1291
1292 paging_segment_t ps_select_segment(int, int *); /* forward */
1293
/*
 * ps_select_segment: choose a paging segment with at least one free
 * cluster whose cluster size is >= (1 << shift), preferring higher
 * backing-store priorities.  On success the segment's free-cluster
 * count has already been decremented (the caller owns one reserved
 * cluster) and *psindex is set to the segment's slot in
 * paging_segments[].  Returns PAGING_SEGMENT_NULL if nothing suitable
 * exists.
 *
 * Locking: called and returns with no locks held; takes the
 * segment-list lock (PSL) and per-segment locks (PS) internally.  Any
 * low-space trigger notification is sent only after all locks are
 * dropped.
 */
paging_segment_t
ps_select_segment(
	int	shift,
	int	*psindex)
{
	paging_segment_t	ps;
	int			i;
	int			j;

	/*
	 * Optimize case where there's only one segment.
	 * paging_segment_max will index the one and only segment.
	 */

	PSL_LOCK();
	if (paging_segment_count == 1) {
		paging_segment_t lps;	/* used to avoid extra PS_UNLOCK */
		ipc_port_t trigger = IP_NULL;

		ps = paging_segments[paging_segment_max];
		*psindex = paging_segment_max;
		PS_LOCK(ps);
		if (ps->ps_going_away) {
			/* this segment is being turned off */
			lps = PAGING_SEGMENT_NULL;
		} else {
			ASSERT(ps->ps_clshift >= shift);
			if (ps->ps_clcount) {
				/* Reserve one cluster for the caller. */
				ps->ps_clcount--;
				dp_pages_free -= 1 << ps->ps_clshift;
				/*
				 * Crossed below the low-space threshold:
				 * take the trigger port (one-shot) for
				 * notification after unlock.
				 */
				if(min_pages_trigger_port &&
					(dp_pages_free <
						minimum_pages_remaining)) {
					trigger = min_pages_trigger_port;
					min_pages_trigger_port = NULL;
					bs_low = TRUE;
				}
				lps = ps;
			} else
				lps = PAGING_SEGMENT_NULL;
		}
		PS_UNLOCK(ps);
		PSL_UNLOCK();

		if (trigger != IP_NULL) {
			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}
		return lps;
	}

	if (paging_segment_count == 0) {
		PSL_UNLOCK();
		return PAGING_SEGMENT_NULL;
	}

	/* Scan priorities from highest to lowest. */
	for (i = BS_MAXPRI;
	     i >= BS_MINPRI; i--) {
		int start_index;

		if ((ps_select_array[i] == BS_NOPRI) ||
		    (ps_select_array[i] == BS_FULLPRI))
			continue;
		start_index = ps_select_array[i];

		if(!(paging_segments[start_index])) {
			j = start_index+1;
			physical_transfer_cluster_count = 0;
		}
		else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
				(((paging_segments[start_index])->ps_clshift)
				 + vm_page_shift))) {
			/*
			 * ALLOC_STRIDE bytes have been streamed to the
			 * current segment; advance to the next one to
			 * spread the load.
			 */
			physical_transfer_cluster_count = 0;
			j = start_index + 1;
		} else {
			/* Keep using the current segment. */
			physical_transfer_cluster_count+=1;
			j = start_index;
			if(start_index == 0)
				start_index = paging_segment_max;
			else
				start_index = start_index - 1;
		}

		while (1) {
			if (j > paging_segment_max)
				j = 0;	/* wrap around the segment array */
			if ((ps = paging_segments[j]) &&
			    (ps->ps_bs->bs_priority == i)) {
				/*
				 * Force the ps cluster size to be
				 * >= that of the vstruct.
				 */
				PS_LOCK(ps);
				if (ps->ps_going_away) {
					/* this segment is being turned off */
				} else if ((ps->ps_clcount) &&
					   (ps->ps_clshift >= shift)) {
					ipc_port_t trigger = IP_NULL;

					/* Reserve one cluster for caller. */
					ps->ps_clcount--;
					dp_pages_free -= 1 << ps->ps_clshift;
					if(min_pages_trigger_port &&
						(dp_pages_free <
						 minimum_pages_remaining)) {
						trigger = min_pages_trigger_port;
						min_pages_trigger_port = NULL;
					}
					PS_UNLOCK(ps);
					/*
					 * found one, quit looking.
					 */
					ps_select_array[i] = j;
					PSL_UNLOCK();

					/* Notify after all locks dropped. */
					if (trigger != IP_NULL) {
						default_pager_space_alert(
							trigger,
							HI_WAT_ALERT);
						ipc_port_release_send(trigger);
					}
					*psindex = j;
					return ps;
				}
				PS_UNLOCK(ps);
			}
			if (j == start_index) {
				/*
				 * none at this priority -- mark it full
				 */
				ps_select_array[i] = BS_FULLPRI;
				break;
			}
			j++;
		}
	}
	PSL_UNLOCK();
	return PAGING_SEGMENT_NULL;
}
1431
1432 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1433
/*
 * ps_allocate_cluster: reserve and allocate one cluster for vstruct
 * `vs'.  If `use_ps' is non-null that segment is used directly;
 * otherwise ps_select_segment() picks one matching the vstruct's
 * cluster shift and returns its index through *psindex.  Returns the
 * cluster number within the chosen segment, or (vm_offset_t)-1 when
 * no paging space is available (after firing the low-space trigger,
 * if armed).
 */
vm_offset_t
ps_allocate_cluster(
	vstruct_t		vs,
	int			*psindex,
	paging_segment_t	use_ps)
{
	int			byte_num;
	int			bit_num = 0;
	paging_segment_t	ps;
	vm_offset_t		cluster;
	ipc_port_t		trigger = IP_NULL;

	/*
	 * Find best paging segment.
	 * ps_select_segment will decrement cluster count on ps.
	 * Must pass cluster shift to find the most appropriate segment.
	 */
	/* NOTE: The addition of paging segment delete capability threatened
	 * to seriously complicate the treatment of paging segments in this
	 * module and the ones that call it (notably ps_clmap), because of the
	 * difficulty in assuring that the paging segment would continue to
	 * exist between being unlocked and locked. This was
	 * avoided because all calls to this module are based in either
	 * dp_memory_object calls which rely on the vs lock, or by
	 * the transfer function which is part of the segment delete path.
	 * The transfer function which is part of paging segment delete is
	 * protected from multiple callers by the backing store lock.
	 * The paging segment delete function treats mappings to a paging
	 * segment on a vstruct by vstruct basis, locking the vstruct targeted
	 * while data is transferred to the remaining segments. This is in
	 * line with the view that incomplete or in-transition mappings between
	 * data, a vstruct, and backing store are protected by the vs lock.
	 * This and the ordering of the paging segment "going_away" bit setting
	 * protects us.
	 */
	if (use_ps != PAGING_SEGMENT_NULL) {
		/* Caller chose the segment: reserve a cluster ourselves. */
		ps = use_ps;
		PSL_LOCK();
		PS_LOCK(ps);
		ps->ps_clcount--;
		dp_pages_free -= 1 << ps->ps_clshift;
		/* Arm a one-shot low-space notification if we dropped
		 * below the threshold. */
		if(min_pages_trigger_port &&
		   (dp_pages_free < minimum_pages_remaining)) {
			trigger = min_pages_trigger_port;
			min_pages_trigger_port = NULL;
		}
		PSL_UNLOCK();
		PS_UNLOCK(ps);
		if (trigger != IP_NULL) {
			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}

	} else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
		   PAGING_SEGMENT_NULL) {
#if 0
		bs_no_paging_space(TRUE);
#endif
#if 0
		if (verbose)
#endif
			dprintf(("no space in available paging segments; "
				 "swapon suggested\n"));
		/* the count got off maybe, reset to zero */
		PSL_LOCK();
		dp_pages_free = 0;
		if(min_pages_trigger_port) {
			trigger = min_pages_trigger_port;
			min_pages_trigger_port = NULL;
			bs_low = TRUE;
		}
		PSL_UNLOCK();
		if (trigger != IP_NULL) {
			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}
		return (vm_offset_t) -1;
	}
	/* A cluster was reserved above, so one must be findable. */
	ASSERT(ps->ps_clcount != 0);

	/*
	 * Look for an available cluster.  At the end of the loop,
	 * byte_num is the byte offset and bit_num is the bit offset of the
	 * first zero bit in the paging segment bitmap.
	 */
	PS_LOCK(ps);
	byte_num = ps->ps_hint;	/* start from the allocation hint */
	for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
		if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
			for (bit_num = 0; bit_num < NBBY; bit_num++) {
				if (isclr((ps->ps_bmap + byte_num), bit_num))
					break;
			}
			ASSERT(bit_num != NBBY);
			break;
		}
	}
	ps->ps_hint = byte_num;
	cluster = (byte_num*NBBY) + bit_num;

	/* Space was reserved, so this must be true */
	ASSERT(cluster < ps->ps_ncls);

	setbit(ps->ps_bmap, cluster);
	PS_UNLOCK(ps);

	return cluster;
}
1542
1543 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1544
/*
 * ps_deallocate_cluster: return cluster number `cluster' to paging
 * segment `ps': clear its bitmap bit, bump the segment's free count
 * and the global free-page count, move the allocation hint back if
 * appropriate, and un-mark the segment's priority as full.  If enough
 * space was freed and a high-water trigger port is armed, send a
 * LO_WAT_ALERT after all locks are dropped (waiting first if trigger
 * delivery is temporarily disabled).
 */
void
ps_deallocate_cluster(
	paging_segment_t	ps,
	vm_offset_t		cluster)
{
	ipc_port_t trigger = IP_NULL;

	if (cluster >= (vm_offset_t) ps->ps_ncls)
		panic("ps_deallocate_cluster: Invalid cluster number");

	/*
	 * Lock the paging segment, clear the cluster's bitmap and increment the
	 * number of free cluster.
	 */
	PSL_LOCK();
	PS_LOCK(ps);
	clrbit(ps->ps_bmap, cluster);
	++ps->ps_clcount;
	dp_pages_free += 1 << ps->ps_clshift;
	/* Take the trigger port (one-shot) if we rose above the
	 * high-water mark and delivery is not disabled. */
	if(max_pages_trigger_port
	   && (backing_store_release_trigger_disable == 0)
	   && (dp_pages_free > maximum_pages_free)) {
		trigger = max_pages_trigger_port;
		max_pages_trigger_port = NULL;
	}
	PSL_UNLOCK();

	/*
	 * Move the hint down to the freed cluster if it is
	 * less than the current hint.
	 */
	if ((cluster/NBBY) < ps->ps_hint) {
		ps->ps_hint = (cluster/NBBY);
	}

	PS_UNLOCK(ps);

	/*
	 * If we're freeing space on a full priority, reset the array.
	 */
	PSL_LOCK();
	if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
		ps_select_array[ps->ps_bs->bs_priority] = 0;
	PSL_UNLOCK();

	if (trigger != IP_NULL) {
		/*
		 * If trigger delivery got disabled after we captured the
		 * port, block until it is re-enabled before notifying.
		 */
		VSL_LOCK();
		if(backing_store_release_trigger_disable != 0) {
			assert_wait((event_t)
				    &backing_store_release_trigger_disable,
				    THREAD_UNINT);
			VSL_UNLOCK();
			thread_block((void (*)(void)) 0);
		} else {
			VSL_UNLOCK();
		}
		default_pager_space_alert(trigger, LO_WAT_ALERT);
		ipc_port_release_send(trigger);
	}

	return;
}
1607
1608 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1609
1610 void
1611 ps_dealloc_vsmap(
1612 struct vs_map *vsmap,
1613 vm_size_t size)
1614 {
1615 int i;
1616 for (i = 0; i < size; i++)
1617 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1618 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1619 VSM_CLOFF(vsmap[i]));
1620 }
1621
1622 void
1623 ps_vstruct_dealloc(
1624 vstruct_t vs)
1625 {
1626 int i;
1627 spl_t s;
1628
1629 VS_MAP_LOCK(vs);
1630
1631 /*
1632 * If this is an indirect structure, then we walk through the valid
1633 * (non-zero) indirect pointers and deallocate the clusters
1634 * associated with each used map entry (via ps_dealloc_vsmap).
1635 * When all of the clusters in an indirect block have been
1636 * freed, we deallocate the block. When all of the indirect
1637 * blocks have been deallocated we deallocate the memory
1638 * holding the indirect pointers.
1639 */
1640 if (vs->vs_indirect) {
1641 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1642 if (vs->vs_imap[i] != NULL) {
1643 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1644 kfree((vm_offset_t)vs->vs_imap[i],
1645 CLMAP_THRESHOLD);
1646 }
1647 }
1648 kfree((vm_offset_t)vs->vs_imap,
1649 INDIRECT_CLMAP_SIZE(vs->vs_size));
1650 } else {
1651 /*
1652 * Direct map. Free used clusters, then memory.
1653 */
1654 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1655 kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1656 }
1657 VS_MAP_UNLOCK(vs);
1658
1659 bs_commit(- vs->vs_size);
1660
1661 zfree(vstruct_zone, (vm_offset_t)vs);
1662 }
1663
1664 int ps_map_extend(vstruct_t, int); /* forward */
1665
/*
 * ps_map_extend: grow vstruct `vs' so its cluster map covers
 * `new_size' clusters, converting from a direct to an indirect map
 * when the new size crosses the threshold.  Existing entries are
 * preserved.  Returns 0 on success (including when the map is already
 * big enough) and -1 on allocation failure, leaving the old map
 * intact.  Caller is expected to hold the VS map lock (this routine
 * touches vs_dmap/vs_imap/vs_size directly) -- TODO confirm with
 * callers such as ps_clmap().
 */
int ps_map_extend(
	vstruct_t	vs,
	int		new_size)
{
	struct vs_map	**new_imap;
	struct vs_map	*new_dmap = NULL;
	int		newdsize;
	int		i;
	void		*old_map = NULL;
	int		old_map_size = 0;

	if (vs->vs_size >= new_size) {
		/*
		 * Someone has already done the work.
		 */
		return 0;
	}

	/*
	 * If the new size extends into the indirect range, then we have one
	 * of two cases: we are going from indirect to indirect, or we are
	 * going from direct to indirect.  If we are going from indirect to
	 * indirect, then it is possible that the new size will fit in the old
	 * indirect map.  If this is the case, then just reset the size of the
	 * vstruct map and we are done.  If the new size will not
	 * fit into the old indirect map, then we have to allocate a new
	 * indirect map and copy the old map pointers into this new map.
	 *
	 * If we are going from direct to indirect, then we have to allocate a
	 * new indirect map and copy the old direct pages into the first
	 * indirect page of the new map.
	 * NOTE: allocating memory here is dangerous, as we're in the
	 * pageout path.
	 */
	if (INDIRECT_CLMAP(new_size)) {
		int new_map_size = INDIRECT_CLMAP_SIZE(new_size);

		/*
		 * Get a new indirect map and zero it.
		 */
		old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
		if (vs->vs_indirect &&
		    (new_map_size == old_map_size)) {
			/* New size fits in the existing indirect map. */
			bs_commit(new_size - vs->vs_size);
			vs->vs_size = new_size;
			return 0;
		}

		new_imap = (struct vs_map **)kalloc(new_map_size);
		if (new_imap == NULL) {
			return -1;
		}
		memset(new_imap, 0, new_map_size);

		if (vs->vs_indirect) {
			/* Copy old entries into new map */
			memcpy(new_imap, vs->vs_imap, old_map_size);
			/* Arrange to free the old map */
			old_map = (void *) vs->vs_imap;
			newdsize = 0;	/* no direct entries to migrate */
		} else {	/* Old map was a direct map */
			/* Allocate an indirect page */
			if ((new_imap[0] = (struct vs_map *)
			     kalloc(CLMAP_THRESHOLD)) == NULL) {
				kfree((vm_offset_t)new_imap, new_map_size);
				return -1;
			}
			/* Old direct entries go into indirect block 0. */
			new_dmap = new_imap[0];
			newdsize = CLMAP_ENTRIES;
		}
	} else {
		new_imap = NULL;
		newdsize = new_size;
		/*
		 * If the new map is a direct map, then the old map must
		 * also have been a direct map.  All we have to do is
		 * to allocate a new direct map, copy the old entries
		 * into it and free the old map.
		 */
		if ((new_dmap = (struct vs_map *)
		     kalloc(CLMAP_SIZE(new_size))) == NULL) {
			return -1;
		}
	}
	if (newdsize) {

		/* Free the old map */
		old_map = (void *) vs->vs_dmap;
		old_map_size = CLMAP_SIZE(vs->vs_size);

		/* Copy info from the old map into the new map */
		memcpy(new_dmap, vs->vs_dmap, old_map_size);

		/* Initialize the rest of the new map */
		for (i = vs->vs_size; i < newdsize; i++)
			VSM_CLR(new_dmap[i]);
	}
	/* Install the new map, then release the old one. */
	if (new_imap) {
		vs->vs_imap = new_imap;
		vs->vs_indirect = TRUE;
	} else
		vs->vs_dmap = new_dmap;
	bs_commit(new_size - vs->vs_size);
	vs->vs_size = new_size;
	if (old_map)
		kfree((vm_offset_t)old_map, old_map_size);
	return 0;
}
1774
/*
 * ps_clmap: look up -- and, unless flag is CL_FIND, allocate -- the
 * backing-store cluster mapping byte offset `offset' of vstruct `vs',
 * filling in *clmap with the cluster's paging segment, page count and
 * bitmaps.
 *
 *	flag	CL_FIND:  lookup only; never allocates.  With a nonzero
 *			  `size' this is the write-completion path: set
 *			  valid bits or record `error' on the cluster.
 *		CL_ALLOC: allocate as needed and mark `size' bytes worth
 *			  of pages in the allocation bitmap.
 *
 * Returns the byte offset within the paging segment of the requested
 * page, (vm_offset_t)-1 on lookup/allocation failure, or 0 on the
 * write-completion (CL_FIND with size) path.
 */
vm_offset_t
ps_clmap(
	vstruct_t	vs,
	vm_offset_t	offset,
	struct clmap	*clmap,
	int		flag,
	vm_size_t	size,
	int		error)
{
	vm_offset_t	cluster;	/* The cluster of offset. */
	vm_offset_t	newcl;		/* The new cluster allocated. */
	vm_offset_t	newoff;
	int		i;
	struct vs_map	*vsmap;

	VS_MAP_LOCK(vs);

	ASSERT(vs->vs_dmap);
	/* Cluster index = page number >> cluster shift. */
	cluster = atop(offset) >> vs->vs_clshift;

	/*
	 * Initialize cluster error value
	 */
	clmap->cl_error = 0;

	/*
	 * If the object has grown, extend the page map.
	 */
	if (cluster >= vs->vs_size) {
		if (flag == CL_FIND) {
			/* Do not allocate if just doing a lookup */
			VS_MAP_UNLOCK(vs);
			return (vm_offset_t) -1;
		}
		if (ps_map_extend(vs, cluster + 1)) {
			VS_MAP_UNLOCK(vs);
			return (vm_offset_t) -1;
		}
	}

	/*
	 * Look for the desired cluster.  If the map is indirect, then we
	 * have a two level lookup.  First find the indirect block, then
	 * find the actual cluster.  If the indirect block has not yet
	 * been allocated, then do so.  If the cluster has not yet been
	 * allocated, then do so.
	 *
	 * If any of the allocations fail, then return an error.
	 * Don't allocate if just doing a lookup.
	 */
	if (vs->vs_indirect) {
		long	ind_block = cluster/CLMAP_ENTRIES;

		/* Is the indirect block allocated? */
		vsmap = vs->vs_imap[ind_block];
		if (vsmap == NULL) {
			if (flag == CL_FIND) {
				VS_MAP_UNLOCK(vs);
				return (vm_offset_t) -1;
			}

			/* Allocate the indirect block */
			vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
			if (vsmap == NULL) {
				VS_MAP_UNLOCK(vs);
				return (vm_offset_t) -1;
			}
			/* Initialize the cluster offsets */
			for (i = 0; i < CLMAP_ENTRIES; i++)
				VSM_CLR(vsmap[i]);
			vs->vs_imap[ind_block] = vsmap;
		}
	} else
		vsmap = vs->vs_dmap;

	ASSERT(vsmap);
	vsmap += cluster%CLMAP_ENTRIES;

	/*
	 * At this point, vsmap points to the struct vs_map desired.
	 *
	 * Look in the map for the cluster, if there was an error on a
	 * previous write, flag it and return.  If it is not yet
	 * allocated, then allocate it, if we're writing; if we're
	 * doing a lookup and the cluster's not allocated, return error.
	 */
	if (VSM_ISERR(*vsmap)) {
		clmap->cl_error = VSM_GETERR(*vsmap);
		VS_MAP_UNLOCK(vs);
		return (vm_offset_t) -1;
	} else if (VSM_ISCLR(*vsmap)) {
		int psindex;

		if (flag == CL_FIND) {
			/*
			 * If there's an error and the entry is clear, then
			 * we've run out of swap space.  Record the error
			 * here and return.
			 */
			if (error) {
				VSM_SETERR(*vsmap, error);
			}
			VS_MAP_UNLOCK(vs);
			return (vm_offset_t) -1;
		} else {
			/*
			 * Attempt to allocate a cluster from the paging segment
			 */
			newcl = ps_allocate_cluster(vs, &psindex,
						    PAGING_SEGMENT_NULL);
			if (newcl == -1) {
				VS_MAP_UNLOCK(vs);
				return (vm_offset_t) -1;
			}
			VSM_CLR(*vsmap);
			VSM_SETCLOFF(*vsmap, newcl);
			VSM_SETPS(*vsmap, psindex);
		}
	} else
		newcl = VSM_CLOFF(*vsmap);

	/*
	 * Fill in pertinent fields of the clmap
	 */
	clmap->cl_ps = VSM_PS(*vsmap);
	clmap->cl_numpages = VSCLSIZE(vs);
	clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);

	/*
	 * Byte offset in paging segment is byte offset to cluster plus
	 * byte offset within cluster.  It looks ugly, but should be
	 * relatively quick.
	 */
	ASSERT(trunc_page(offset) == offset);
	newcl = ptoa(newcl) << vs->vs_clshift;
	newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
	if (flag == CL_ALLOC) {
		/*
		 * set bits in the allocation bitmap according to which
		 * pages were requested.  size is in bytes.
		 */
		i = atop(newoff);
		while ((size > 0) && (i < VSCLSIZE(vs))) {
			VSM_SETALLOC(*vsmap, i);
			i++;
			size -= vm_page_size;
		}
	}
	clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
	if (newoff) {
		/*
		 * Offset is not cluster aligned, so number of pages
		 * and bitmaps must be adjusted
		 */
		clmap->cl_numpages -= atop(newoff);
		CLMAP_SHIFT(clmap, vs);
		CLMAP_SHIFTALLOC(clmap, vs);
	}

	/*
	 *
	 * The setting of valid bits and handling of write errors
	 * must be done here, while we hold the lock on the map.
	 * It logically should be done in ps_vs_write_complete().
	 * The size and error information has been passed from
	 * ps_vs_write_complete().  If the size parameter is non-zero,
	 * then there is work to be done.  If error is also non-zero,
	 * then the error number is recorded in the cluster and the
	 * entire cluster is in error.
	 */
	if (size && flag == CL_FIND) {
		vm_offset_t off = (vm_offset_t) 0;

		if (!error) {
			/* Mark the written pages valid. */
			for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
			     i++) {
				VSM_SETPG(*vsmap, i);
				size -= vm_page_size;
			}
			ASSERT(i <= VSCLSIZE(vs));
		} else {
			/* Account the failed pageout and poison cluster. */
			BS_STAT(clmap->cl_ps->ps_bs,
				clmap->cl_ps->ps_bs->bs_pages_out_fail +=
				atop(size));
			off = VSM_CLOFF(*vsmap);
			VSM_SETERR(*vsmap, error);
		}
		/*
		 * Deallocate cluster if error, and no valid pages
		 * already present.
		 */
		if (off != (vm_offset_t) 0)
			ps_deallocate_cluster(clmap->cl_ps, off);
		VS_MAP_UNLOCK(vs);
		return (vm_offset_t) 0;
	} else
		VS_MAP_UNLOCK(vs);

	DEBUG(DEBUG_VS_INTERNAL,
	      ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
	       newcl+newoff, (int) vs, (int) vsmap, flag));
	DEBUG(DEBUG_VS_INTERNAL,
	      ("  clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
	       (int) clmap->cl_ps, clmap->cl_numpages,
	       (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));

	return (newcl + newoff);
}
1983
1984 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1985
/*
 * ps_clunmap: release the backing-store pages that map byte range
 * [offset, offset+length) of vstruct `vs'.  Page and allocation bits
 * are cleared per page; when a cluster's allocation bitmap becomes
 * empty its cluster is returned to its paging segment and the map
 * entry is cleared.
 */
void
ps_clunmap(
	vstruct_t	vs,
	vm_offset_t	offset,
	vm_size_t	length)
{
	vm_offset_t	cluster; /* The cluster number of offset */
	struct vs_map	*vsmap;

	VS_MAP_LOCK(vs);

	/*
	 * Loop through all clusters in this range, freeing paging segment
	 * clusters and map entries as encountered.
	 */
	while (length > 0) {
		vm_offset_t	newoff;
		int		i;

		cluster = atop(offset) >> vs->vs_clshift;
		if (vs->vs_indirect)	/* indirect map */
			vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
		else
			vsmap = vs->vs_dmap;
		if (vsmap == NULL) {
			/* Unallocated indirect block: nothing mapped here. */
			VS_MAP_UNLOCK(vs);
			return;
		}
		vsmap += cluster%CLMAP_ENTRIES;
		if (VSM_ISCLR(*vsmap)) {
			/* Empty entry: advance one page and retry. */
			length -= vm_page_size;
			offset += vm_page_size;
			continue;
		}
		/*
		 * We've got a valid mapping.  Clear it and deallocate
		 * paging segment cluster pages.
		 * Optimize for entire cluster cleraing.
		 */
		/* Intentional assignment-in-condition: newoff is the
		 * byte offset of `offset' within its cluster. */
		if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
			/*
			 * Not cluster aligned.
			 */
			ASSERT(trunc_page(newoff) == newoff);
			i = atop(newoff);
		} else
			i = 0;
		/* Clear page/alloc bits up to cluster end or range end. */
		while ((i < VSCLSIZE(vs)) && (length > 0)) {
			VSM_CLRPG(*vsmap, i);
			VSM_CLRALLOC(*vsmap, i);
			length -= vm_page_size;
			offset += vm_page_size;
			i++;
		}

		/*
		 * If map entry is empty, clear and deallocate cluster.
		 */
		if (!VSM_ALLOC(*vsmap)) {
			ps_deallocate_cluster(VSM_PS(*vsmap),
					      VSM_CLOFF(*vsmap));
			VSM_CLR(*vsmap);
		}
	}

	VS_MAP_UNLOCK(vs);
}
2053
2054 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2055
2056 void
2057 ps_vs_write_complete(
2058 vstruct_t vs,
2059 vm_offset_t offset,
2060 vm_size_t size,
2061 int error)
2062 {
2063 struct clmap clmap;
2064
2065 /*
2066 * Get the struct vsmap for this cluster.
2067 * Use READ, even though it was written, because the
2068 * cluster MUST be present, unless there was an error
2069 * in the original ps_clmap (e.g. no space), in which
2070 * case, nothing happens.
2071 *
2072 * Must pass enough information to ps_clmap to allow it
2073 * to set the vs_map structure bitmap under lock.
2074 */
2075 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2076 }
2077
2078 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2079
2080 void
2081 vs_cl_write_complete(
2082 vstruct_t vs,
2083 paging_segment_t ps,
2084 vm_offset_t offset,
2085 vm_offset_t addr,
2086 vm_size_t size,
2087 boolean_t async,
2088 int error)
2089 {
2090 kern_return_t kr;
2091
2092 if (error) {
2093 /*
2094 * For internal objects, the error is recorded on a
2095 * per-cluster basis by ps_clmap() which is called
2096 * by ps_vs_write_complete() below.
2097 */
2098 dprintf(("write failed error = 0x%x\n", error));
2099 /* add upl_abort code here */
2100 } else
2101 GSTAT(global_stats.gs_pages_out += atop(size));
2102 /*
2103 * Notify the vstruct mapping code, so it can do its accounting.
2104 */
2105 ps_vs_write_complete(vs, offset, size, error);
2106
2107 if (async) {
2108 VS_LOCK(vs);
2109 ASSERT(vs->vs_async_pending > 0);
2110 vs->vs_async_pending -= size;
2111 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2112 vs->vs_waiting_async = FALSE;
2113 VS_UNLOCK(vs);
2114 /* mutex_unlock(&vs->vs_waiting_async); */
2115 thread_wakeup(&vs->vs_async_pending);
2116 } else {
2117 VS_UNLOCK(vs);
2118 }
2119 }
2120 }
2121
2122 #ifdef DEVICE_PAGING
2123 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2124
/*
 * device_write_reply: completion handler invoked on the reply port of
 * an asynchronous device write.  Recovers the vs_async from the reply
 * port's alias (set up in vs_alloc_async), records the result, and
 * routes completion either through the transfer path (VSA_TRANSFER)
 * or the normal cluster-write completion path, then recycles the
 * vs_async.
 */
kern_return_t
device_write_reply(
	MACH_PORT_FACE	reply_port,
	kern_return_t	device_code,
	io_buf_len_t	bytes_written)
{
	struct vs_async	*vsa;

	/* The port's alias holds a vstruct_alias whose ->vs is our
	 * vs_async (see vs_alloc_async). */
	vsa = (struct vs_async *)
		((struct vstruct_alias *)(reply_port->alias))->vs;

	/* A short write is treated as a failure. */
	if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
		device_code = KERN_FAILURE;
	}

	vsa->vsa_error = device_code;


	ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
	if(vsa->vsa_flags & VSA_TRANSFER) {
		/* revisit when async disk segments redone */
		if(vsa->vsa_error) {
			/* need to consider error condition. re-write data or */
			/* throw it away here. */
			vm_offset_t	ioaddr;
			if(vm_map_copyout(kernel_map, &ioaddr,
				(vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
				panic("vs_cluster_write: unable to copy source list\n");
			vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
		}
		ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
				     vsa->vsa_size, vsa->vsa_error);
	} else {
		vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
				     vsa->vsa_addr, vsa->vsa_size, TRUE,
				     vsa->vsa_error);
	}
	VS_FREE_ASYNC(vsa);

	return KERN_SUCCESS;
}
2166
2167 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
/*
 * device_write_reply_inband: MIG-required stub.  The default pager
 * never issues inband device writes, so receiving this reply
 * indicates a protocol error -- hence the panic.
 */
kern_return_t
device_write_reply_inband(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	io_buf_len_t		bytes_written)
{
	panic("device_write_reply_inband: illegal");
	return KERN_SUCCESS;	/* not reached */
}
2177
2178 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
/*
 * device_read_reply: completion handler invoked on the reply port of
 * an asynchronous device read.  Stores the returned buffer, size and
 * status into the vs_async recovered from the reply port's alias,
 * then wakes the thread blocked in ps_read_device (which waits on
 * &vsa->vsa_lock after a MIG_NO_REPLY return).
 */
kern_return_t
device_read_reply(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	io_buf_ptr_t		data,
	mach_msg_type_number_t	dataCnt)
{
	struct vs_async	*vsa;
	vsa = (struct vs_async *)
		((struct vstruct_alias *)(reply_port->alias))->vs;
	vsa->vsa_addr = (vm_offset_t)data;
	vsa->vsa_size = (vm_size_t)dataCnt;
	vsa->vsa_error = return_code;
	thread_wakeup(&vsa->vsa_lock);
	return KERN_SUCCESS;
}
2195
2196 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
/*
 * device_read_reply_inband: MIG-required stub.  The default pager
 * never issues inband device reads, so receiving this reply indicates
 * a protocol error -- hence the panic.
 */
kern_return_t
device_read_reply_inband(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	io_buf_ptr_inband_t	data,
	mach_msg_type_number_t	dataCnt)
{
	panic("device_read_reply_inband: illegal");
	return KERN_SUCCESS;	/* not reached */
}
2207
2208 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
/*
 * device_read_reply_overwrite: MIG-required stub.  The default pager
 * never issues overwrite-style device reads, so receiving this reply
 * indicates a protocol error -- hence the panic.
 */
kern_return_t
device_read_reply_overwrite(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	io_buf_len_t		bytes_read)
{
	panic("device_read_reply_overwrite: illegal\n");
	return KERN_SUCCESS;	/* not reached */
}
2218
2219 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
/*
 * device_open_reply: MIG-required stub.  The default pager opens
 * devices synchronously, so receiving an asynchronous open reply
 * indicates a protocol error -- hence the panic.
 */
kern_return_t
device_open_reply(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	MACH_PORT_FACE		device_port)
{
	panic("device_open_reply: illegal\n");
	return KERN_SUCCESS;	/* not reached */
}
2229
2230 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2231
2232 kern_return_t
2233 ps_read_device(
2234 paging_segment_t ps,
2235 vm_offset_t offset,
2236 vm_offset_t *bufferp,
2237 unsigned int size,
2238 unsigned int *residualp,
2239 int flags)
2240 {
2241 kern_return_t kr;
2242 recnum_t dev_offset;
2243 unsigned int bytes_wanted;
2244 unsigned int bytes_read;
2245 unsigned int total_read;
2246 vm_offset_t dev_buffer;
2247 vm_offset_t buf_ptr;
2248 unsigned int records_read;
2249 struct vs_async *vsa;
2250 mutex_t vs_waiting_read_reply;
2251
2252 device_t device;
2253 vm_map_copy_t device_data = NULL;
2254 default_pager_thread_t *dpt = NULL;
2255
2256 device = dev_port_lookup(ps->ps_device);
2257 clustered_reads[atop(size)]++;
2258
2259 dev_offset = (ps->ps_offset +
2260 (offset >> (vm_page_shift - ps->ps_record_shift)));
2261 bytes_wanted = size;
2262 total_read = 0;
2263 *bufferp = (vm_offset_t)NULL;
2264
2265 do {
2266 vsa = VS_ALLOC_ASYNC();
2267 if (vsa) {
2268 vsa->vsa_vs = NULL;
2269 vsa->vsa_addr = 0;
2270 vsa->vsa_offset = 0;
2271 vsa->vsa_size = 0;
2272 vsa->vsa_ps = NULL;
2273 }
2274 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2275 ip_lock(vsa->reply_port);
2276 vsa->reply_port->ip_sorights++;
2277 ip_reference(vsa->reply_port);
2278 ip_unlock(vsa->reply_port);
2279 kr = ds_device_read_common(device,
2280 vsa->reply_port,
2281 (mach_msg_type_name_t)
2282 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2283 (dev_mode_t) 0,
2284 dev_offset,
2285 bytes_wanted,
2286 (IO_READ | IO_CALL),
2287 (io_buf_ptr_t *) &dev_buffer,
2288 (mach_msg_type_number_t *) &bytes_read);
2289 if(kr == MIG_NO_REPLY) {
2290 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2291 thread_block((void(*)(void))0);
2292
2293 dev_buffer = vsa->vsa_addr;
2294 bytes_read = (unsigned int)vsa->vsa_size;
2295 kr = vsa->vsa_error;
2296 }
2297 VS_FREE_ASYNC(vsa);
2298 if (kr != KERN_SUCCESS || bytes_read == 0) {
2299 break;
2300 }
2301 total_read += bytes_read;
2302
2303 /*
2304 * If we got the entire range, use the returned dev_buffer.
2305 */
2306 if (bytes_read == size) {
2307 *bufferp = (vm_offset_t)dev_buffer;
2308 break;
2309 }
2310
2311 #if 1
2312 dprintf(("read only %d bytes out of %d\n",
2313 bytes_read, bytes_wanted));
2314 #endif
2315 if(dpt == NULL) {
2316 dpt = get_read_buffer();
2317 buf_ptr = dpt->dpt_buffer;
2318 *bufferp = (vm_offset_t)buf_ptr;
2319 }
2320 /*
2321 * Otherwise, copy the data into the provided buffer (*bufferp)
2322 * and append the rest of the range as it comes in.
2323 */
2324 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2325 buf_ptr += bytes_read;
2326 bytes_wanted -= bytes_read;
2327 records_read = (bytes_read >>
2328 (vm_page_shift - ps->ps_record_shift));
2329 dev_offset += records_read;
2330 DEBUG(DEBUG_VS_INTERNAL,
2331 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2332 dev_buffer, bytes_read));
2333 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2334 != KERN_SUCCESS)
2335 Panic("dealloc buf");
2336 } while (bytes_wanted);
2337
2338 *residualp = size - total_read;
2339 if((dev_buffer != *bufferp) && (total_read != 0)) {
2340 vm_offset_t temp_buffer;
2341 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2342 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2343 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2344 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2345 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2346 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2347 (vm_map_copy_t *)&device_data, FALSE))
2348 panic("ps_read_device: cannot copyin locally provided buffer\n");
2349 }
2350 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2351 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2352 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2353 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2354 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2355 (vm_map_copy_t *)&device_data, FALSE))
2356 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2357 }
2358 else {
2359 device_data = NULL;
2360 }
2361 *bufferp = (vm_offset_t)device_data;
2362
2363 if(dpt != NULL) {
2364 /* Free the receive buffer */
2365 dpt->checked_out = 0;
2366 thread_wakeup(&dpt_array);
2367 }
2368 return KERN_SUCCESS;
2369 }
2370
2371 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2372
2373 kern_return_t
2374 ps_write_device(
2375 paging_segment_t ps,
2376 vm_offset_t offset,
2377 vm_offset_t addr,
2378 unsigned int size,
2379 struct vs_async *vsa)
2380 {
2381 recnum_t dev_offset;
2382 io_buf_len_t bytes_to_write, bytes_written;
2383 recnum_t records_written;
2384 kern_return_t kr;
2385 MACH_PORT_FACE reply_port;
2386
2387
2388
2389 clustered_writes[atop(size)]++;
2390
2391 dev_offset = (ps->ps_offset +
2392 (offset >> (vm_page_shift - ps->ps_record_shift)));
2393 bytes_to_write = size;
2394
2395 if (vsa) {
2396 /*
2397 * Asynchronous write.
2398 */
2399 reply_port = vsa->reply_port;
2400 ip_lock(reply_port);
2401 reply_port->ip_sorights++;
2402 ip_reference(reply_port);
2403 ip_unlock(reply_port);
2404 {
2405 device_t device;
2406 device = dev_port_lookup(ps->ps_device);
2407
2408 vsa->vsa_addr = addr;
2409 kr=ds_device_write_common(device,
2410 reply_port,
2411 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2412 (dev_mode_t) 0,
2413 dev_offset,
2414 (io_buf_ptr_t) addr,
2415 size,
2416 (IO_WRITE | IO_CALL),
2417 &bytes_written);
2418 }
2419 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2420 if (verbose)
2421 dprintf(("%s0x%x, addr=0x%x,"
2422 "size=0x%x,offset=0x%x\n",
2423 "device_write_request returned ",
2424 kr, addr, size, offset));
2425 BS_STAT(ps->ps_bs,
2426 ps->ps_bs->bs_pages_out_fail += atop(size));
2427 /* do the completion notification to free resources */
2428 device_write_reply(reply_port, kr, 0);
2429 return PAGER_ERROR;
2430 }
2431 } else do {
2432 /*
2433 * Synchronous write.
2434 */
2435 {
2436 device_t device;
2437 device = dev_port_lookup(ps->ps_device);
2438 kr=ds_device_write_common(device,
2439 IP_NULL, 0,
2440 (dev_mode_t) 0,
2441 dev_offset,
2442 (io_buf_ptr_t) addr,
2443 size,
2444 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2445 &bytes_written);
2446 }
2447 if (kr != KERN_SUCCESS) {
2448 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2449 "device_write returned ",
2450 kr, addr, size, offset));
2451 BS_STAT(ps->ps_bs,
2452 ps->ps_bs->bs_pages_out_fail += atop(size));
2453 return PAGER_ERROR;
2454 }
2455 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2456 Panic("fragmented write");
2457 records_written = (bytes_written >>
2458 (vm_page_shift - ps->ps_record_shift));
2459 dev_offset += records_written;
2460 #if 1
2461 if (bytes_written != bytes_to_write) {
2462 dprintf(("wrote only %d bytes out of %d\n",
2463 bytes_written, bytes_to_write));
2464 }
2465 #endif
2466 bytes_to_write -= bytes_written;
2467 addr += bytes_written;
2468 } while (bytes_to_write > 0);
2469
2470 return PAGER_SUCCESS;
2471 }
2472
2473
2474 #else /* !DEVICE_PAGING */
2475
2476 kern_return_t
2477 ps_read_device(
2478 paging_segment_t ps,
2479 vm_offset_t offset,
2480 vm_offset_t *bufferp,
2481 unsigned int size,
2482 unsigned int *residualp,
2483 int flags)
2484 {
2485 panic("ps_read_device not supported");
2486 }
2487
2488 ps_write_device(
2489 paging_segment_t ps,
2490 vm_offset_t offset,
2491 vm_offset_t addr,
2492 unsigned int size,
2493 struct vs_async *vsa)
2494 {
2495 panic("ps_write_device not supported");
2496 }
2497
2498 #endif /* DEVICE_PAGING */
void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t);	/* forward */

/*
 * Account for "size" bytes of page-in data having been supplied to the
 * VM for this vstruct: bump the global pages-in statistic and, when
 * pages are kept precious, clean up the cluster map for the range.
 */
void
pvs_object_data_provided(
	vstruct_t	vs,
	upl_t		upl,
	vm_offset_t	offset,
	vm_size_t	size)
{

	DEBUG(DEBUG_VS_INTERNAL,
	      ("buffer=0x%x,offset=0x%x,size=0x%x\n",
	       upl, offset, size));

	ASSERT(size > 0);
	GSTAT(global_stats.gs_pages_in += atop(size));


#if	USE_PRECIOUS
	/*
	 * NOTE(review): presumably releases the backing-store mapping for
	 * the range now that the in-memory (precious) copy is authoritative
	 * -- confirm against ps_clunmap's definition.
	 */
	ps_clunmap(vs, offset, size);
#endif	/* USE_PRECIOUS */

}
2522
2523 kern_return_t
2524 pvs_cluster_read(
2525 vstruct_t vs,
2526 vm_offset_t vs_offset,
2527 vm_size_t cnt)
2528 {
2529 upl_t upl;
2530 kern_return_t error = KERN_SUCCESS;
2531 int size;
2532 unsigned int residual;
2533 unsigned int request_flags;
2534 int seg_index;
2535 int pages_in_cl;
2536 int cl_size;
2537 int cl_mask;
2538 int cl_index;
2539 int xfer_size;
2540 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2541 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2542 struct clmap clmap;
2543
2544 pages_in_cl = 1 << vs->vs_clshift;
2545 cl_size = pages_in_cl * vm_page_size;
2546 cl_mask = cl_size - 1;
2547
2548 /*
2549 * This loop will be executed multiple times until the entire
2550 * request has been satisfied... if the request spans cluster
2551 * boundaries, the clusters will be checked for logical continunity,
2552 * if contiguous the I/O request will span multiple clusters, otherwise
2553 * it will be broken up into the minimal set of I/O's
2554 *
2555 * If there are holes in a request (either unallocated pages in a paging
2556 * segment or an unallocated paging segment), we stop
2557 * reading at the hole, inform the VM of any data read, inform
2558 * the VM of an unavailable range, then loop again, hoping to
2559 * find valid pages later in the requested range. This continues until
2560 * the entire range has been examined, and read, if present.
2561 */
2562
2563 #if USE_PRECIOUS
2564 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS;
2565 #else
2566 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE ;
2567 #endif
2568 while (cnt && (error == KERN_SUCCESS)) {
2569 int ps_info_valid;
2570 int page_list_count;
2571
2572 if (cnt > VM_SUPER_CLUSTER)
2573 size = VM_SUPER_CLUSTER;
2574 else
2575 size = cnt;
2576 cnt -= size;
2577
2578 ps_info_valid = 0;
2579 seg_index = 0;
2580
2581 while (size > 0 && error == KERN_SUCCESS) {
2582 int abort_size;
2583 int failed_size;
2584 int beg_pseg;
2585 int beg_indx;
2586 vm_offset_t cur_offset;
2587
2588
2589 if ( !ps_info_valid) {
2590 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2591 psp[seg_index] = CLMAP_PS(clmap);
2592 ps_info_valid = 1;
2593 }
2594 /*
2595 * skip over unallocated physical segments
2596 */
2597 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2598 abort_size = cl_size - (vs_offset & cl_mask);
2599 abort_size = MIN(abort_size, size);
2600
2601 page_list_count = 0;
2602 memory_object_super_upl_request(
2603 vs->vs_control,
2604 (memory_object_offset_t)vs_offset,
2605 abort_size, abort_size,
2606 &upl, NULL, &page_list_count,
2607 request_flags);
2608
2609 if (clmap.cl_error) {
2610 upl_abort(upl, UPL_ABORT_ERROR);
2611 } else {
2612 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2613 }
2614 upl_deallocate(upl);
2615
2616 size -= abort_size;
2617 vs_offset += abort_size;
2618
2619 seg_index++;
2620 ps_info_valid = 0;
2621 continue;
2622 }
2623 cl_index = (vs_offset & cl_mask) / vm_page_size;
2624
2625 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2626 /*
2627 * skip over unallocated pages
2628 */
2629 if (CLMAP_ISSET(clmap, cl_index))
2630 break;
2631 abort_size += vm_page_size;
2632 }
2633 if (abort_size) {
2634 /*
2635 * Let VM system know about holes in clusters.
2636 */
2637 GSTAT(global_stats.gs_pages_unavail += atop(abort_size));
2638
2639 page_list_count = 0;
2640 memory_object_super_upl_request(
2641 vs->vs_control,
2642 (memory_object_offset_t)vs_offset,
2643 abort_size, abort_size,
2644 &upl, NULL, &page_list_count,
2645 request_flags);
2646
2647 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2648 upl_deallocate(upl);
2649
2650 size -= abort_size;
2651 vs_offset += abort_size;
2652
2653 if (cl_index == pages_in_cl) {
2654 /*
2655 * if we're at the end of this physical cluster
2656 * then bump to the next one and continue looking
2657 */
2658 seg_index++;
2659 ps_info_valid = 0;
2660 continue;
2661 }
2662 if (size == 0)
2663 break;
2664 }
2665 /*
2666 * remember the starting point of the first allocated page
2667 * for the I/O we're about to issue
2668 */
2669 beg_pseg = seg_index;
2670 beg_indx = cl_index;
2671 cur_offset = vs_offset;
2672
2673 /*
2674 * calculate the size of the I/O that we can do...
2675 * this may span multiple physical segments if
2676 * they are contiguous
2677 */
2678 for (xfer_size = 0; xfer_size < size; ) {
2679
2680 while (cl_index < pages_in_cl && xfer_size < size) {
2681 /*
2682 * accumulate allocated pages within a physical segment
2683 */
2684 if (CLMAP_ISSET(clmap, cl_index)) {
2685 xfer_size += vm_page_size;
2686 cur_offset += vm_page_size;
2687 cl_index++;
2688
2689 BS_STAT(psp[seg_index]->ps_bs,
2690 psp[seg_index]->ps_bs->bs_pages_in++);
2691 } else
2692 break;
2693 }
2694 if (cl_index < pages_in_cl || xfer_size >= size) {
2695 /*
2696 * we've hit an unallocated page or the
2697 * end of this request... go fire the I/O
2698 */
2699 break;
2700 }
2701 /*
2702 * we've hit the end of the current physical segment
2703 * and there's more to do, so try moving to the next one
2704 */
2705 seg_index++;
2706
2707 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2708 psp[seg_index] = CLMAP_PS(clmap);
2709 ps_info_valid = 1;
2710
2711 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2712 /*
2713 * if the physical segment we're about to step into
2714 * is not contiguous to the one we're currently
2715 * in, or it's in a different paging file, or
2716 * it hasn't been allocated....
2717 * we stop here and generate the I/O
2718 */
2719 break;
2720 }
2721 /*
2722 * start with first page of the next physical segment
2723 */
2724 cl_index = 0;
2725 }
2726 if (xfer_size) {
2727 /*
2728 * we have a contiguous range of allocated pages
2729 * to read from
2730 */
2731 page_list_count = 0;
2732 memory_object_super_upl_request(vs->vs_control,
2733 (memory_object_offset_t)vs_offset,
2734 xfer_size, xfer_size,
2735 &upl, NULL, &page_list_count,
2736 request_flags | UPL_SET_INTERNAL);
2737
2738 error = ps_read_file(psp[beg_pseg], upl, (vm_offset_t) 0,
2739 ps_offset[beg_pseg] + (beg_indx * vm_page_size), xfer_size, &residual, 0);
2740 } else
2741 continue;
2742
2743 failed_size = 0;
2744
2745 /*
2746 * Adjust counts and send response to VM. Optimize for the
2747 * common case, i.e. no error and/or partial data.
2748 * If there was an error, then we need to error the entire
2749 * range, even if some data was successfully read.
2750 * If there was a partial read we may supply some
2751 * data and may error some as well. In all cases the
2752 * VM must receive some notification for every page in the
2753 * range.
2754 */
2755 if ((error == KERN_SUCCESS) && (residual == 0)) {
2756 /*
2757 * Got everything we asked for, supply the data to
2758 * the VM. Note that as a side effect of supplying
2759 * the data, the buffer holding the supplied data is
2760 * deallocated from the pager's address space.
2761 */
2762 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
2763 } else {
2764 failed_size = xfer_size;
2765
2766 if (error == KERN_SUCCESS) {
2767 if (residual == xfer_size) {
2768 /*
2769 * If a read operation returns no error
2770 * and no data moved, we turn it into
2771 * an error, assuming we're reading at
2772 * or beyong EOF.
2773 * Fall through and error the entire
2774 * range.
2775 */
2776 error = KERN_FAILURE;
2777 } else {
2778 /*
2779 * Otherwise, we have partial read. If
2780 * the part read is a integral number
2781 * of pages supply it. Otherwise round
2782 * it up to a page boundary, zero fill
2783 * the unread part, and supply it.
2784 * Fall through and error the remainder
2785 * of the range, if any.
2786 */
2787 int fill, lsize;
2788
2789 fill = residual & ~vm_page_size;
2790 lsize = (xfer_size - residual) + fill;
2791 pvs_object_data_provided(vs, upl, vs_offset, lsize);
2792
2793 if (lsize < xfer_size) {
2794 failed_size = xfer_size - lsize;
2795 error = KERN_FAILURE;
2796 }
2797 }
2798 }
2799 }
2800 /*
2801 * If there was an error in any part of the range, tell
2802 * the VM. Note that error is explicitly checked again since
2803 * it can be modified above.
2804 */
2805 if (error != KERN_SUCCESS) {
2806 BS_STAT(psp[beg_pseg]->ps_bs,
2807 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop(failed_size));
2808 }
2809 size -= xfer_size;
2810 vs_offset += xfer_size;
2811 }
2812
2813 } /* END while (cnt && (error == 0)) */
2814 return error;
2815 }
2816
/*
 * NOTE(review): presumably a tunable selecting asynchronous cluster
 * writes; its consumers are outside this chunk -- confirm before use.
 */
int vs_do_async_write = 1;
2818
2819 kern_return_t
2820 vs_cluster_write(
2821 vstruct_t vs,
2822 upl_t internal_upl,
2823 vm_offset_t offset,
2824 vm_size_t cnt,
2825 boolean_t dp_internal,
2826 int flags)
2827 {
2828 vm_offset_t size;
2829 vm_offset_t transfer_size;
2830 int error = 0;
2831 struct clmap clmap;
2832
2833 vm_offset_t actual_offset; /* Offset within paging segment */
2834 paging_segment_t ps;
2835 vm_offset_t subx_size;
2836 vm_offset_t mobj_base_addr;
2837 vm_offset_t mobj_target_addr;
2838 int mobj_size;
2839
2840 struct vs_async *vsa;
2841 vm_map_copy_t copy;
2842
2843 upl_t upl;
2844 upl_page_info_t *pl;
2845 int page_index;
2846 int list_size;
2847 int cl_size;
2848
2849 if (!dp_internal) {
2850 int page_list_count;
2851 int request_flags;
2852 int super_size;
2853 int first_dirty;
2854 int num_dirty;
2855 int num_of_pages;
2856 int seg_index;
2857 int pages_in_cl;
2858 int must_abort;
2859 vm_offset_t upl_offset;
2860 vm_offset_t seg_offset;
2861 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2862 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2863
2864
2865 pages_in_cl = 1 << vs->vs_clshift;
2866 cl_size = pages_in_cl * vm_page_size;
2867
2868 if (bs_low) {
2869 super_size = cl_size;
2870
2871 request_flags = UPL_NOBLOCK |
2872 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2873 UPL_NO_SYNC | UPL_SET_INTERNAL;
2874 } else {
2875 super_size = VM_SUPER_CLUSTER;
2876
2877 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2878 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2879 UPL_NO_SYNC | UPL_SET_INTERNAL;
2880 }
2881
2882 page_list_count = 0;
2883 memory_object_super_upl_request(vs->vs_control,
2884 (memory_object_offset_t)offset,
2885 cnt, super_size,
2886 &upl, NULL, &page_list_count,
2887 request_flags | UPL_PAGEOUT);
2888
2889 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2890
2891 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
2892
2893 ps_offset[seg_index] = ps_clmap(vs, upl->offset + (seg_index * cl_size),
2894 &clmap, CL_ALLOC,
2895 transfer_size < cl_size ?
2896 transfer_size : cl_size, 0);
2897
2898 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2899 upl_abort(upl, 0);
2900 upl_deallocate(upl);
2901
2902 return KERN_FAILURE;
2903
2904 }
2905 psp[seg_index] = CLMAP_PS(clmap);
2906
2907 if (transfer_size > cl_size) {
2908 transfer_size -= cl_size;
2909 seg_index++;
2910 } else
2911 transfer_size = 0;
2912 }
2913 for (page_index = 0, num_of_pages = upl->size / vm_page_size; page_index < num_of_pages; ) {
2914 /*
2915 * skip over non-dirty pages
2916 */
2917 for ( ; page_index < num_of_pages; page_index++) {
2918 if (UPL_DIRTY_PAGE(pl, page_index) || UPL_PRECIOUS_PAGE(pl, page_index))
2919 /*
2920 * this is a page we need to write
2921 * go see if we can buddy it up with others
2922 * that are contiguous to it
2923 */
2924 break;
2925 /*
2926 * if the page is not-dirty, but present we need to commit it...
2927 * this is an unusual case since we only asked for dirty pages
2928 */
2929 if (UPL_PAGE_PRESENT(pl, page_index)) {
2930 boolean_t empty = FALSE;
2931 upl_commit_range(upl,
2932 page_index * vm_page_size,
2933 vm_page_size,
2934 UPL_COMMIT_NOTIFY_EMPTY,
2935 pl,
2936 page_list_count,
2937 &empty);
2938 if (empty)
2939 upl_deallocate(upl);
2940 }
2941 }
2942 if (page_index == num_of_pages)
2943 /*
2944 * no more pages to look at, we're out of here
2945 */
2946 break;
2947
2948 /*
2949 * gather up contiguous dirty pages... we have at least 1
2950 * otherwise we would have bailed above
2951 * make sure that each physical segment that we step
2952 * into is contiguous to the one we're currently in
2953 * if it's not, we have to stop and write what we have
2954 */
2955 for (first_dirty = page_index; page_index < num_of_pages; ) {
2956 if ( !UPL_DIRTY_PAGE(pl, page_index) && !UPL_PRECIOUS_PAGE(pl, page_index))
2957 break;
2958 page_index++;
2959 /*
2960 * if we just looked at the last page in the UPL
2961 * we don't need to check for physical segment
2962 * continuity
2963 */
2964 if (page_index < num_of_pages) {
2965 int cur_seg;
2966 int nxt_seg;
2967
2968 cur_seg = (page_index - 1) / pages_in_cl;
2969 nxt_seg = page_index / pages_in_cl;
2970
2971 if (cur_seg != nxt_seg) {
2972 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
2973 /*
2974 * if the segment we're about to step into
2975 * is not contiguous to the one we're currently
2976 * in, or it's in a different paging file....
2977 * we stop here and generate the I/O
2978 */
2979 break;
2980 }
2981 }
2982 }
2983 num_dirty = page_index - first_dirty;
2984 must_abort = 1;
2985
2986 if (num_dirty) {
2987 upl_offset = first_dirty * vm_page_size;
2988 seg_index = first_dirty / pages_in_cl;
2989 seg_offset = upl_offset - (seg_index * cl_size);
2990 transfer_size = num_dirty * vm_page_size;
2991
2992 error = ps_write_file(psp[seg_index], upl, upl_offset,
2993 ps_offset[seg_index] + seg_offset, transfer_size, flags);
2994
2995 if (error == 0) {
2996 while (transfer_size) {
2997 int seg_size;
2998
2999 if ((seg_size = cl_size - (upl_offset % cl_size)) > transfer_size)
3000 seg_size = transfer_size;
3001
3002 ps_vs_write_complete(vs, upl->offset + upl_offset, seg_size, error);
3003
3004 transfer_size -= seg_size;
3005 upl_offset += seg_size;
3006 }
3007 must_abort = 0;
3008 }
3009 }
3010 if (must_abort) {
3011 boolean_t empty = FALSE;
3012 upl_abort_range(upl,
3013 first_dirty * vm_page_size,
3014 num_dirty * vm_page_size,
3015 UPL_ABORT_NOTIFY_EMPTY,
3016 &empty);
3017 if (empty)
3018 upl_deallocate(upl);
3019 }
3020 }
3021
3022 } else {
3023 assert(cnt <= (vm_page_size << vs->vs_clshift));
3024 list_size = cnt;
3025
3026 page_index = 0;
3027 /* The caller provides a mapped_data which is derived */
3028 /* from a temporary object. The targeted pages are */
3029 /* guaranteed to be set at offset 0 in the mapped_data */
3030 /* The actual offset however must still be derived */
3031 /* from the offset in the vs in question */
3032 mobj_base_addr = offset;
3033 mobj_target_addr = mobj_base_addr;
3034
3035 for (transfer_size = list_size; transfer_size != 0;) {
3036 actual_offset = ps_clmap(vs, mobj_target_addr,
3037 &clmap, CL_ALLOC,
3038 transfer_size < cl_size ?
3039 transfer_size : cl_size, 0);
3040 if(actual_offset == (vm_offset_t) -1) {
3041 error = 1;
3042 break;
3043 }
3044 cnt = MIN(transfer_size,
3045 CLMAP_NPGS(clmap) * vm_page_size);
3046 ps = CLMAP_PS(clmap);
3047 /* Assume that the caller has given us contiguous */
3048 /* pages */
3049 if(cnt) {
3050 error = ps_write_file(ps, internal_upl,
3051 0, actual_offset,
3052 cnt, flags);
3053 if (error)
3054 break;
3055 ps_vs_write_complete(vs, mobj_target_addr,
3056 cnt, error);
3057 }
3058 if (error)
3059 break;
3060 actual_offset += cnt;
3061 mobj_target_addr += cnt;
3062 transfer_size -= cnt;
3063 cnt = 0;
3064
3065 if (error)
3066 break;
3067 }
3068 }
3069 if(error)
3070 return KERN_FAILURE;
3071 else
3072 return KERN_SUCCESS;
3073 }
3074
3075 vm_size_t
3076 ps_vstruct_allocated_size(
3077 vstruct_t vs)
3078 {
3079 int num_pages;
3080 struct vs_map *vsmap;
3081 int i, j, k;
3082
3083 num_pages = 0;
3084 if (vs->vs_indirect) {
3085 /* loop on indirect maps */
3086 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3087 vsmap = vs->vs_imap[i];
3088 if (vsmap == NULL)
3089 continue;
3090 /* loop on clusters in this indirect map */
3091 for (j = 0; j < CLMAP_ENTRIES; j++) {
3092 if (VSM_ISCLR(vsmap[j]) ||
3093 VSM_ISERR(vsmap[j]))
3094 continue;
3095 /* loop on pages in this cluster */
3096 for (k = 0; k < VSCLSIZE(vs); k++) {
3097 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3098 num_pages++;
3099 }
3100 }
3101 }
3102 } else {
3103 vsmap = vs->vs_dmap;
3104 if (vsmap == NULL)
3105 return 0;
3106 /* loop on clusters in the direct map */
3107 for (j = 0; j < CLMAP_ENTRIES; j++) {
3108 if (VSM_ISCLR(vsmap[j]) ||
3109 VSM_ISERR(vsmap[j]))
3110 continue;
3111 /* loop on pages in this cluster */
3112 for (k = 0; k < VSCLSIZE(vs); k++) {
3113 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3114 num_pages++;
3115 }
3116 }
3117 }
3118
3119 return ptoa(num_pages);
3120 }
3121
/*
 * Record the object offset of every allocated page of "vs" into the
 * caller-supplied "pages" array (capacity "pages_size" entries) and
 * return the total number of allocated pages -- which may exceed
 * pages_size, letting the caller detect a too-small array.
 */
size_t
ps_vstruct_allocated_pages(
	vstruct_t		vs,
	default_pager_page_t	*pages,
	size_t			pages_size)
{
	int		num_pages;
	struct vs_map	*vsmap;
	vm_offset_t	offset;
	int		i, j, k;

	num_pages = 0;
	offset = 0;
	if (vs->vs_indirect) {
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL) {
				/* unallocated indirect block: skip its whole offset range */
				offset += (vm_page_size * CLMAP_ENTRIES *
					   VSCLSIZE(vs));
				continue;
			}
			/* loop on clusters in this indirect map */
			for (j = 0; j < CLMAP_ENTRIES; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j])) {
					offset += vm_page_size * VSCLSIZE(vs);
					continue;
				}
				/* loop on pages in this cluster */
				for (k = 0; k < VSCLSIZE(vs); k++) {
					if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
						num_pages++;
						/*
						 * NOTE(review): num_pages is incremented
						 * BEFORE this capacity check, so the last
						 * slot of pages[] (index pages_size - 1)
						 * is never written -- looks like an
						 * off-by-one ("<=" intended?).  Confirm
						 * against the callers before changing.
						 */
						if (num_pages < pages_size)
							pages++->dpp_offset =
								offset;
					}
					offset += vm_page_size;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		if (vsmap == NULL)
			return 0;
		/* loop on clusters in the direct map */
		for (j = 0; j < CLMAP_ENTRIES; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j])) {
				offset += vm_page_size * VSCLSIZE(vs);
				continue;
			}
			/* loop on pages in this cluster */
			for (k = 0; k < VSCLSIZE(vs); k++) {
				if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
					num_pages++;
					/* NOTE(review): same apparent off-by-one as above */
					if (num_pages < pages_size)
						pages++->dpp_offset = offset;
				}
				offset += vm_page_size;
			}
		}
	}

	return num_pages;
}
3188
3189
/*
 * Migrate every cluster of vstruct "vs" that currently resides on
 * paging segment "segment" to other backing store, one cluster at a
 * time, via vs_cluster_transfer.  Used when a segment is being taken
 * off-line.
 *
 * Exclusion protocol: vs_xfer_pending (a boolean, sufficient because
 * the caller guarantees single-file access) plus the writer/reader
 * waits below keep other parties off the vstruct around each cluster
 * transfer; between transfers the vstruct is released so other
 * readers/writers can make progress.  If the map changes shape
 * (direct <-> indirect) while released, restart from "vs_changed".
 *
 * Returns KERN_SUCCESS, or KERN_FAILURE if any cluster transfer fails.
 */
kern_return_t
ps_vstruct_transfer_from_segment(
	vstruct_t	vs,
	paging_segment_t	segment,
#ifndef ubc_sync_working
	vm_object_t	transfer_object)
#else
	upl_t		upl)
#endif
{
	struct vs_map	*vsmap;
	/* NOTE(review): old_vsmap/new_vsmap and k are unused in this function */
	struct vs_map	old_vsmap;
	struct vs_map	new_vsmap;
	int		i, j, k;

	VS_LOCK(vs);	/* block all work on this vstruct */
			/* can't allow the normal multiple write */
			/* semantic because writes may conflict */
	vs->vs_xfer_pending = TRUE;
	vs_wait_for_sync_writers(vs);
	vs_start_write(vs);
	vs_wait_for_readers(vs);
	/* we will unlock the vs to allow other writes while transferring */
	/* and will be guaranteed of the persistance of the vs struct */
	/* because the caller of ps_vstruct_transfer_from_segment bumped */
	/* vs_async_pending */
	/* OK we now have guaranteed no other parties are accessing this */
	/* vs.  Now that we are also supporting simple lock versions of */
	/* vs_lock we cannot hold onto VS_LOCK as we may block below. */
	/* our purpose in holding it before was the multiple write case */
	/* we now use the boolean xfer_pending to do that.  We can use */
	/* a boolean instead of a count because we have guaranteed single */
	/* file access to this code in its caller */
	VS_UNLOCK(vs);
vs_changed:
	if (vs->vs_indirect) {
		int	vsmap_size;
		int	clmap_off;
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL)
				continue;
			/* loop on clusters in this indirect map */
			clmap_off = (vm_page_size * CLMAP_ENTRIES *
				     VSCLSIZE(vs) * i);
			/* the final indirect block may be partially filled */
			if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
				vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
			else
				vsmap_size = CLMAP_ENTRIES;
			for (j = 0; j < vsmap_size; j++) {
				/* only clusters living on the target segment move */
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j]) ||
				    (VSM_PS(vsmap[j]) != segment))
					continue;
				if(vs_cluster_transfer(vs,
					 (vm_page_size * (j << vs->vs_clshift))
					 + clmap_off,
					 vm_page_size << vs->vs_clshift,
#ifndef ubc_sync_working
					 transfer_object)
#else
					 upl)
#endif
						!= KERN_SUCCESS) {
					VS_LOCK(vs);
					vs->vs_xfer_pending = FALSE;
					VS_UNLOCK(vs);
					vs_finish_write(vs);
					return KERN_FAILURE;
				}
				/* allow other readers/writers during transfer*/
				VS_LOCK(vs);
				vs->vs_xfer_pending = FALSE;
				VS_UNLOCK(vs);
				vs_finish_write(vs);
				/* re-acquire exclusion for the next cluster */
				VS_LOCK(vs);
				vs->vs_xfer_pending = TRUE;
				vs_wait_for_sync_writers(vs);
				vs_start_write(vs);
				vs_wait_for_readers(vs);
				VS_UNLOCK(vs);
				if (!(vs->vs_indirect)) {
					/* map collapsed to direct while we slept */
					goto vs_changed;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		if (vsmap == NULL) {
			VS_LOCK(vs);
			vs->vs_xfer_pending = FALSE;
			VS_UNLOCK(vs);
			vs_finish_write(vs);
			return KERN_SUCCESS;
		}
		/* loop on clusters in the direct map */
		for (j = 0; j < vs->vs_size; j++) {
			/* only clusters living on the target segment move */
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j]) ||
			    (VSM_PS(vsmap[j]) != segment))
				continue;
			if(vs_cluster_transfer(vs,
				  vm_page_size * (j << vs->vs_clshift),
				  vm_page_size << vs->vs_clshift,
#ifndef ubc_sync_working
				  transfer_object) != KERN_SUCCESS) {
#else
				  upl) != KERN_SUCCESS) {
#endif
				VS_LOCK(vs);
				vs->vs_xfer_pending = FALSE;
				VS_UNLOCK(vs);
				vs_finish_write(vs);
				return KERN_FAILURE;
			}
			/* allow other readers/writers during transfer*/
			VS_LOCK(vs);
			vs->vs_xfer_pending = FALSE;
			VS_UNLOCK(vs);
			vs_finish_write(vs);
			VS_LOCK(vs);
			vs->vs_xfer_pending = TRUE;
			VS_UNLOCK(vs);
			/*
			 * NOTE(review): here the waits run AFTER VS_UNLOCK,
			 * whereas the indirect branch above waits before
			 * unlocking -- the orderings differ between the two
			 * branches; confirm which is intended.
			 */
			vs_wait_for_sync_writers(vs);
			vs_start_write(vs);
			vs_wait_for_readers(vs);
			if (vs->vs_indirect) {
				/* map grew to indirect while we slept */
				goto vs_changed;
			}
		}
	}

	VS_LOCK(vs);
	vs->vs_xfer_pending = FALSE;
	VS_UNLOCK(vs);
	vs_finish_write(vs);
	return KERN_SUCCESS;
}
3329
3330
3331
3332 vs_map_t
3333 vs_get_map_entry(
3334 vstruct_t vs,
3335 vm_offset_t offset)
3336 {
3337 struct vs_map *vsmap;
3338 vm_offset_t cluster;
3339
3340 cluster = atop(offset) >> vs->vs_clshift;
3341 if (vs->vs_indirect) {
3342 long ind_block = cluster/CLMAP_ENTRIES;
3343
3344 /* Is the indirect block allocated? */
3345 vsmap = vs->vs_imap[ind_block];
3346 if(vsmap == (vs_map_t) NULL)
3347 return vsmap;
3348 } else
3349 vsmap = vs->vs_dmap;
3350 vsmap += cluster%CLMAP_ENTRIES;
3351 return vsmap;
3352 }
3353
3354 kern_return_t
3355 vs_cluster_transfer(
3356 vstruct_t vs,
3357 vm_offset_t offset,
3358 vm_size_t cnt,
3359 #ifndef ubc_sync_working
3360 vm_object_t transfer_object)
3361 #else
3362 upl_t upl)
3363 #endif
3364 {
3365 vm_offset_t actual_offset;
3366 paging_segment_t ps;
3367 struct clmap clmap;
3368 kern_return_t error = KERN_SUCCESS;
3369 int size, size_wanted, i;
3370 unsigned int residual;
3371 int unavail_size;
3372 default_pager_thread_t *dpt;
3373 boolean_t dealloc;
3374 struct vs_map *vsmap_ptr;
3375 struct vs_map read_vsmap;
3376 struct vs_map original_read_vsmap;
3377 struct vs_map write_vsmap;
3378 upl_t sync_upl;
3379 #ifndef ubc_sync_working
3380 upl_t upl;
3381 #endif
3382
3383 vm_offset_t ioaddr;
3384
3385 /* vs_cluster_transfer reads in the pages of a cluster and
3386 * then writes these pages back to new backing store. The
3387 * segment the pages are being read from is assumed to have
3388 * been taken off-line and is no longer considered for new
3389 * space requests.
3390 */
3391
3392 /*
3393 * This loop will be executed once per cluster referenced.
3394 * Typically this means once, since it's unlikely that the
3395 * VM system will ask for anything spanning cluster boundaries.
3396 *
3397 * If there are holes in a cluster (in a paging segment), we stop
3398 * reading at the hole, then loop again, hoping to
3399 * find valid pages later in the cluster. This continues until
3400 * the entire range has been examined, and read, if present. The
3401 * pages are written as they are read. If a failure occurs after
3402 * some pages are written the unmap call at the bottom of the loop
3403 * recovers the backing store and the old backing store remains
3404 * in effect.
3405 */
3406
3407 VSM_CLR(write_vsmap);
3408 VSM_CLR(original_read_vsmap);
3409 /* grab the actual object's pages to sync with I/O */
3410 while (cnt && (error == KERN_SUCCESS)) {
3411 vsmap_ptr = vs_get_map_entry(vs, offset);
3412 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3413
3414 if (actual_offset == (vm_offset_t) -1) {
3415
3416 /*
3417 * Nothing left to write in this cluster at least
3418 * set write cluster information for any previous
3419 * write, clear for next cluster, if there is one
3420 */
3421 unsigned int local_size, clmask, clsize;
3422
3423 clsize = vm_page_size << vs->vs_clshift;
3424 clmask = clsize - 1;
3425 local_size = clsize - (offset & clmask);
3426 ASSERT(local_size);
3427 local_size = MIN(local_size, cnt);
3428
3429 /* This cluster has no data in it beyond what may */
3430 /* have been found on a previous iteration through */
3431 /* the loop "write_vsmap" */
3432 *vsmap_ptr = write_vsmap;
3433 VSM_CLR(write_vsmap);
3434 VSM_CLR(original_read_vsmap);
3435
3436 cnt -= local_size;
3437 offset += local_size;
3438 continue;
3439 }
3440
3441 /*
3442 * Count up contiguous available or unavailable
3443 * pages.
3444 */
3445 ps = CLMAP_PS(clmap);
3446 ASSERT(ps);
3447 size = 0;
3448 unavail_size = 0;
3449 for (i = 0;
3450 (size < cnt) && (unavail_size < cnt) &&
3451 (i < CLMAP_NPGS(clmap)); i++) {
3452 if (CLMAP_ISSET(clmap, i)) {
3453 if (unavail_size != 0)
3454 break;
3455 size += vm_page_size;
3456 BS_STAT(ps->ps_bs,
3457 ps->ps_bs->bs_pages_in++);
3458 } else {
3459 if (size != 0)
3460 break;
3461 unavail_size += vm_page_size;
3462 }
3463 }
3464
3465 if (size == 0) {
3466 ASSERT(unavail_size);
3467 cnt -= unavail_size;
3468 offset += unavail_size;
3469 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3470 == 0) {
3471 /* There is no more to transfer in this
3472 cluster
3473 */
3474 *vsmap_ptr = write_vsmap;
3475 VSM_CLR(write_vsmap);
3476 VSM_CLR(original_read_vsmap);
3477 }
3478 continue;
3479 }
3480
3481 if(VSM_ISCLR(original_read_vsmap))
3482 original_read_vsmap = *vsmap_ptr;
3483
3484 if(ps->ps_segtype == PS_PARTITION) {
3485 /*
3486 NEED TO BE WITH SYNC & NO COMMIT
3487 error = ps_read_device(ps, actual_offset, &buffer,
3488 size, &residual, flags);
3489 */
3490 } else {
3491 #ifndef ubc_sync_working
3492 int page_list_count = 0;
3493
3494 error = vm_object_upl_request(transfer_object,
3495 (vm_object_offset_t) (actual_offset & ((vm_page_size << vs->vs_clshift) - 1)),
3496 size, &upl, NULL, &page_list_count,
3497 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
3498 | UPL_SET_INTERNAL);
3499 if (error == KERN_SUCCESS) {
3500 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
3501 size, &residual, 0);
3502 }
3503
3504 #else
3505 /* NEED TO BE WITH SYNC & NO COMMIT & NO RDAHEAD*/
3506 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
3507 size, &residual,
3508 (UPL_IOSYNC | UPL_NOCOMMIT | UPL_NORDAHEAD));
3509 #endif
3510 }
3511
3512 read_vsmap = *vsmap_ptr;
3513
3514
3515 /*
3516 * Adjust counts and put data in new BS. Optimize for the
3517 * common case, i.e. no error and/or partial data.
3518 * If there was an error, then we need to error the entire
3519 * range, even if some data was successfully read.
3520 *
3521 */
3522 if ((error == KERN_SUCCESS) && (residual == 0)) {
3523 int page_list_count = 0;
3524
3525 /*
3526 * Got everything we asked for, supply the data to
3527 * the new BS. Note that as a side effect of supplying
3528 * the data, the buffer holding the supplied data is
3529 * deallocated from the pager's address space unless
3530 * the write is unsuccessful.
3531 */
3532
3533 /* note buffer will be cleaned up in all cases by */
3534 /* internal_cluster_write or if an error on write */
3535 /* the vm_map_copy_page_discard call */
3536 *vsmap_ptr = write_vsmap;
3537
3538 #ifndef ubc_sync_working
3539 error = vm_object_upl_request(transfer_object,
3540 (vm_object_offset_t)
3541 (actual_offset & ((vm_page_size << vs->vs_clshift) - 1)),
3542 size, &upl, NULL, &page_list_count,
3543 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL);
3544 if(vs_cluster_write(vs, upl, offset,
3545 size, TRUE, 0) != KERN_SUCCESS) {
3546 upl_commit(upl, NULL);
3547 upl_deallocate(upl);
3548 #else
3549 if(vs_cluster_write(vs, upl, offset,
3550 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3551 #endif
3552 error = KERN_FAILURE;
3553 if(!(VSM_ISCLR(*vsmap_ptr))) {
3554 /* unmap the new backing store object */
3555 ps_clunmap(vs, offset, size);
3556 }
3557 /* original vsmap */
3558 *vsmap_ptr = original_read_vsmap;
3559 VSM_CLR(write_vsmap);
3560 } else {
3561 if((offset + size) &
3562 ((vm_page_size << vs->vs_clshift)
3563 - 1)) {
3564 /* There is more to transfer in this
3565 cluster
3566 */
3567 write_vsmap = *vsmap_ptr;
3568 *vsmap_ptr = read_vsmap;
3569 } else {
3570 /* discard the old backing object */
3571 write_vsmap = *vsmap_ptr;
3572 *vsmap_ptr = read_vsmap;
3573 ps_clunmap(vs, offset, size);
3574 *vsmap_ptr = write_vsmap;
3575 VSM_CLR(write_vsmap);
3576 VSM_CLR(original_read_vsmap);
3577 }
3578 }
3579 } else {
3580 size_wanted = size;
3581 if (error == KERN_SUCCESS) {
3582 if (residual == size) {
3583 /*
3584 * If a read operation returns no error
3585 * and no data moved, we turn it into
3586 * an error, assuming we're reading at
3587 * or beyond EOF.
3588 * Fall through and error the entire
3589 * range.
3590 */
3591 error = KERN_FAILURE;
3592 *vsmap_ptr = write_vsmap;
3593 if(!(VSM_ISCLR(*vsmap_ptr))) {
3594 /* unmap the new backing store object */
3595 ps_clunmap(vs, offset, size);
3596 }
3597 *vsmap_ptr = original_read_vsmap;
3598 VSM_CLR(write_vsmap);
3599 continue;
3600 } else {
3601 /*
3602 * Otherwise, we have partial read.
3603 * This is also considered an error
3604 * for the purposes of cluster transfer
3605 */
3606 error = KERN_FAILURE;
3607 *vsmap_ptr = write_vsmap;
3608 if(!(VSM_ISCLR(*vsmap_ptr))) {
3609 /* unmap the new backing store object */
3610 ps_clunmap(vs, offset, size);
3611 }
3612 *vsmap_ptr = original_read_vsmap;
3613 VSM_CLR(write_vsmap);
3614 continue;
3615 }
3616 }
3617
3618 }
3619 cnt -= size;
3620 offset += size;
3621
3622 } /* END while (cnt && (error == 0)) */
3623 if(!VSM_ISCLR(write_vsmap))
3624 *vsmap_ptr = write_vsmap;
3625
3626 return error;
3627 }
3628
3629 kern_return_t
3630 default_pager_add_file(MACH_PORT_FACE backing_store,
3631 int *vp,
3632 int record_size,
3633 long size)
3634 {
3635 backing_store_t bs;
3636 paging_segment_t ps;
3637 int i;
3638 int error;
3639
3640 if ((bs = backing_store_lookup(backing_store))
3641 == BACKING_STORE_NULL)
3642 return KERN_INVALID_ARGUMENT;
3643
3644 PSL_LOCK();
3645 for (i = 0; i <= paging_segment_max; i++) {
3646 ps = paging_segments[i];
3647 if (ps == PAGING_SEGMENT_NULL)
3648 continue;
3649 if (ps->ps_segtype != PS_FILE)
3650 continue;
3651
3652 /*
3653 * Check for overlap on same device.
3654 */
3655 if (ps->ps_vnode == (struct vnode *)vp) {
3656 PSL_UNLOCK();
3657 BS_UNLOCK(bs);
3658 return KERN_INVALID_ARGUMENT;
3659 }
3660 }
3661 PSL_UNLOCK();
3662
3663 /*
3664 * Set up the paging segment
3665 */
3666 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3667 if (ps == PAGING_SEGMENT_NULL) {
3668 BS_UNLOCK(bs);
3669 return KERN_RESOURCE_SHORTAGE;
3670 }
3671
3672 ps->ps_segtype = PS_FILE;
3673 ps->ps_vnode = (struct vnode *)vp;
3674 ps->ps_offset = 0;
3675 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3676 ps->ps_recnum = size;
3677 ps->ps_pgnum = size >> ps->ps_record_shift;
3678
3679 ps->ps_pgcount = ps->ps_pgnum;
3680 ps->ps_clshift = local_log2(bs->bs_clsize);
3681 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3682 ps->ps_hint = 0;
3683
3684 PS_LOCK_INIT(ps);
3685 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3686 if (!ps->ps_bmap) {
3687 kfree((vm_offset_t)ps, sizeof *ps);
3688 BS_UNLOCK(bs);
3689 return KERN_RESOURCE_SHORTAGE;
3690 }
3691 for (i = 0; i < ps->ps_ncls; i++) {
3692 clrbit(ps->ps_bmap, i);
3693 }
3694
3695 ps->ps_going_away = FALSE;
3696 ps->ps_bs = bs;
3697
3698 if ((error = ps_enter(ps)) != 0) {
3699 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3700 kfree((vm_offset_t)ps, sizeof *ps);
3701 BS_UNLOCK(bs);
3702 return KERN_RESOURCE_SHORTAGE;
3703 }
3704
3705 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3706 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3707 PSL_LOCK();
3708 dp_pages_free += ps->ps_pgcount;
3709 PSL_UNLOCK();
3710
3711 BS_UNLOCK(bs);
3712
3713 bs_more_space(ps->ps_clcount);
3714
3715 DEBUG(DEBUG_BS_INTERNAL,
3716 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3717 device, offset, size, record_size,
3718 ps->ps_record_shift, ps->ps_pgnum));
3719
3720 return KERN_SUCCESS;
3721 }
3722
3723
3724
3725 kern_return_t
3726 ps_read_file(
3727 paging_segment_t ps,
3728 upl_t upl,
3729 vm_offset_t upl_offset,
3730 vm_offset_t offset,
3731 unsigned int size,
3732 unsigned int *residualp,
3733 int flags)
3734 {
3735 vm_object_offset_t f_offset;
3736 int error = 0;
3737 int result;
3738
3739
3740 clustered_reads[atop(size)]++;
3741
3742 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3743
3744 /* for transfer case we need to pass uploffset and flags */
3745 error = vnode_pagein(ps->ps_vnode,
3746 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3747
3748 /* The vnode_pagein semantic is somewhat at odds with the existing */
3749 /* device_read semantic. Partial reads are not experienced at this */
3750 /* level. It is up to the bit map code and cluster read code to */
3751 /* check that requested data locations are actually backed, and the */
3752 /* pagein code to either read all of the requested data or return an */
3753 /* error. */
3754
3755 if (error)
3756 result = KERN_FAILURE;
3757 else {
3758 *residualp = 0;
3759 result = KERN_SUCCESS;
3760 }
3761 return result;
3762 }
3763
3764 kern_return_t
3765 ps_write_file(
3766 paging_segment_t ps,
3767 upl_t upl,
3768 vm_offset_t upl_offset,
3769 vm_offset_t offset,
3770 unsigned int size,
3771 int flags)
3772 {
3773 vm_object_offset_t f_offset;
3774 kern_return_t result;
3775
3776 int error = 0;
3777
3778 clustered_writes[atop(size)]++;
3779 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3780
3781 if (vnode_pageout(ps->ps_vnode,
3782 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3783 result = KERN_FAILURE;
3784 else
3785 result = KERN_SUCCESS;
3786
3787 return result;
3788 }
3789
3790 kern_return_t
3791 default_pager_triggers(MACH_PORT_FACE default_pager,
3792 int hi_wat,
3793 int lo_wat,
3794 int flags,
3795 MACH_PORT_FACE trigger_port)
3796 {
3797 MACH_PORT_FACE release;
3798 kern_return_t kr;
3799
3800 PSL_LOCK();
3801 if (flags == HI_WAT_ALERT) {
3802 release = min_pages_trigger_port;
3803 min_pages_trigger_port = trigger_port;
3804 minimum_pages_remaining = hi_wat/vm_page_size;
3805 bs_low = FALSE;
3806 kr = KERN_SUCCESS;
3807 } else if (flags == LO_WAT_ALERT) {
3808 release = max_pages_trigger_port;
3809 max_pages_trigger_port = trigger_port;
3810 maximum_pages_free = lo_wat/vm_page_size;
3811 kr = KERN_SUCCESS;
3812 } else {
3813 release = trigger_port;
3814 kr = KERN_INVALID_ARGUMENT;
3815 }
3816 PSL_UNLOCK();
3817
3818 if (IP_VALID(release))
3819 ipc_port_release_send(release);
3820
3821 return kr;
3822 }