osfmk/default_pager/dp_backing_store.c (apple/xnu, xnu-124.13)
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * @OSF_COPYRIGHT@
24 */
25 /*
26 * Mach Operating System
27 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
28 * All Rights Reserved.
29 *
30 * Permission to use, copy, modify and distribute this software and its
31 * documentation is hereby granted, provided that both the copyright
32 * notice and this permission notice appear in all copies of the
33 * software, derivative works or modified versions, and any portions
34 * thereof, and that both notices appear in supporting documentation.
35 *
36 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
37 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
38 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
39 *
40 * Carnegie Mellon requests users of this software to return to
41 *
42 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
43 * School of Computer Science
44 * Carnegie Mellon University
45 * Pittsburgh PA 15213-3890
46 *
47 * any improvements or extensions that they make and grant Carnegie Mellon
48 * the rights to redistribute these changes.
49 */
50
51 /*
52 * Default Pager.
53 * Paging File Management.
54 */
55
56 #include <mach/memory_object_server.h>
57 #include "default_pager_internal.h"
58 #include <default_pager/default_pager_alerts.h>
59 #include <ipc/ipc_port.h>
60 #include <ipc/ipc_space.h>
61 #include <kern/queue.h>
62 #include <kern/counters.h>
63 #include <kern/sched_prim.h>
64 #include <vm/vm_kern.h>
65 #include <vm/vm_pageout.h>
66 /* CDY CDY */
67 #include <vm/vm_map.h>
68
69 /* MAXPHYS is derived from bsd/bsd/ppc/param.h; we need either a */
70 /* universal value originating in the kernel, or a formal means of */
71 /* exporting it from the bsd component. */
72
73 #define MAXPHYS (64 * 1024)
74 int physical_transfer_cluster_count = 0;
75
76 #define VM_SUPER_CLUSTER 0x10000
77
78 /*
79 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
80 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
81 */
82 #define VSTRUCT_DEF_CLSHIFT 2
83 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
84 int default_pager_clsize = 0;
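
/*
 * Illustrative sketch, not part of the original source: because the
 * cluster size is kept as a power-of-two shift, pages-per-cluster is
 * simply 1 << clshift, so VSTRUCT_DEF_CLSHIFT == 2 means 4 pages per
 * cluster (16K with 4K pages).  The hypothetical helper below only
 * restates that arithmetic.
 */
#if 0	/* explanatory sketch only */
static unsigned int
cluster_bytes(int clshift, unsigned int page_size)
{
	return ((unsigned int)1 << clshift) * page_size;
}
#endif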
85
86 /* statistics */
87 unsigned int clustered_writes[MAX_CLUSTER_SIZE+1];
88 unsigned int clustered_reads[MAX_CLUSTER_SIZE+1];
89
90 /*
91 * Globals used for asynchronous paging operations:
92 * vs_async_list: head of list of to-be-completed I/O ops
93 * async_num_queued: number of pages completed, but not yet
94 * processed by async thread.
95 * async_requests_out: number of pages of requests not completed.
96 */
97
98 #if 0
99 struct vs_async *vs_async_list;
100 int async_num_queued;
101 int async_requests_out;
102 #endif
103
104
105 #define VS_ASYNC_REUSE 1
106 struct vs_async *vs_async_free_list;
107
108 mutex_t default_pager_async_lock; /* Protects globals above */
109
110
111 int vs_alloc_async_failed = 0; /* statistics */
112 int vs_alloc_async_count = 0; /* statistics */
113 struct vs_async *vs_alloc_async(void); /* forward */
114 void vs_free_async(struct vs_async *vsa); /* forward */
115
116
117 #define VS_ALLOC_ASYNC() vs_alloc_async()
118 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
119
120 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
121 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
122 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \
123 ETAP_IO_DEV_PAGEH)
124 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
125 /*
126 * Paging Space Hysteresis triggers and the target notification port
127 *
128 */
129
130 unsigned int minimum_pages_remaining = 0;
131 unsigned int maximum_pages_free = 0;
132 ipc_port_t min_pages_trigger_port = NULL;
133 ipc_port_t max_pages_trigger_port = NULL;
134
135 boolean_t bs_low = FALSE;
136
137
138
139 /*
140 * Object sizes are rounded up to the next power of 2,
141 * unless they are bigger than a given maximum size.
142 */
143 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
144
145 /*
146 * List of all backing store and segments.
147 */
148 struct backing_store_list_head backing_store_list;
149 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
150 mutex_t paging_segments_lock;
151 int paging_segment_max = 0;
152 int paging_segment_count = 0;
153 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
154
155
156 /*
157 * Total pages free in system
158 * This differs from clusters committed/avail, which is a measure of the
159 * over-commitment of paging segments to backing store, an idea which is
160 * likely to be deprecated.
161 */
162 unsigned int dp_pages_free = 0;
163 unsigned int cluster_transfer_minimum = 100;
164
165 kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */
166
167 default_pager_thread_t *
168 get_read_buffer()
169 {
170 int i;
171
172 DPT_LOCK(dpt_lock);
173 while(TRUE) {
174 for (i=0; i<default_pager_internal_count; i++) {
175 if(dpt_array[i]->checked_out == FALSE) {
176 dpt_array[i]->checked_out = TRUE;
177 DPT_UNLOCK(dpt_lock);
178 return dpt_array[i];
179 }
180 }
181 assert_wait(&dpt_array, THREAD_UNINT);
182 DPT_UNLOCK(dpt_lock);
183 thread_block((void(*)(void))0);
184 }
185 }
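
/*
 * Sketch of the presumed release path (an assumption, not shown in
 * this section): a buffer obtained from get_read_buffer() would be
 * handed back by clearing checked_out under the same lock and waking
 * any thread blocked on &dpt_array, roughly as below.
 */
#if 0	/* explanatory sketch only */
void
return_read_buffer(default_pager_thread_t *dpt)
{
	DPT_LOCK(dpt_lock);
	dpt->checked_out = FALSE;
	thread_wakeup(&dpt_array);	/* pairs with the assert_wait above */
	DPT_UNLOCK(dpt_lock);
}
#endif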
186
187 void
188 bs_initialize(void)
189 {
190 int i;
191
192 /*
193 * List of all backing store.
194 */
195 BSL_LOCK_INIT();
196 queue_init(&backing_store_list.bsl_queue);
197 PSL_LOCK_INIT();
198
199 VS_ASYNC_LOCK_INIT();
200 #if VS_ASYNC_REUSE
201 vs_async_free_list = NULL;
202 #endif /* VS_ASYNC_REUSE */
203
204 for (i = 0; i < MAX_CLUSTER_SIZE+1; i++) {
205 clustered_writes[i] = 0;
206 clustered_reads[i] = 0;
207 }
208
209 }
210
211 /*
212 * When things do not quite work out...
213 */
214 void bs_no_paging_space(boolean_t); /* forward */
215
216 void
217 bs_no_paging_space(
218 boolean_t out_of_memory)
219 {
220 static char here[] = "bs_no_paging_space";
221
222 if (out_of_memory)
223 dprintf(("*** OUT OF MEMORY ***\n"));
224 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
225 }
226
227 void bs_more_space(int); /* forward */
228 void bs_commit(int); /* forward */
229
230 boolean_t user_warned = FALSE;
231 unsigned int clusters_committed = 0;
232 unsigned int clusters_available = 0;
233 unsigned int clusters_committed_peak = 0;
234
235 void
236 bs_more_space(
237 int nclusters)
238 {
239 BSL_LOCK();
240 /*
241 * Account for new paging space.
242 */
243 clusters_available += nclusters;
244
245 if (clusters_available >= clusters_committed) {
246 if (verbose && user_warned) {
247 printf("%s%s - %d excess clusters now.\n",
248 my_name,
249 "paging space is OK now",
250 clusters_available - clusters_committed);
251 user_warned = FALSE;
252 clusters_committed_peak = 0;
253 }
254 } else {
255 if (verbose && user_warned) {
256 printf("%s%s - still short of %d clusters.\n",
257 my_name,
258 "WARNING: paging space over-committed",
259 clusters_committed - clusters_available);
260 clusters_committed_peak -= nclusters;
261 }
262 }
263 BSL_UNLOCK();
264
265 return;
266 }
267
268 void
269 bs_commit(
270 int nclusters)
271 {
272 BSL_LOCK();
273 clusters_committed += nclusters;
274 if (clusters_committed > clusters_available) {
275 if (verbose && !user_warned) {
276 user_warned = TRUE;
277 printf("%s%s - short of %d clusters.\n",
278 my_name,
279 "WARNING: paging space over-committed",
280 clusters_committed - clusters_available);
281 }
282 if (clusters_committed > clusters_committed_peak) {
283 clusters_committed_peak = clusters_committed;
284 }
285 } else {
286 if (verbose && user_warned) {
287 printf("%s%s - was short of up to %d clusters.\n",
288 my_name,
289 "paging space is OK now",
290 clusters_committed_peak - clusters_available);
291 user_warned = FALSE;
292 clusters_committed_peak = 0;
293 }
294 }
295 BSL_UNLOCK();
296
297 return;
298 }
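
/*
 * Worked example (illustrative): with clusters_available == 6, a
 * bs_commit(10) raises clusters_committed to 10, prints the
 * over-commit warning ("short of 4 clusters") when verbose, and
 * records a peak of 10.  A later bs_more_space(4) brings availability
 * level with commitment and prints the all-clear, resetting
 * user_warned and the peak.
 */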
299
300 int default_pager_info_verbose = 1;
301
302 void
303 bs_global_info(
304 vm_size_t *totalp,
305 vm_size_t *freep)
306 {
307 vm_size_t pages_total, pages_free;
308 paging_segment_t ps;
309 int i;
310 static char here[] = "bs_global_info";
311
312 PSL_LOCK();
313 pages_total = pages_free = 0;
314 for (i = 0; i <= paging_segment_max; i++) {
315 ps = paging_segments[i];
316 if (ps == PAGING_SEGMENT_NULL)
317 continue;
318
319 /*
320 * no need to lock: by the time this data
321 * gets back to any remote requestor it
322 * will be obsolete anyway
323 */
324 pages_total += ps->ps_pgnum;
325 pages_free += ps->ps_clcount << ps->ps_clshift;
326 DEBUG(DEBUG_BS_INTERNAL,
327 ("segment #%d: %d total, %d free\n",
328 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
329 }
330 *totalp = pages_total;
331 *freep = pages_free;
332 if (verbose && user_warned && default_pager_info_verbose) {
333 if (clusters_available < clusters_committed) {
334 printf("%s %d clusters committed, %d available.\n",
335 my_name,
336 clusters_committed,
337 clusters_available);
338 }
339 }
340 PSL_UNLOCK();
341 }
342
343 backing_store_t backing_store_alloc(void); /* forward */
344
345 backing_store_t
346 backing_store_alloc(void)
347 {
348 backing_store_t bs;
349 static char here[] = "backing_store_alloc";
350
351 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
352 if (bs == BACKING_STORE_NULL)
353 panic("backing_store_alloc: no memory");
354
355 BS_LOCK_INIT(bs);
356 bs->bs_port = MACH_PORT_NULL;
357 bs->bs_priority = 0;
358 bs->bs_clsize = 0;
359 bs->bs_pages_total = 0;
360 bs->bs_pages_in = 0;
361 bs->bs_pages_in_fail = 0;
362 bs->bs_pages_out = 0;
363 bs->bs_pages_out_fail = 0;
364
365 return bs;
366 }
367
368 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
369
370 /* In both the component-space and external versions of this pager, */
371 /* backing_store_lookup will be called from tasks in the application space. */
372 backing_store_t
373 backing_store_lookup(
374 MACH_PORT_FACE port)
375 {
376 backing_store_t bs;
377
378 /*
379 The port is currently backed with a vs structure in the alias field.
380 We could create an ISBS alias and a port_is_bs call, but frankly
381 I see no reason for the test: the bs->bs_port == port check below
382 will work properly on junk entries.
383
384 if ((port == MACH_PORT_NULL) || port_is_vs(port))
385 */
386 if ((port == MACH_PORT_NULL))
387 return BACKING_STORE_NULL;
388
389 BSL_LOCK();
390 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
391 bs_links) {
392 BS_LOCK(bs);
393 if (bs->bs_port == port) {
394 BSL_UNLOCK();
395 /* Success, return it locked. */
396 return bs;
397 }
398 BS_UNLOCK(bs);
399 }
400 BSL_UNLOCK();
401 return BACKING_STORE_NULL;
402 }
403
404 void backing_store_add(backing_store_t); /* forward */
405
406 void
407 backing_store_add(
408 backing_store_t bs)
409 {
410 MACH_PORT_FACE port = bs->bs_port;
411 MACH_PORT_FACE pset = default_pager_default_set;
412 kern_return_t kr = KERN_SUCCESS;
413 static char here[] = "backing_store_add";
414
415 if (kr != KERN_SUCCESS)
416 panic("backing_store_add: add to set");
417
418 }
419
420 /*
421 * Set up default page shift, but only if not already
422 * set and argument is within range.
423 */
424 boolean_t
425 bs_set_default_clsize(unsigned int npages)
426 {
427 switch(npages){
428 case 1:
429 case 2:
430 case 4:
431 case 8:
432 if (default_pager_clsize == 0) /* if not yet set */
433 vstruct_def_clshift = local_log2(npages);
434 return(TRUE);
435 }
436 return(FALSE);
437 }
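
/*
 * Worked example (illustrative): bs_set_default_clsize(4) sets
 * vstruct_def_clshift to local_log2(4) == 2, i.e. 4 pages per cluster,
 * provided the cluster size has not been fixed yet; any npages outside
 * {1, 2, 4, 8} falls through and returns FALSE.
 */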
438
439 int bs_get_global_clsize(int clsize); /* forward */
440
441 int
442 bs_get_global_clsize(
443 int clsize)
444 {
445 int i;
446 MACH_PORT_FACE DMM;
447 kern_return_t kr;
448 static char here[] = "bs_get_global_clsize";
449
450 /*
451 * Only allow setting of cluster size once. If called
452 * with no cluster size (default), we use the compiled-in default
453 * for the duration. The same cluster size is used for all
454 * paging segments.
455 */
456 if (default_pager_clsize == 0) {
457 if (norma_mk) {
458 /*
459 * On NORMA, don't use clustered paging because
460 * XMM can't handle it.
461 */
462 vstruct_def_clshift = 0;
463 }
464 /*
465 * Keep cluster size in bit shift because it's quicker
466 * arithmetic, and easier to keep at a power of 2.
467 */
468 if (clsize != NO_CLSIZE) {
469 for (i = 0; (1 << i) < clsize; i++);
470 if (i > MAX_CLUSTER_SHIFT)
471 i = MAX_CLUSTER_SHIFT;
472 vstruct_def_clshift = i;
473 }
474 default_pager_clsize = (1 << vstruct_def_clshift);
475
476 /*
477 * Let the user know the new (and definitive) cluster size.
478 */
479 if (verbose)
480 printf("%scluster size = %d page%s\n",
481 my_name, default_pager_clsize,
482 (default_pager_clsize == 1) ? "" : "s");
483 /*
484 * Let the kernel know too, in case it hasn't used the
485 * default value provided in main() yet.
486 */
487 DMM = default_pager_default_port;
488 clsize = default_pager_clsize * vm_page_size; /* in bytes */
489 kr = host_default_memory_manager(host_priv_self(),
490 &DMM,
491 clsize);
492 if (kr != KERN_SUCCESS) {
493 panic("bs_get_global_cl_size:host_default_memory_manager");
494 }
495 if (DMM != default_pager_default_port) {
496 panic("bs_get_global_cl_size:there is another default pager");
497 }
498 }
499 ASSERT(default_pager_clsize > 0 &&
500 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
501
502 return default_pager_clsize;
503 }
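
/*
 * Illustrative sketch, not part of the original source: the loop above
 * finds the smallest shift i with (1 << i) >= clsize, i.e. it rounds a
 * requested cluster size up to the next power of two before capping it
 * at MAX_CLUSTER_SHIFT.  The hypothetical helper restates that search.
 */
#if 0	/* explanatory sketch only */
static int
round_clsize_to_shift(int clsize)
{
	int i;

	for (i = 0; (1 << i) < clsize; i++)
		continue;
	return i;	/* e.g. clsize 3 -> 2 (4 pages), clsize 4 -> 2 */
}
#endif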
504
505 kern_return_t
506 default_pager_backing_store_create(
507 MACH_PORT_FACE pager,
508 int priority,
509 int clsize, /* in bytes */
510 MACH_PORT_FACE *backing_store)
511 {
512 backing_store_t bs;
513 MACH_PORT_FACE port;
514 kern_return_t kr;
515 struct vstruct_alias *alias_struct;
516 static char here[] = "default_pager_backing_store_create";
517
518 if (pager != default_pager_default_port)
519 return KERN_INVALID_ARGUMENT;
520
521 bs = backing_store_alloc();
522 port = ipc_port_alloc_kernel();
523 ipc_port_make_send(port);
524 assert (port != IP_NULL);
525
526 DEBUG(DEBUG_BS_EXTERNAL,
527 ("priority=%d clsize=%d bs_port=0x%x\n",
528 priority, clsize, (int) backing_store));
529
530 alias_struct = (struct vstruct_alias *)
531 kalloc(sizeof (struct vstruct_alias));
532 if(alias_struct != NULL) {
533 alias_struct->vs = (struct vstruct *)bs;
534 alias_struct->name = ISVS;
535 port->alias = (int) alias_struct;
536 }
537 else {
538 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
539 kfree((vm_offset_t)bs, sizeof (struct backing_store));
540 return KERN_RESOURCE_SHORTAGE;
541 }
542
543 bs->bs_port = port;
544 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
545 priority = BS_MAXPRI;
546 else if (priority == BS_NOPRI)
547 priority = BS_MAXPRI;
548 else
549 priority = BS_MINPRI;
550 bs->bs_priority = priority;
551
552 bs->bs_clsize = bs_get_global_clsize(atop(clsize));
553
554 BSL_LOCK();
555 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
556 bs_links);
557 BSL_UNLOCK();
558
559 backing_store_add(bs);
560
561 *backing_store = port;
562 return KERN_SUCCESS;
563 }
564
565 kern_return_t
566 default_pager_backing_store_info(
567 MACH_PORT_FACE backing_store,
568 backing_store_flavor_t flavour,
569 backing_store_info_t info,
570 mach_msg_type_number_t *size)
571 {
572 backing_store_t bs;
573 backing_store_basic_info_t basic;
574 int i;
575 paging_segment_t ps;
576
577 if (flavour != BACKING_STORE_BASIC_INFO ||
578 *size < BACKING_STORE_BASIC_INFO_COUNT)
579 return KERN_INVALID_ARGUMENT;
580
581 basic = (backing_store_basic_info_t)info;
582 *size = BACKING_STORE_BASIC_INFO_COUNT;
583
584 VSTATS_LOCK(&global_stats.gs_lock);
585 basic->pageout_calls = global_stats.gs_pageout_calls;
586 basic->pagein_calls = global_stats.gs_pagein_calls;
587 basic->pages_in = global_stats.gs_pages_in;
588 basic->pages_out = global_stats.gs_pages_out;
589 basic->pages_unavail = global_stats.gs_pages_unavail;
590 basic->pages_init = global_stats.gs_pages_init;
591 basic->pages_init_writes= global_stats.gs_pages_init_writes;
592 VSTATS_UNLOCK(&global_stats.gs_lock);
593
594 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
595 return KERN_INVALID_ARGUMENT;
596
597 basic->bs_pages_total = bs->bs_pages_total;
598 PSL_LOCK();
599 bs->bs_pages_free = 0;
600 for (i = 0; i <= paging_segment_max; i++) {
601 ps = paging_segments[i];
602 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
603 PS_LOCK(ps);
604 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
605 PS_UNLOCK(ps);
606 }
607 }
608 PSL_UNLOCK();
609 basic->bs_pages_free = bs->bs_pages_free;
610 basic->bs_pages_in = bs->bs_pages_in;
611 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
612 basic->bs_pages_out = bs->bs_pages_out;
613 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
614
615 basic->bs_priority = bs->bs_priority;
616 basic->bs_clsize = ptoa(bs->bs_clsize); /* in bytes */
617
618 BS_UNLOCK(bs);
619
620 return KERN_SUCCESS;
621 }
622
623 int ps_delete(paging_segment_t); /* forward */
624
625 int
626 ps_delete(
627 paging_segment_t ps)
628 {
629 vstruct_t vs;
630 kern_return_t error = KERN_SUCCESS;
631 int vs_count;
632
633 VSL_LOCK(); /* get the lock on the list of vs's */
634
635 /* The lock relationship and sequence are fairly complicated: */
636 /* this code looks at a live list, locking and unlocking the list */
637 /* as it traverses it. It depends on the locking behavior of */
638 /* default_pager_no_senders. no_senders always locks the vstruct */
639 /* targeted for removal before locking the vstruct list. However */
640 /* it will remove that member of the list without locking its */
641 /* neighbors. We can be sure when we hold a lock on a vstruct */
642 /* it cannot be removed from the list but we must hold the list */
643 /* lock to be sure that its pointers to its neighbors are valid. */
644 /* Also, we can hold off destruction of a vstruct when the list */
645 /* lock and the vs locks are not being held by bumping the */
646 /* vs_async_pending count. */
647
648 /* we will choose instead to hold a send right */
649 vs_count = vstruct_list.vsl_count;
650 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
651 if(vs == (vstruct_t)&vstruct_list) {
652 VSL_UNLOCK();
653 return KERN_SUCCESS;
654 }
655 VS_LOCK(vs);
656 vs_async_wait(vs); /* wait for any pending async writes */
657 if ((vs_count != 0) && (vs != NULL))
658 vs->vs_async_pending += 1; /* hold parties calling */
659 /* vs_async_wait */
660 VS_UNLOCK(vs);
661 VSL_UNLOCK();
662 while((vs_count != 0) && (vs != NULL)) {
663 /* We take the count of AMO's before beginning the */
664 /* transfer of the target segment. */
665 /* We are guaranteed that the target segment cannot get */
666 /* more users. We also know that queue entries are */
667 /* made at the back of the list. If some of the entries */
668 /* we would check disappear while we are traversing the */
669 /* list then we will either check new entries which */
670 /* do not have any backing store in the target segment */
671 /* or re-check old entries. This might not be optimal */
672 /* but it will always be correct. The alternative is to */
673 /* take a snapshot of the list. */
674 vstruct_t next_vs;
675
676 if(dp_pages_free < cluster_transfer_minimum)
677 error = KERN_FAILURE;
678 else {
679 vm_object_t transfer_object;
680 upl_t upl;
681
682 transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
683 error = vm_fault_list_request(transfer_object,
684 (vm_object_offset_t)0,
685 VM_SUPER_CLUSTER, &upl, NULL,
686 0, UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
687 | UPL_SET_INTERNAL);
688 if(error == KERN_SUCCESS) {
689 #ifndef ubc_sync_working
690 uc_upl_commit(upl, NULL);
691 error = ps_vstruct_transfer_from_segment(
692 vs, ps, transfer_object);
693 #else
694 error = ps_vstruct_transfer_from_segment(
695 vs, ps, upl);
696 uc_upl_commit(upl, NULL);
697 #endif
698 vm_object_deallocate(transfer_object);
699 } else {
700 vm_object_deallocate(transfer_object);
701 error = KERN_FAILURE;
702 }
703 }
704 if(error) {
705 VS_LOCK(vs);
706 vs->vs_async_pending -= 1; /* release vs_async_wait */
707 if (vs->vs_async_pending == 0) {
708 VS_UNLOCK(vs);
709 thread_wakeup(&vs->vs_waiting_async);
710 } else {
711 VS_UNLOCK(vs);
712 }
713 return KERN_FAILURE;
714 }
715
716 VSL_LOCK();
717 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
718 if((next_vs != (vstruct_t)&vstruct_list) &&
719 (vs != next_vs) && (vs_count != 1)) {
720 VS_LOCK(next_vs);
721 vs_async_wait(next_vs); /* wait for any */
722 /* pending async writes */
723 next_vs->vs_async_pending += 1; /* hold parties */
724 /* calling vs_async_wait */
725 VS_UNLOCK(next_vs);
726 }
727 VSL_UNLOCK();
728 VS_LOCK(vs);
729 vs->vs_async_pending -= 1;
730 if (vs->vs_async_pending == 0) {
731 VS_UNLOCK(vs);
732 thread_wakeup(&vs->vs_waiting_async);
733 } else {
734 VS_UNLOCK(vs);
735 }
736 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
737 vs = NULL;
738 else
739 vs = next_vs;
740 vs_count--;
741 }
742 return KERN_SUCCESS;
743 }
744
745
746 kern_return_t
747 default_pager_backing_store_delete(
748 MACH_PORT_FACE backing_store)
749 {
750 backing_store_t bs;
751 int i;
752 paging_segment_t ps;
753 int error;
754 int interim_pages_removed = 0;
755 kern_return_t kr;
756 static char here[] = "default_pager_backing_store_delete";
757
758 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
759 return KERN_INVALID_ARGUMENT;
760
761 #if 0
762 /* not implemented */
763 BS_UNLOCK(bs);
764 return KERN_FAILURE;
765 #endif
766
767 restart:
768 PSL_LOCK();
769 error = KERN_SUCCESS;
770 for (i = 0; i <= paging_segment_max; i++) {
771 ps = paging_segments[i];
772 if (ps != PAGING_SEGMENT_NULL &&
773 ps->ps_bs == bs &&
774 ! ps->ps_going_away) {
775 PS_LOCK(ps);
776 /* disable access to this segment */
777 ps->ps_going_away = TRUE;
778 PS_UNLOCK(ps);
779 /*
780 * The "ps" segment is "off-line" now,
781 * we can try and delete it...
782 */
783 if(dp_pages_free < (cluster_transfer_minimum
784 + ps->ps_pgcount)) {
785 error = KERN_FAILURE;
786 PSL_UNLOCK();
787 }
788 else {
789 /* remove all pages associated with the */
790 /* segment from the list of free pages */
791 /* when transfer is through, all target */
792 /* segment pages will appear to be free */
793
794 dp_pages_free -= ps->ps_pgcount;
795 interim_pages_removed += ps->ps_pgcount;
796 PSL_UNLOCK();
797 error = ps_delete(ps);
798 }
799 if (error != KERN_SUCCESS) {
800 /*
801 * We couldn't delete the segment,
802 * probably because there's not enough
803 * virtual memory left.
804 * Re-enable all the segments.
805 */
806 PSL_LOCK();
807 break;
808 }
809 goto restart;
810 }
811 }
812
813 if (error != KERN_SUCCESS) {
814 for (i = 0; i <= paging_segment_max; i++) {
815 ps = paging_segments[i];
816 if (ps != PAGING_SEGMENT_NULL &&
817 ps->ps_bs == bs &&
818 ps->ps_going_away) {
819 PS_LOCK(ps);
820 /* re-enable access to this segment */
821 ps->ps_going_away = FALSE;
822 PS_UNLOCK(ps);
823 }
824 }
825 dp_pages_free += interim_pages_removed;
826 PSL_UNLOCK();
827 BS_UNLOCK(bs);
828 return error;
829 }
830
831 for (i = 0; i <= paging_segment_max; i++) {
832 ps = paging_segments[i];
833 if (ps != PAGING_SEGMENT_NULL &&
834 ps->ps_bs == bs) {
835 if(ps->ps_going_away) {
836 paging_segments[i] = PAGING_SEGMENT_NULL;
837 paging_segment_count--;
838 PS_LOCK(ps);
839 kfree((vm_offset_t)ps->ps_bmap,
840 RMAPSIZE(ps->ps_ncls));
841 kfree((vm_offset_t)ps, sizeof *ps);
842 }
843 }
844 }
845
846 /* Scan the entire ps array separately to make certain we find the */
847 /* proper paging_segment_max */
848 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
849 if(paging_segments[i] != PAGING_SEGMENT_NULL)
850 paging_segment_max = i;
851 }
852
853 PSL_UNLOCK();
854
855 /*
856 * All the segments have been deleted.
857 * We can remove the backing store.
858 */
859
860 /*
861 * Disable lookups of this backing store.
862 */
863 if((void *)bs->bs_port->alias != NULL)
864 kfree((vm_offset_t) bs->bs_port->alias,
865 sizeof (struct vstruct_alias));
866 pager_mux_hash_delete((ipc_port_t) (bs->bs_port));
867 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
868 bs->bs_port = MACH_PORT_NULL;
869 BS_UNLOCK(bs);
870
871 /*
872 * Remove backing store from backing_store list.
873 */
874 BSL_LOCK();
875 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
876 bs_links);
877 BSL_UNLOCK();
878
879 /*
880 * Free the backing store structure.
881 */
882 kfree((vm_offset_t)bs, sizeof *bs);
883
884 return KERN_SUCCESS;
885 }
886
887 int ps_enter(paging_segment_t); /* forward */
888
889 int
890 ps_enter(
891 paging_segment_t ps)
892 {
893 int i;
894
895 PSL_LOCK();
896
897 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
898 if (paging_segments[i] == PAGING_SEGMENT_NULL)
899 break;
900 }
901
902 if (i < MAX_NUM_PAGING_SEGMENTS) {
903 paging_segments[i] = ps;
904 if (i > paging_segment_max)
905 paging_segment_max = i;
906 paging_segment_count++;
907 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
908 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
909 ps_select_array[ps->ps_bs->bs_priority] = 0;
910 i = 0;
911 } else {
912 PSL_UNLOCK();
913 return KERN_RESOURCE_SHORTAGE;
914 }
915
916 PSL_UNLOCK();
917 return i;
918 }
919
920 #ifdef DEVICE_PAGING
921 kern_return_t
922 default_pager_add_segment(
923 MACH_PORT_FACE backing_store,
924 MACH_PORT_FACE device,
925 recnum_t offset,
926 recnum_t count,
927 int record_size)
928 {
929 backing_store_t bs;
930 paging_segment_t ps;
931 int i;
932 int error;
933 static char here[] = "default_pager_add_segment";
934
935 if ((bs = backing_store_lookup(backing_store))
936 == BACKING_STORE_NULL)
937 return KERN_INVALID_ARGUMENT;
938
939 PSL_LOCK();
940 for (i = 0; i <= paging_segment_max; i++) {
941 ps = paging_segments[i];
942 if (ps == PAGING_SEGMENT_NULL)
943 continue;
944
945 /*
946 * Check for overlap on same device.
947 */
948 if (!(ps->ps_device != device
949 || offset >= ps->ps_offset + ps->ps_recnum
950 || offset + count <= ps->ps_offset)) {
951 PSL_UNLOCK();
952 BS_UNLOCK(bs);
953 return KERN_INVALID_ARGUMENT;
954 }
955 }
956 PSL_UNLOCK();
957
958 /*
959 * Set up the paging segment
960 */
961 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
962 if (ps == PAGING_SEGMENT_NULL) {
963 BS_UNLOCK(bs);
964 return KERN_RESOURCE_SHORTAGE;
965 }
966
967 ps->ps_segtype = PS_PARTITION;
968 ps->ps_device = device;
969 ps->ps_offset = offset;
970 ps->ps_record_shift = local_log2(vm_page_size / record_size);
971 ps->ps_recnum = count;
972 ps->ps_pgnum = count >> ps->ps_record_shift;
973
974 ps->ps_pgcount = ps->ps_pgnum;
975 ps->ps_clshift = local_log2(bs->bs_clsize);
976 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
977 ps->ps_hint = 0;
978
979 PS_LOCK_INIT(ps);
980 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
981 if (!ps->ps_bmap) {
982 kfree((vm_offset_t)ps, sizeof *ps);
983 BS_UNLOCK(bs);
984 return KERN_RESOURCE_SHORTAGE;
985 }
986 for (i = 0; i < ps->ps_ncls; i++) {
987 clrbit(ps->ps_bmap, i);
988 }
989
990 ps->ps_going_away = FALSE;
991 ps->ps_bs = bs;
992
993 if ((error = ps_enter(ps)) != 0) {
994 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
995 kfree((vm_offset_t)ps, sizeof *ps);
996 BS_UNLOCK(bs);
997 return KERN_RESOURCE_SHORTAGE;
998 }
999
1000 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1001 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1002 BS_UNLOCK(bs);
1003
1004 PSL_LOCK();
1005 dp_pages_free += ps->ps_pgcount;
1006 PSL_UNLOCK();
1007
1008 bs_more_space(ps->ps_clcount);
1009
1010 DEBUG(DEBUG_BS_INTERNAL,
1011 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1012 device, offset, count, record_size,
1013 ps->ps_record_shift, ps->ps_pgnum));
1014
1015 return KERN_SUCCESS;
1016 }
1017
1018 boolean_t
1019 bs_add_device(
1020 char *dev_name,
1021 MACH_PORT_FACE master)
1022 {
1023 security_token_t null_security_token = {
1024 { 0, 0 }
1025 };
1026 MACH_PORT_FACE device;
1027 int info[DEV_GET_SIZE_COUNT];
1028 mach_msg_type_number_t info_count;
1029 MACH_PORT_FACE bs = MACH_PORT_NULL;
1030 unsigned int rec_size;
1031 recnum_t count;
1032 int clsize;
1033 MACH_PORT_FACE reply_port;
1034
1035 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1036 null_security_token, dev_name, &device))
1037 return FALSE;
1038
1039 info_count = DEV_GET_SIZE_COUNT;
1040 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1041 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1042 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1043 clsize = bs_get_global_clsize(0);
1044 if (!default_pager_backing_store_create(
1045 default_pager_default_port,
1046 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1047 (clsize * vm_page_size),
1048 &bs)) {
1049 if (!default_pager_add_segment(bs, device,
1050 0, count, rec_size)) {
1051 return TRUE;
1052 }
1053 ipc_port_release_receive(bs);
1054 }
1055 }
1056
1057 ipc_port_release_send(device);
1058 return FALSE;
1059 }
1060 #endif /* DEVICE_PAGING */
1061
1062 #if VS_ASYNC_REUSE
1063
1064 struct vs_async *
1065 vs_alloc_async(void)
1066 {
1067 struct vs_async *vsa;
1068 MACH_PORT_FACE reply_port;
1069 kern_return_t kr;
1070
1071 VS_ASYNC_LOCK();
1072 if (vs_async_free_list == NULL) {
1073 VS_ASYNC_UNLOCK();
1074 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1075 if (vsa != NULL) {
1076 /*
1077 * Try allocating a reply port named after the
1078 * address of the vs_async structure.
1079 */
1080 struct vstruct_alias *alias_struct;
1081
1082 reply_port = ipc_port_alloc_kernel();
1083 alias_struct = (struct vstruct_alias *)
1084 kalloc(sizeof (struct vstruct_alias));
1085 if(alias_struct != NULL) {
1086 alias_struct->vs = (struct vstruct *)vsa;
1087 alias_struct->name = ISVS;
1088 reply_port->alias = (int) alias_struct;
1089 vsa->reply_port = reply_port;
1090 vs_alloc_async_count++;
1091 }
1092 else {
1093 vs_alloc_async_failed++;
1094 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1095 (reply_port));
1096 kfree((vm_offset_t)vsa,
1097 sizeof (struct vs_async));
1098 vsa = NULL;
1099 }
1100 }
1101 } else {
1102 vsa = vs_async_free_list;
1103 vs_async_free_list = vs_async_free_list->vsa_next;
1104 VS_ASYNC_UNLOCK();
1105 }
1106
1107 return vsa;
1108 }
1109
1110 void
1111 vs_free_async(
1112 struct vs_async *vsa)
1113 {
1114 VS_ASYNC_LOCK();
1115 vsa->vsa_next = vs_async_free_list;
1116 vs_async_free_list = vsa;
1117 VS_ASYNC_UNLOCK();
1118 }
1119
1120 #else /* VS_ASYNC_REUSE */
1121
1122 struct vs_async *
1123 vs_alloc_async(void)
1124 {
1125 struct vs_async *vsa;
1126 MACH_PORT_FACE reply_port;
1127 kern_return_t kr;
struct vstruct_alias *alias_struct;
1128
1129 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1130 if (vsa != NULL) {
1131 /*
1132 * Try allocating a reply port named after the
1133 * address of the vs_async structure.
1134 */
1135 reply_port = ipc_port_alloc_kernel();
1136 alias_struct = (struct vstruct_alias *)
1137 kalloc(sizeof (struct vstruct_alias));
1138 if(alias_struct != NULL) {
1139 alias_struct->vs = (struct vstruct *)vsa;
1140 alias_struct->name = ISVS;
1141 reply_port->alias = (int) alias_struct;
1142 vsa->reply_port = reply_port;
1143 vs_alloc_async_count++;
1144 }
1145 else {
1146 vs_alloc_async_failed++;
1147 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1148 (reply_port));
1149 kfree((vm_offset_t) vsa,
1150 sizeof (struct vs_async));
1151 vsa = NULL;
1152 }
1153 }
1154
1155 return vsa;
1156 }
1157
1158 void
1159 vs_free_async(
1160 struct vs_async *vsa)
1161 {
1162 static char here[] = "vs_free_async";
1163 MACH_PORT_FACE reply_port;
1164 kern_return_t kr;
1165
1166 reply_port = vsa->reply_port;
1167 kfree((vm_offset_t) reply_port->alias, sizeof (struct vstruct_alias));
1168 kfree((vm_offset_t) vsa, sizeof (struct vs_async));
1169 pager_mux_hash_delete(reply_port);
1170 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1171 #if 0
1172 VS_ASYNC_LOCK();
1173 vs_alloc_async_count--;
1174 VS_ASYNC_UNLOCK();
1175 #endif
1176 }
1177
1178 #endif /* VS_ASYNC_REUSE */
1179
1180 vstruct_t
1181 ps_vstruct_create(
1182 vm_size_t size)
1183 {
1184 vstruct_t vs;
1185 int i;
1186 static char here[] = "ps_vstruct_create";
1187
1188 vs = (vstruct_t) kalloc(sizeof (struct vstruct));
1189 if (vs == VSTRUCT_NULL) {
1190 return VSTRUCT_NULL;
1191 }
1192
1193 VS_LOCK_INIT(vs);
1194
1195 /*
1196 * The following fields will be provided later.
1197 */
1198 vs->vs_mem_obj_port = MACH_PORT_NULL;
1199 vs->vs_seqno = 0;
1200 vs->vs_control_port = MACH_PORT_NULL;
1201 vs->vs_control_refs = 0;
1202 vs->vs_object_name = MACH_PORT_NULL;
1203 vs->vs_name_refs = 0;
1204
1205 #ifdef MACH_KERNEL
1206 vs->vs_waiting_seqno = FALSE;
1207 vs->vs_waiting_read = FALSE;
1208 vs->vs_waiting_write = FALSE;
1209 vs->vs_waiting_refs = FALSE;
1210 vs->vs_waiting_async = FALSE;
1211 #else
1212 mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
1213 mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
1214 mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
1215 mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
1216 mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
1217 #endif
1218
1219 vs->vs_readers = 0;
1220 vs->vs_writers = 0;
1221
1222 vs->vs_errors = 0;
1223
1224 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1225 vs->vs_size = ((atop(round_page(size)) - 1) >> vs->vs_clshift) + 1;
1226 vs->vs_async_pending = 0;
1227
1228 /*
1229 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1230 * bytes depending on the size of the memory object.
1231 */
1232 if (INDIRECT_CLMAP(vs->vs_size)) {
1233 vs->vs_imap = (struct vs_map **)
1234 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1235 vs->vs_indirect = TRUE;
1236 } else {
1237 vs->vs_dmap = (struct vs_map *)
1238 kalloc(CLMAP_SIZE(vs->vs_size));
1239 vs->vs_indirect = FALSE;
1240 }
1241 vs->vs_xfer_pending = FALSE;
1242 DEBUG(DEBUG_VS_INTERNAL,
1243 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1244
1245 /*
1246 * Check to see that we got the space.
1247 */
1248 if (!vs->vs_dmap) {
1249 kfree((vm_offset_t)vs, sizeof *vs);
1250 return VSTRUCT_NULL;
1251 }
1252
1253 /*
1254 * Zero the indirect pointers, or clear the direct pointers.
1255 */
1256 if (vs->vs_indirect)
1257 memset(vs->vs_imap, 0,
1258 INDIRECT_CLMAP_SIZE(vs->vs_size));
1259 else
1260 for (i = 0; i < vs->vs_size; i++)
1261 VSM_CLR(vs->vs_dmap[i]);
1262
1263 VS_MAP_LOCK_INIT(vs);
1264
1265 bs_commit(vs->vs_size);
1266
1267 return vs;
1268 }
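
/*
 * Worked example (illustrative): vs_size above is the number of
 * clusters needed to cover the object, i.e. ceil(pages / 2^clshift).
 * With 4K pages, clshift == 2 (16K clusters) and size == 70K:
 * atop(round_page(70K)) == 18 pages, and ((18 - 1) >> 2) + 1 == 5
 * clusters.  The hypothetical helper below restates the computation.
 */
#if 0	/* explanatory sketch only */
static unsigned int
clusters_for_size(vm_size_t size, int clshift)
{
	unsigned int pages = atop(round_page(size));

	return ((pages - 1) >> clshift) + 1;
}
#endif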
1269
1270 paging_segment_t ps_select_segment(int, int *); /* forward */
1271
1272 paging_segment_t
1273 ps_select_segment(
1274 int shift,
1275 int *psindex)
1276 {
1277 paging_segment_t ps;
1278 int i;
1279 int j;
1280 static char here[] = "ps_select_segment";
1281
1282 /*
1283 * Optimize case where there's only one segment.
1284 * paging_segment_max will index the one and only segment.
1285 */
1286
1287 PSL_LOCK();
1288 if (paging_segment_count == 1) {
1289 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1290
1291 ps = paging_segments[paging_segment_max];
1292 *psindex = paging_segment_max;
1293 PS_LOCK(ps);
1294 if (ps->ps_going_away) {
1295 /* this segment is being turned off */
1296 lps = PAGING_SEGMENT_NULL;
1297 } else {
1298 ASSERT(ps->ps_clshift >= shift);
1299 if (ps->ps_clcount) {
1300 ps->ps_clcount--;
1301 dp_pages_free -= 1 << ps->ps_clshift;
1302 if(min_pages_trigger_port &&
1303 (dp_pages_free < minimum_pages_remaining)) {
1304 default_pager_space_alert(
1305 min_pages_trigger_port,
1306 HI_WAT_ALERT);
1307 min_pages_trigger_port = NULL;
1308 bs_low = TRUE;
1309 }
1310 lps = ps;
1311 } else
1312 lps = PAGING_SEGMENT_NULL;
1313 }
1314 PS_UNLOCK(ps);
1315 PSL_UNLOCK();
1316 return lps;
1317 }
1318
1319 if (paging_segment_count == 0) {
1320 PSL_UNLOCK();
1321 return PAGING_SEGMENT_NULL;
1322 }
1323
1324 for (i = BS_MAXPRI;
1325 i >= BS_MINPRI; i--) {
1326 int start_index;
1327
1328 if ((ps_select_array[i] == BS_NOPRI) ||
1329 (ps_select_array[i] == BS_FULLPRI))
1330 continue;
1331 start_index = ps_select_array[i];
1332
1333 if(!(paging_segments[start_index])) {
1334 j = start_index+1;
1335 physical_transfer_cluster_count = 0;
1336 }
1337 else if ((physical_transfer_cluster_count+1) == (MAXPHYS >>
1338 (((paging_segments[start_index])->ps_clshift)
1339 + page_shift))) {
1340 physical_transfer_cluster_count = 0;
1341 j = start_index + 1;
1342 } else {
1343 physical_transfer_cluster_count+=1;
1344 j = start_index;
1345 if(start_index == 0)
1346 start_index = paging_segment_max;
1347 else
1348 start_index = start_index - 1;
1349 }
1350
1351 while (1) {
1352 if (j > paging_segment_max)
1353 j = 0;
1354 if ((ps = paging_segments[j]) &&
1355 (ps->ps_bs->bs_priority == i)) {
1356 /*
1357 * Force the ps cluster size to be
1358 * >= that of the vstruct.
1359 */
1360 PS_LOCK(ps);
1361 if (ps->ps_going_away) {
1362 /* this segment is being turned off */
1363 } else if ((ps->ps_clcount) &&
1364 (ps->ps_clshift >= shift)) {
1365 ps->ps_clcount--;
1366 dp_pages_free -= 1 << ps->ps_clshift;
1367 if(min_pages_trigger_port &&
1368 (dp_pages_free <
1369 minimum_pages_remaining)) {
1370 default_pager_space_alert(
1371 min_pages_trigger_port,
1372 HI_WAT_ALERT);
1373 min_pages_trigger_port = NULL;
1374 }
1375 PS_UNLOCK(ps);
1376 /*
1377 * found one, quit looking.
1378 */
1379 ps_select_array[i] = j;
1380 PSL_UNLOCK();
1381 *psindex = j;
1382 return ps;
1383 }
1384 PS_UNLOCK(ps);
1385 }
1386 if (j == start_index) {
1387 /*
1388 * none at this priority -- mark it full
1389 */
1390 ps_select_array[i] = BS_FULLPRI;
1391 break;
1392 }
1393 j++;
1394 }
1395 }
1396 PSL_UNLOCK();
1397 return PAGING_SEGMENT_NULL;
1398 }
1399
1400 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1401
1402 vm_offset_t
1403 ps_allocate_cluster(
1404 vstruct_t vs,
1405 int *psindex,
1406 paging_segment_t use_ps)
1407 {
1408 int byte_num;
1409 int bit_num = 0;
1410 paging_segment_t ps;
1411 vm_offset_t cluster;
1412 static char here[] = "ps_allocate_cluster";
1413
1414 /*
1415 * Find best paging segment.
1416 * ps_select_segment will decrement cluster count on ps.
1417 * Must pass cluster shift to find the most appropriate segment.
1418 */
1419 /* NOTE: The addition of paging segment delete capability threatened
1420 * to seriously complicate the treatment of paging segments in this
1421 * module and the ones that call it (notably ps_clmap), because of the
1422 * difficulty in assuring that the paging segment would continue to
1423 * exist between being unlocked and locked. This was
1424 * avoided because all calls to this module are based either on
1425 * dp_memory_object calls, which rely on the vs lock, or on
1426 * the transfer function, which is part of the segment delete path.
1427 * The transfer function which is part of paging segment delete is
1428 * protected from multiple callers by the backing store lock.
1429 * The paging segment delete function treats mappings to a paging
1430 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1431 * while data is transferred to the remaining segments. This is in
1432 * line with the view that incomplete or in-transition mappings between
1433 * data, a vstruct, and backing store are protected by the vs lock.
1434 * This and the ordering of the paging segment "going_away" bit setting
1435 * protects us.
1436 */
1437 if (use_ps != PAGING_SEGMENT_NULL) {
1438 ps = use_ps;
1439 PSL_LOCK();
1440 PS_LOCK(ps);
1441 ps->ps_clcount--;
1442 dp_pages_free -= 1 << ps->ps_clshift;
1443 PSL_UNLOCK();
1444 if(min_pages_trigger_port &&
1445 (dp_pages_free < minimum_pages_remaining)) {
1446 default_pager_space_alert(
1447 min_pages_trigger_port,
1448 HI_WAT_ALERT);
1449 min_pages_trigger_port = NULL;
1450 }
1451 PS_UNLOCK(ps);
1452 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1453 PAGING_SEGMENT_NULL) {
1454 #if 0
1455 bs_no_paging_space(TRUE);
1456 #endif
1457 #if 0
1458 if (verbose)
1459 #endif
1460 dprintf(("no space in available paging segments; "
1461 "swapon suggested\n"));
1462 /* the count may have drifted; reset it to zero */
1463 dp_pages_free = 0;
1464 if(min_pages_trigger_port) {
1465 default_pager_space_alert(
1466 min_pages_trigger_port, HI_WAT_ALERT);
1467 min_pages_trigger_port = NULL;
1468 bs_low = TRUE;
1469 }
1470 return (vm_offset_t) -1;
1471 }
1472 ASSERT(ps->ps_clcount != 0);
1473
1474 /*
1475 * Look for an available cluster. At the end of the loop,
1476 * byte_num is the byte offset and bit_num is the bit offset of the
1477 * first zero bit in the paging segment bitmap.
1478 */
1479 PS_LOCK(ps);
1480 byte_num = ps->ps_hint;
1481 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1482 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1483 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1484 if (isclr((ps->ps_bmap + byte_num), bit_num))
1485 break;
1486 }
1487 ASSERT(bit_num != NBBY);
1488 break;
1489 }
1490 }
1491 ps->ps_hint = byte_num;
1492 cluster = (byte_num*NBBY) + bit_num;
1493
1494 /* Space was reserved, so this must be true */
1495 ASSERT(cluster < ps->ps_ncls);
1496
1497 setbit(ps->ps_bmap, cluster);
1498 PS_UNLOCK(ps);
1499
1500 return cluster;
1501 }
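
/*
 * Illustrative sketch, not part of the original source: the bitmap scan
 * above starts at the ps_hint byte, skips bytes whose clusters are all
 * allocated (== BYTEMASK), and returns the first clear bit as
 * byte_num * NBBY + bit_num.  The hypothetical helper below redoes the
 * search over a plain byte array.
 */
#if 0	/* explanatory sketch only */
static int
first_free_cluster(unsigned char *bmap, int ncls, int hint_byte)
{
	int byte_num, bit_num;

	for (byte_num = hint_byte; byte_num < howmany(ncls, NBBY); byte_num++) {
		if (bmap[byte_num] == BYTEMASK)
			continue;	/* every cluster in this byte is in use */
		for (bit_num = 0; bit_num < NBBY; bit_num++)
			if (isclr(bmap + byte_num, bit_num))
				return (byte_num * NBBY) + bit_num;
	}
	return -1;	/* no free cluster at or after the hint */
}
#endif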
1502
1503 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1504
1505 void
1506 ps_deallocate_cluster(
1507 paging_segment_t ps,
1508 vm_offset_t cluster)
1509 {
1510
1511 if (cluster >= (vm_offset_t) ps->ps_ncls)
1512 panic("ps_deallocate_cluster: Invalid cluster number");
1513
1514 /*
1515 * Lock the paging segment, clear the cluster's bit in the bitmap and
1516 * increment the number of free clusters.
1517 */
1518 PSL_LOCK();
1519 PS_LOCK(ps);
1520 clrbit(ps->ps_bmap, cluster);
1521 ++ps->ps_clcount;
1522 dp_pages_free += 1 << ps->ps_clshift;
1523 PSL_UNLOCK();
1524 if(max_pages_trigger_port && (dp_pages_free > maximum_pages_free)) {
1525 default_pager_space_alert(max_pages_trigger_port, LO_WAT_ALERT);
1526 max_pages_trigger_port = NULL;
1527 }
1528
1529 /*
1530 * Move the hint down to the freed cluster if it is
1531 * less than the current hint.
1532 */
1533 if ((cluster/NBBY) < ps->ps_hint) {
1534 ps->ps_hint = (cluster/NBBY);
1535 }
1536
1537 PS_UNLOCK(ps);
1538
1539 /*
1540 * If we're freeing space on a full priority, reset the array.
1541 */
1542 PSL_LOCK();
1543 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1544 ps_select_array[ps->ps_bs->bs_priority] = 0;
1545 PSL_UNLOCK();
1546
1547 return;
1548 }
1549
1550 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1551
1552 void
1553 ps_dealloc_vsmap(
1554 struct vs_map *vsmap,
1555 vm_size_t size)
1556 {
1557 int i;
1558 for (i = 0; i < size; i++)
1559 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1560 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1561 VSM_CLOFF(vsmap[i]));
1562 }
1563
1564 void
1565 ps_vstruct_dealloc(
1566 vstruct_t vs)
1567 {
1568 int i;
1569 spl_t s;
1570 static char here[] = "ps_vstruct_dealloc";
1571
1572 VS_MAP_LOCK(vs);
1573
1574 /*
1575 * If this is an indirect structure, then we walk through the valid
1576 * (non-zero) indirect pointers and deallocate the clusters
1577 * associated with each used map entry (via ps_dealloc_vsmap).
1578 * When all of the clusters in an indirect block have been
1579 * freed, we deallocate the block. When all of the indirect
1580 * blocks have been deallocated we deallocate the memory
1581 * holding the indirect pointers.
1582 */
1583 if (vs->vs_indirect) {
1584 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1585 if (vs->vs_imap[i] != NULL) {
1586 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1587 kfree((vm_offset_t)vs->vs_imap[i],
1588 CLMAP_THRESHOLD);
1589 }
1590 }
1591 kfree((vm_offset_t)vs->vs_imap,
1592 INDIRECT_CLMAP_SIZE(vs->vs_size));
1593 } else {
1594 /*
1595 * Direct map. Free used clusters, then memory.
1596 */
1597 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1598 kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1599 }
1600 VS_MAP_UNLOCK(vs);
1601
1602 bs_commit(- vs->vs_size);
1603
1604 ip_lock(vs_to_port(vs));
1605 (vs_to_port(vs))->ip_destination = 0;
1606 (vs_to_port(vs))->ip_receiver_name = MACH_PORT_NULL;
1607
1608 s= splsched();
1609 imq_lock(&vs_to_port(vs)->ip_messages);
1610 (vs_to_port(vs))->ip_mscount = 0;
1611 (vs_to_port(vs))->ip_messages.imq_seqno = 0;
1612 imq_unlock(&vs_to_port(vs)->ip_messages);
1613 splx(s);
1614
1615 ip_unlock(vs_to_port(vs));
1616 pager_mux_hash_delete((ipc_port_t) vs_to_port(vs));
1617 ipc_port_release_receive(vs_to_port(vs));
1618 /*
1619 * Do this *after* deallocating the port name
1620 */
1621 kfree((vm_offset_t)vs, sizeof *vs);
1622 }
1623
1624 int ps_map_extend(vstruct_t, int); /* forward */
1625
1626 int ps_map_extend(
1627 vstruct_t vs,
1628 int new_size)
1629 {
1630 struct vs_map **new_imap;
1631 struct vs_map *new_dmap = NULL;
1632 int newdsize;
1633 int i;
1634 void *old_map = NULL;
1635 int old_map_size = 0;
1636
1637 if (vs->vs_size >= new_size) {
1638 /*
1639 * Someone has already done the work.
1640 */
1641 return 0;
1642 }
1643
1644 /*
1645 * If the new size extends into the indirect range, then we have one
1646 * of two cases: we are going from indirect to indirect, or we are
1647 * going from direct to indirect. If we are going from indirect to
1648 * indirect, then it is possible that the new size will fit in the old
1649 * indirect map. If this is the case, then just reset the size of the
1650 * vstruct map and we are done. If the new size will not
1651 * fit into the old indirect map, then we have to allocate a new
1652 * indirect map and copy the old map pointers into this new map.
1653 *
1654 * If we are going from direct to indirect, then we have to allocate a
1655 * new indirect map and copy the old direct pages into the first
1656 * indirect page of the new map.
1657 * NOTE: allocating memory here is dangerous, as we're in the
1658 * pageout path.
1659 */
1660 if (INDIRECT_CLMAP(new_size)) {
1661 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1662
1663 /*
1664 * Get a new indirect map and zero it.
1665 */
1666 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1667 if (vs->vs_indirect &&
1668 (new_map_size == old_map_size)) {
1669 bs_commit(new_size - vs->vs_size);
1670 vs->vs_size = new_size;
1671 return 0;
1672 }
1673
1674 new_imap = (struct vs_map **)kalloc(new_map_size);
1675 if (new_imap == NULL) {
1676 return -1;
1677 }
1678 memset(new_imap, 0, new_map_size);
1679
1680 if (vs->vs_indirect) {
1681 /* Copy old entries into new map */
1682 memcpy(new_imap, vs->vs_imap, old_map_size);
1683 /* Arrange to free the old map */
1684 old_map = (void *) vs->vs_imap;
1685 newdsize = 0;
1686 } else { /* Old map was a direct map */
1687 /* Allocate an indirect page */
1688 if ((new_imap[0] = (struct vs_map *)
1689 kalloc(CLMAP_THRESHOLD)) == NULL) {
1690 kfree((vm_offset_t)new_imap, new_map_size);
1691 return -1;
1692 }
1693 new_dmap = new_imap[0];
1694 newdsize = CLMAP_ENTRIES;
1695 }
1696 } else {
1697 new_imap = NULL;
1698 newdsize = new_size;
1699 /*
1700 * If the new map is a direct map, then the old map must
1701 * also have been a direct map. All we have to do is
1702 * to allocate a new direct map, copy the old entries
1703 * into it and free the old map.
1704 */
1705 if ((new_dmap = (struct vs_map *)
1706 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1707 return -1;
1708 }
1709 }
1710 if (newdsize) {
1711
1712 /* Free the old map */
1713 old_map = (void *) vs->vs_dmap;
1714 old_map_size = CLMAP_SIZE(vs->vs_size);
1715
1716 /* Copy info from the old map into the new map */
1717 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1718
1719 /* Initialize the rest of the new map */
1720 for (i = vs->vs_size; i < newdsize; i++)
1721 VSM_CLR(new_dmap[i]);
1722 }
1723 if (new_imap) {
1724 vs->vs_imap = new_imap;
1725 vs->vs_indirect = TRUE;
1726 } else
1727 vs->vs_dmap = new_dmap;
1728 bs_commit(new_size - vs->vs_size);
1729 vs->vs_size = new_size;
1730 if (old_map)
1731 kfree((vm_offset_t)old_map, old_map_size);
1732 return 0;
1733 }
1734
1735 vm_offset_t
1736 ps_clmap(
1737 vstruct_t vs,
1738 vm_offset_t offset,
1739 struct clmap *clmap,
1740 int flag,
1741 vm_size_t size,
1742 int error)
1743 {
1744 vm_offset_t cluster; /* The cluster of offset. */
1745 vm_offset_t newcl; /* The new cluster allocated. */
1746 vm_offset_t newoff;
1747 int i;
1748 struct vs_map *vsmap;
1749 static char here[] = "ps_clmap";
1750
1751 VS_MAP_LOCK(vs);
1752
1753 ASSERT(vs->vs_dmap);
1754 cluster = atop(offset) >> vs->vs_clshift;
1755
1756 /*
1757 * Initialize cluster error value
1758 */
1759 clmap->cl_error = 0;
1760
1761 /*
1762 * If the object has grown, extend the page map.
1763 */
1764 if (cluster >= vs->vs_size) {
1765 if (flag == CL_FIND) {
1766 /* Do not allocate if just doing a lookup */
1767 VS_MAP_UNLOCK(vs);
1768 return (vm_offset_t) -1;
1769 }
1770 if (ps_map_extend(vs, cluster + 1)) {
1771 VS_MAP_UNLOCK(vs);
1772 return (vm_offset_t) -1;
1773 }
1774 }
1775
1776 /*
1777 * Look for the desired cluster. If the map is indirect, then we
1778 * have a two level lookup. First find the indirect block, then
1779 * find the actual cluster. If the indirect block has not yet
1780 * been allocated, then do so. If the cluster has not yet been
1781 * allocated, then do so.
1782 *
1783 * If any of the allocations fail, then return an error.
1784 * Don't allocate if just doing a lookup.
1785 */
1786 if (vs->vs_indirect) {
1787 long ind_block = cluster/CLMAP_ENTRIES;
1788
1789 /* Is the indirect block allocated? */
1790 vsmap = vs->vs_imap[ind_block];
1791 if (vsmap == NULL) {
1792 if (flag == CL_FIND) {
1793 VS_MAP_UNLOCK(vs);
1794 return (vm_offset_t) -1;
1795 }
1796
1797 /* Allocate the indirect block */
1798 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1799 if (vsmap == NULL) {
1800 VS_MAP_UNLOCK(vs);
1801 return (vm_offset_t) -1;
1802 }
1803 /* Initialize the cluster offsets */
1804 for (i = 0; i < CLMAP_ENTRIES; i++)
1805 VSM_CLR(vsmap[i]);
1806 vs->vs_imap[ind_block] = vsmap;
1807 }
1808 } else
1809 vsmap = vs->vs_dmap;
1810
1811 ASSERT(vsmap);
1812 vsmap += cluster%CLMAP_ENTRIES;
1813
1814 /*
1815 * At this point, vsmap points to the struct vs_map desired.
1816 *
1817 * Look in the map for the cluster, if there was an error on a
1818 * previous write, flag it and return. If it is not yet
1819 * allocated, then allocate it, if we're writing; if we're
1820 * doing a lookup and the cluster's not allocated, return error.
1821 */
1822 if (VSM_ISERR(*vsmap)) {
1823 clmap->cl_error = VSM_GETERR(*vsmap);
1824 VS_MAP_UNLOCK(vs);
1825 return (vm_offset_t) -1;
1826 } else if (VSM_ISCLR(*vsmap)) {
1827 int psindex;
1828
1829 if (flag == CL_FIND) {
1830 /*
1831 * If there's an error and the entry is clear, then
1832 * we've run out of swap space. Record the error
1833 * here and return.
1834 */
1835 if (error) {
1836 VSM_SETERR(*vsmap, error);
1837 }
1838 VS_MAP_UNLOCK(vs);
1839 return (vm_offset_t) -1;
1840 } else {
1841 /*
1842 * Attempt to allocate a cluster from the paging segment
1843 */
1844 newcl = ps_allocate_cluster(vs, &psindex,
1845 PAGING_SEGMENT_NULL);
1846 if (newcl == -1) {
1847 VS_MAP_UNLOCK(vs);
1848 return (vm_offset_t) -1;
1849 }
1850 VSM_CLR(*vsmap);
1851 VSM_SETCLOFF(*vsmap, newcl);
1852 VSM_SETPS(*vsmap, psindex);
1853 }
1854 } else
1855 newcl = VSM_CLOFF(*vsmap);
1856
1857 /*
1858 * Fill in pertinent fields of the clmap
1859 */
1860 clmap->cl_ps = VSM_PS(*vsmap);
1861 clmap->cl_numpages = VSCLSIZE(vs);
1862 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1863
1864 /*
1865 * Byte offset in paging segment is byte offset to cluster plus
1866 * byte offset within cluster. It looks ugly, but should be
1867 * relatively quick.
1868 */
1869 ASSERT(trunc_page(offset) == offset);
1870 newcl = ptoa(newcl) << vs->vs_clshift;
1871 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1872 if (flag == CL_ALLOC) {
1873 /*
1874 * set bits in the allocation bitmap according to which
1875 * pages were requested. size is in bytes.
1876 */
1877 i = atop(newoff);
1878 while ((size > 0) && (i < VSCLSIZE(vs))) {
1879 VSM_SETALLOC(*vsmap, i);
1880 i++;
1881 size -= vm_page_size;
1882 }
1883 }
1884 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1885 if (newoff) {
1886 /*
1887 * Offset is not cluster aligned, so number of pages
1888 * and bitmaps must be adjusted
1889 */
1890 clmap->cl_numpages -= atop(newoff);
1891 CLMAP_SHIFT(clmap, vs);
1892 CLMAP_SHIFTALLOC(clmap, vs);
1893 }
1894
1895 /*
1896 *
1897 * The setting of valid bits and handling of write errors
1898 * must be done here, while we hold the lock on the map.
1899 * It logically should be done in ps_vs_write_complete().
1900 * The size and error information has been passed from
1901 * ps_vs_write_complete(). If the size parameter is non-zero,
1902 * then there is work to be done. If error is also non-zero,
1903 * then the error number is recorded in the cluster and the
1904 * entire cluster is in error.
1905 */
1906 if (size && flag == CL_FIND) {
1907 vm_offset_t off = (vm_offset_t) 0;
1908
1909 if (!error) {
1910 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1911 i++) {
1912 VSM_SETPG(*vsmap, i);
1913 size -= vm_page_size;
1914 }
1915 ASSERT(i <= VSCLSIZE(vs));
1916 } else {
1917 BS_STAT(clmap->cl_ps->ps_bs,
1918 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1919 atop(size));
1920 off = VSM_CLOFF(*vsmap);
1921 VSM_SETERR(*vsmap, error);
1922 }
1923 /*
1924 * Deallocate cluster if error, and no valid pages
1925 * already present.
1926 */
1927 if (off != (vm_offset_t) 0)
1928 ps_deallocate_cluster(clmap->cl_ps, off);
1929 VS_MAP_UNLOCK(vs);
1930 return (vm_offset_t) 0;
1931 } else
1932 VS_MAP_UNLOCK(vs);
1933
1934 DEBUG(DEBUG_VS_INTERNAL,
1935 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1936 newcl+newoff, (int) vs, (int) vsmap, flag));
1937 DEBUG(DEBUG_VS_INTERNAL,
1938 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1939 (int) clmap->cl_ps, clmap->cl_numpages,
1940 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1941
1942 return (newcl + newoff);
1943 }
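
/*
 * Worked example (illustrative): ps_clmap() splits a page-aligned
 * vstruct byte offset into a cluster index and an offset within that
 * cluster.  With 4K pages and clshift == 2 (16K clusters), offset 72K
 * gives cluster atop(72K) >> 2 == 18 >> 2 == 4 and an in-cluster
 * remainder of 72K & (16K - 1) == 8K.  The value returned to the
 * caller is the segment byte offset of the allocated cluster,
 * ptoa(newcl) << clshift, plus that remainder.
 */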
1944
1945 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1946
1947 void
1948 ps_clunmap(
1949 vstruct_t vs,
1950 vm_offset_t offset,
1951 vm_size_t length)
1952 {
1953 vm_offset_t cluster; /* The cluster number of offset */
1954 struct vs_map *vsmap;
1955 static char here[] = "ps_clunmap";
1956
1957 VS_MAP_LOCK(vs);
1958
1959 /*
1960 * Loop through all clusters in this range, freeing paging segment
1961 * clusters and map entries as encountered.
1962 */
1963 while (length > 0) {
1964 vm_offset_t newoff;
1965 int i;
1966
1967 cluster = atop(offset) >> vs->vs_clshift;
1968 if (vs->vs_indirect) /* indirect map */
1969 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
1970 else
1971 vsmap = vs->vs_dmap;
1972 if (vsmap == NULL) {
1973 VS_MAP_UNLOCK(vs);
1974 return;
1975 }
1976 vsmap += cluster%CLMAP_ENTRIES;
1977 if (VSM_ISCLR(*vsmap)) {
1978 length -= vm_page_size;
1979 offset += vm_page_size;
1980 continue;
1981 }
1982 /*
1983 * We've got a valid mapping. Clear it and deallocate
1984 * paging segment cluster pages.
1985 * Optimize for entire cluster clearing.
1986 */
1987 if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
1988 /*
1989 * Not cluster aligned.
1990 */
1991 ASSERT(trunc_page(newoff) == newoff);
1992 i = atop(newoff);
1993 } else
1994 i = 0;
1995 while ((i < VSCLSIZE(vs)) && (length > 0)) {
1996 VSM_CLRPG(*vsmap, i);
1997 VSM_CLRALLOC(*vsmap, i);
1998 length -= vm_page_size;
1999 offset += vm_page_size;
2000 i++;
2001 }
2002
2003 /*
2004 * If map entry is empty, clear and deallocate cluster.
2005 */
2006 if (!VSM_ALLOC(*vsmap)) {
2007 ps_deallocate_cluster(VSM_PS(*vsmap),
2008 VSM_CLOFF(*vsmap));
2009 VSM_CLR(*vsmap);
2010 }
2011 }
2012
2013 VS_MAP_UNLOCK(vs);
2014 }
2015
2016 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2017
2018 void
2019 ps_vs_write_complete(
2020 vstruct_t vs,
2021 vm_offset_t offset,
2022 vm_size_t size,
2023 int error)
2024 {
2025 struct clmap clmap;
2026
2027 /*
2028 * Get the struct vsmap for this cluster.
2029 * Use CL_FIND, even though the cluster was written, because the
2030 * cluster MUST be present, unless there was an error
2031 * in the original ps_clmap (e.g. no space), in which
2032 * case, nothing happens.
2033 *
2034 * Must pass enough information to ps_clmap to allow it
2035 * to set the vs_map structure bitmap under lock.
2036 */
2037 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2038 }
2039
2040 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2041
2042 void
2043 vs_cl_write_complete(
2044 vstruct_t vs,
2045 paging_segment_t ps,
2046 vm_offset_t offset,
2047 vm_offset_t addr,
2048 vm_size_t size,
2049 boolean_t async,
2050 int error)
2051 {
2052 static char here[] = "vs_cl_write_complete";
2053 kern_return_t kr;
2054
2055 if (error) {
2056 /*
2057 * For internal objects, the error is recorded on a
2058 * per-cluster basis by ps_clmap() which is called
2059 * by ps_vs_write_complete() below.
2060 */
2061 dprintf(("write failed error = 0x%x\n", error));
2062 /* add upl_abort code here */
2063 } else
2064 GSTAT(global_stats.gs_pages_out += atop(size));
2065 /*
2066 * Notify the vstruct mapping code, so it can do its accounting.
2067 */
2068 ps_vs_write_complete(vs, offset, size, error);
2069
2070 if (async) {
2071 VS_LOCK(vs);
2072 ASSERT(vs->vs_async_pending > 0);
2073 vs->vs_async_pending -= size;
2074 if (vs->vs_async_pending == 0) {
2075 VS_UNLOCK(vs);
2076 /* mutex_unlock(&vs->vs_waiting_async); */
2077 thread_wakeup(&vs->vs_waiting_async);
2078 } else {
2079 VS_UNLOCK(vs);
2080 }
2081 }
2082 }
2083
2084 #ifdef DEVICE_PAGING
2085 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2086
2087 kern_return_t
2088 device_write_reply(
2089 MACH_PORT_FACE reply_port,
2090 kern_return_t device_code,
2091 io_buf_len_t bytes_written)
2092 {
2093 struct vs_async *vsa;
2094 static char here[] = "device_write_reply";
2095
2096 vsa = (struct vs_async *)
2097 ((struct vstruct_alias *)(reply_port->alias))->vs;
2098
2099 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2100 device_code = KERN_FAILURE;
2101 }
2102
2103 vsa->vsa_error = device_code;
2104
2105
2106 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2107 if(vsa->vsa_flags & VSA_TRANSFER) {
2108 /* revisit when async disk segments redone */
2109 if(vsa->vsa_error) {
2110 /* need to consider error condition. re-write data or */
2111 /* throw it away here. */
2112 vm_offset_t ioaddr;
2113 if(vm_map_copyout(kernel_map, &ioaddr,
2114 (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
2115 panic("vs_cluster_write: unable to copy source list\n");
2116 vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
2117 }
2118 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2119 vsa->vsa_size, vsa->vsa_error);
2120 } else {
2121 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2122 vsa->vsa_addr, vsa->vsa_size, TRUE,
2123 vsa->vsa_error);
2124 }
2125 VS_FREE_ASYNC(vsa);
2126
2127 return KERN_SUCCESS;
2128 }
2129
2130 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2131 kern_return_t
2132 device_write_reply_inband(
2133 MACH_PORT_FACE reply_port,
2134 kern_return_t return_code,
2135 io_buf_len_t bytes_written)
2136 {
2137 panic("device_write_reply_inband: illegal");
2138 return KERN_SUCCESS;
2139 }
2140
2141 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2142 kern_return_t
2143 device_read_reply(
2144 MACH_PORT_FACE reply_port,
2145 kern_return_t return_code,
2146 io_buf_ptr_t data,
2147 mach_msg_type_number_t dataCnt)
2148 {
2149 struct vs_async *vsa;
2150 vsa = (struct vs_async *)
2151 ((struct vstruct_alias *)(reply_port->alias))->vs;
2152 vsa->vsa_addr = (vm_offset_t)data;
2153 vsa->vsa_size = (vm_size_t)dataCnt;
2154 vsa->vsa_error = return_code;
2155 thread_wakeup(&vsa->vsa_lock);
2156 return KERN_SUCCESS;
2157 }
2158
2159 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2160 kern_return_t
2161 device_read_reply_inband(
2162 MACH_PORT_FACE reply_port,
2163 kern_return_t return_code,
2164 io_buf_ptr_inband_t data,
2165 mach_msg_type_number_t dataCnt)
2166 {
2167 panic("device_read_reply_inband: illegal");
2168 return KERN_SUCCESS;
2169 }
2170
2171 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2172 kern_return_t
2173 device_read_reply_overwrite(
2174 MACH_PORT_FACE reply_port,
2175 kern_return_t return_code,
2176 io_buf_len_t bytes_read)
2177 {
2178 panic("device_read_reply_overwrite: illegal\n");
2179 return KERN_SUCCESS;
2180 }
2181
2182 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2183 kern_return_t
2184 device_open_reply(
2185 MACH_PORT_FACE reply_port,
2186 kern_return_t return_code,
2187 MACH_PORT_FACE device_port)
2188 {
2189 panic("device_open_reply: illegal\n");
2190 return KERN_SUCCESS;
2191 }
2192
2193 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2194
2195 kern_return_t
2196 ps_read_device(
2197 paging_segment_t ps,
2198 vm_offset_t offset,
2199 vm_offset_t *bufferp,
2200 unsigned int size,
2201 unsigned int *residualp,
2202 int flags)
2203 {
2204 kern_return_t kr;
2205 recnum_t dev_offset;
2206 unsigned int bytes_wanted;
2207 unsigned int bytes_read;
2208 unsigned int total_read;
2209 vm_offset_t dev_buffer;
2210 vm_offset_t buf_ptr;
2211 unsigned int records_read;
2212 static char here[] = "ps_read_device";
2213 struct vs_async *vsa;
2214 mutex_t vs_waiting_read_reply;
2215
2216 device_t device;
2217 vm_map_copy_t device_data = NULL;
2218 default_pager_thread_t *dpt = NULL;
2219
2220 device = dev_port_lookup(ps->ps_device);
2221 clustered_reads[atop(size)]++;
2222
2223 dev_offset = (ps->ps_offset +
2224 (offset >> (vm_page_shift - ps->ps_record_shift)));
2225 bytes_wanted = size;
2226 total_read = 0;
2227 *bufferp = (vm_offset_t)NULL;
2228
2229 do {
2230 vsa = VS_ALLOC_ASYNC();
2231 if (vsa) {
2232 vsa->vsa_vs = NULL;
2233 vsa->vsa_addr = 0;
2234 vsa->vsa_offset = 0;
2235 vsa->vsa_size = 0;
2236 vsa->vsa_ps = NULL;
2237 }
2238 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2239 ip_lock(vsa->reply_port);
2240 vsa->reply_port->ip_sorights++;
2241 ip_reference(vsa->reply_port);
2242 ip_unlock(vsa->reply_port);
2243 kr = ds_device_read_common(device,
2244 vsa->reply_port,
2245 (mach_msg_type_name_t)
2246 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2247 (dev_mode_t) 0,
2248 dev_offset,
2249 bytes_wanted,
2250 (IO_READ | IO_CALL),
2251 (io_buf_ptr_t *) &dev_buffer,
2252 (mach_msg_type_number_t *) &bytes_read);
2253 if(kr == MIG_NO_REPLY) {
2254 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2255 thread_block((void(*)(void))0);
2256
2257 dev_buffer = vsa->vsa_addr;
2258 bytes_read = (unsigned int)vsa->vsa_size;
2259 kr = vsa->vsa_error;
2260 }
2261 VS_FREE_ASYNC(vsa);
2262 if (kr != KERN_SUCCESS || bytes_read == 0) {
2263 break;
2264 }
2265 total_read += bytes_read;
2266
2267 /*
2268 * If we got the entire range, use the returned dev_buffer.
2269 */
2270 if (bytes_read == size) {
2271 *bufferp = (vm_offset_t)dev_buffer;
2272 break;
2273 }
2274
2275 #if 1
2276 dprintf(("read only %d bytes out of %d\n",
2277 bytes_read, bytes_wanted));
2278 #endif
2279 if(dpt == NULL) {
2280 dpt = get_read_buffer();
2281 buf_ptr = dpt->dpt_buffer;
2282 *bufferp = (vm_offset_t)buf_ptr;
2283 }
2284 /*
2285 * Otherwise, copy the data into the provided buffer (*bufferp)
2286 * and append the rest of the range as it comes in.
2287 */
2288 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2289 buf_ptr += bytes_read;
2290 bytes_wanted -= bytes_read;
2291 records_read = (bytes_read >>
2292 (vm_page_shift - ps->ps_record_shift));
2293 dev_offset += records_read;
2294 DEBUG(DEBUG_VS_INTERNAL,
2295 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2296 dev_buffer, bytes_read));
2297 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2298 != KERN_SUCCESS)
2299 Panic("dealloc buf");
2300 } while (bytes_wanted);
2301
2302 *residualp = size - total_read;
2303 if((dev_buffer != *bufferp) && (total_read != 0)) {
2304 vm_offset_t temp_buffer;
2305 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2306 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2307 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2308 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2309 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2310 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2311 (vm_map_copy_t *)&device_data, FALSE))
2312 panic("ps_read_device: cannot copyin locally provided buffer\n");
2313 }
2314 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2315 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2316 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2317 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2318 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2319 (vm_map_copy_t *)&device_data, FALSE))
2320 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2321 }
2322 else {
2323 device_data = NULL;
2324 }
2325 *bufferp = (vm_offset_t)device_data;
2326
2327 if(dpt != NULL) {
2328 /* Free the receive buffer */
2329 dpt->checked_out = 0;
2330 thread_wakeup(&dpt_array);
2331 }
2332 return KERN_SUCCESS;
2333 }
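
/*
 * Editorial sketch (not part of the original source): the record
 * arithmetic used by ps_read_device()/ps_write_device(), worked through
 * with assumed values -- 4 KB pages and 512-byte device records, so
 * ps_record_shift == log2(4096/512) == 3.
 */
static void
example_device_record_math(void)
{
	const unsigned int page_shift = 12;	/* assumed 4 KB pages */
	const unsigned int record_shift = 3;	/* assumed log2(page/record) */
	unsigned long ps_offset = 0;		/* segment starts at record 0 */
	unsigned long offset = 0x8000;		/* byte offset into the segment */
	unsigned long bytes_read = 0x3000;	/* a partial transfer of 12 KB */

	/* byte offset -> record number: divide by the 512-byte record size */
	unsigned long dev_offset = ps_offset +
		(offset >> (page_shift - record_shift));		/* 0x40 */

	/* bytes transferred -> records consumed, used to advance dev_offset */
	unsigned long records_read = bytes_read >> (page_shift - record_shift);	/* 0x18 */

	(void) dev_offset; (void) records_read;
}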
2334
2335 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2336
2337 kern_return_t
2338 ps_write_device(
2339 paging_segment_t ps,
2340 vm_offset_t offset,
2341 vm_offset_t addr,
2342 unsigned int size,
2343 struct vs_async *vsa)
2344 {
2345 recnum_t dev_offset;
2346 io_buf_len_t bytes_to_write, bytes_written;
2347 recnum_t records_written;
2348 kern_return_t kr;
2349 MACH_PORT_FACE reply_port;
2350 static char here[] = "ps_write_device";
2351
2352
2353
2354 clustered_writes[atop(size)]++;
2355
2356 dev_offset = (ps->ps_offset +
2357 (offset >> (vm_page_shift - ps->ps_record_shift)));
2358 bytes_to_write = size;
2359
2360 if (vsa) {
2361 /*
2362 * Asynchronous write.
2363 */
2364 reply_port = vsa->reply_port;
2365 ip_lock(reply_port);
2366 reply_port->ip_sorights++;
2367 ip_reference(reply_port);
2368 ip_unlock(reply_port);
2369 {
2370 device_t device;
2371 device = dev_port_lookup(ps->ps_device);
2372
2373 vsa->vsa_addr = addr;
2374 kr=ds_device_write_common(device,
2375 reply_port,
2376 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2377 (dev_mode_t) 0,
2378 dev_offset,
2379 (io_buf_ptr_t) addr,
2380 size,
2381 (IO_WRITE | IO_CALL),
2382 &bytes_written);
2383 }
2384 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2385 if (verbose)
2386 dprintf(("%s0x%x, addr=0x%x,"
2387 "size=0x%x,offset=0x%x\n",
2388 "device_write_request returned ",
2389 kr, addr, size, offset));
2390 BS_STAT(ps->ps_bs,
2391 ps->ps_bs->bs_pages_out_fail += atop(size));
2392 /* do the completion notification to free resources */
2393 device_write_reply(reply_port, kr, 0);
2394 return PAGER_ERROR;
2395 }
2396 } else do {
2397 /*
2398 * Synchronous write.
2399 */
2400 {
2401 device_t device;
2402 device = dev_port_lookup(ps->ps_device);
2403 kr=ds_device_write_common(device,
2404 IP_NULL, 0,
2405 (dev_mode_t) 0,
2406 dev_offset,
2407 (io_buf_ptr_t) addr,
2408 size,
2409 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2410 &bytes_written);
2411 }
2412 if (kr != KERN_SUCCESS) {
2413 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2414 "device_write returned ",
2415 kr, addr, size, offset));
2416 BS_STAT(ps->ps_bs,
2417 ps->ps_bs->bs_pages_out_fail += atop(size));
2418 return PAGER_ERROR;
2419 }
2420 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2421 Panic("fragmented write");
2422 records_written = (bytes_written >>
2423 (vm_page_shift - ps->ps_record_shift));
2424 dev_offset += records_written;
2425 #if 1
2426 if (bytes_written != bytes_to_write) {
2427 dprintf(("wrote only %d bytes out of %d\n",
2428 bytes_written, bytes_to_write));
2429 }
2430 #endif
2431 bytes_to_write -= bytes_written;
2432 addr += bytes_written;
2433 } while (bytes_to_write > 0);
2434
2435 return PAGER_SUCCESS;
2436 }
2437
2438
2439 #else /* !DEVICE_PAGING */
2440
2441 kern_return_t
2442 ps_read_device(
2443 paging_segment_t ps,
2444 vm_offset_t offset,
2445 vm_offset_t *bufferp,
2446 unsigned int size,
2447 unsigned int *residualp,
2448 int flags)
2449 {
2450 panic("ps_read_device not supported");
2451 }
2452
2453 kern_return_t ps_write_device(
2454 paging_segment_t ps,
2455 vm_offset_t offset,
2456 vm_offset_t addr,
2457 unsigned int size,
2458 struct vs_async *vsa)
2459 {
2460 panic("ps_write_device not supported");
2461 }
2462
2463 #endif /* DEVICE_PAGING */
2464 void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
2465
2466 void
2467 pvs_object_data_provided(
2468 vstruct_t vs,
2469 upl_t upl,
2470 vm_offset_t offset,
2471 vm_size_t size)
2472 {
2473 static char here[] = "pvs_object_data_provided";
2474
2475 DEBUG(DEBUG_VS_INTERNAL,
2476 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2477 upl, offset, size));
2478
2479 ASSERT(size > 0);
2480 GSTAT(global_stats.gs_pages_in += atop(size));
2481
2482
2483 #if USE_PRECIOUS
2484 ps_clunmap(vs, offset, size);
2485 #endif /* USE_PRECIOUS */
2486
2487 }
2488
2489 kern_return_t
2490 pvs_cluster_read(
2491 vstruct_t vs,
2492 vm_offset_t offset,
2493 vm_size_t cnt)
2494 {
2495 vm_offset_t actual_offset;
2496 vm_offset_t buffer;
2497 paging_segment_t ps;
2498 struct clmap clmap;
2499 upl_t upl;
2500 kern_return_t error = KERN_SUCCESS;
2501 int size, size_wanted, i;
2502 unsigned int residual;
2503 unsigned int request_flags;
2504 int unavail_size;
2505 default_pager_thread_t *dpt;
2506 boolean_t dealloc;
2507 static char here[] = "pvs_cluster_read";
2508
2509 /*
2510 * This loop will be executed once per cluster referenced.
2511 * Typically this means once, since it's unlikely that the
2512 * VM system will ask for anything spanning cluster boundaries.
2513 *
2514 * If there are holes in a cluster (in a paging segment), we stop
2515 * reading at the hole, inform the VM of any data read, inform
2516 * the VM of an unavailable range, then loop again, hoping to
2517 * find valid pages later in the cluster. This continues until
2518 * the entire range has been examined, and read, if present.
2519 */
2520
2521 #if USE_PRECIOUS
2522 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS;
2523 #else
2524 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE ;
2525 #endif
2526 while (cnt && (error == KERN_SUCCESS)) {
2527 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
2528
2529 if (actual_offset == (vm_offset_t) -1) {
2530
2531 /*
2532 * Either a failure due to an error on a previous
2533 * write or a zero fill on demand page. In either case,
2534 * optimize to do one reply for all pages up to next
2535 * cluster boundary.
2536 */
2537 unsigned int local_size, clmask, clsize;
2538
2540 clsize = vm_page_size << vs->vs_clshift;
2541 clmask = clsize - 1;
2542 local_size = clsize - (offset & clmask);
2543 ASSERT(local_size);
2544 local_size = MIN(local_size, cnt);
2545
2546 upl_system_list_request((vm_object_t)
2547 vs->vs_control_port->ip_kobject,
2548 offset, local_size, local_size,
2549 &upl, NULL, 0, request_flags);
2550 if (clmap.cl_error) {
2551 uc_upl_abort(upl, UPL_ABORT_ERROR);
2552 } else {
2553 uc_upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2554 }
2555
2556 cnt -= local_size;
2557 offset += local_size;
2558 continue;
2559 }
2560
2561 /*
2562 * Count up contiguous available or unavailable
2563 * pages.
2564 */
2565 ps = CLMAP_PS(clmap);
2566 ASSERT(ps);
2567 size = 0;
2568 unavail_size = 0;
2569
2570 for (i = 0;
2571 (size < cnt) && (unavail_size < cnt) &&
2572 (i < CLMAP_NPGS(clmap)); i++) {
2573 if (CLMAP_ISSET(clmap, i)) {
2574 if (unavail_size != 0)
2575 break;
2576 size += vm_page_size;
2577 BS_STAT(ps->ps_bs,
2578 ps->ps_bs->bs_pages_in++);
2579 } else {
2580 if (size != 0)
2581 break;
2582 unavail_size += vm_page_size;
2583 }
2584 }
2585 /*
2586 * Let VM system know about holes in clusters.
2587 */
2588 if (size == 0) {
2589 ASSERT(unavail_size);
2590 GSTAT(global_stats.gs_pages_unavail +=
2591 atop(unavail_size));
2592 upl_system_list_request((vm_object_t)
2593 vs->vs_control_port->ip_kobject,
2594 offset, unavail_size,
2595 unavail_size, &upl, NULL, 0,
2596 request_flags);
2597 uc_upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2598 cnt -= unavail_size;
2599 offset += unavail_size;
2600 continue;
2601 }
2602
2603 upl_system_list_request((vm_object_t)
2604 vs->vs_control_port->ip_kobject,
2605 offset, size, size, &upl,
2606 NULL, 0, request_flags | UPL_SET_INTERNAL);
2607 if(ps->ps_segtype == PS_PARTITION) {
2608 /*
2609 error = ps_read_device(ps, actual_offset, upl,
2610 size, &residual, 0);
2611 */
2612 } else {
2613 error = ps_read_file(ps, upl, actual_offset,
2614 size, &residual, 0);
2615 }
2616
2617 /*
2618 * Adjust counts and send response to VM. Optimize for the
2619 * common case, i.e. no error and/or partial data.
2620 * If there was an error, then we need to error the entire
2621 * range, even if some data was successfully read.
2622 * If there was a partial read we may supply some
2623 * data and may error some as well. In all cases the
2624 * VM must receive some notification for every page in the
2625 * range.
2626 */
2627 if ((error == KERN_SUCCESS) && (residual == 0)) {
2628 /*
2629 * Got everything we asked for, supply the data to
2630 * the VM. Note that as a side effect of supplying
2631 * the data, the buffer holding the supplied data is
2632 * deallocated from the pager's address space.
2633 */
2634 pvs_object_data_provided(vs, upl, offset, size);
2635 } else {
2636 size_wanted = size;
2637 if (error == KERN_SUCCESS) {
2638 if (residual == size) {
2639 /*
2640 * If a read operation returns no error
2641 * and no data moved, we turn it into
2642 * an error, assuming we're reading at
2643 * or beyond EOF.
2644 * Fall through and error the entire
2645 * range.
2646 */
2647 error = KERN_FAILURE;
2648 } else {
2649 /*
2650 * Otherwise, we have a partial read. If
2651 * the part read is an integral number
2652 * of pages supply it. Otherwise round
2653 * it up to a page boundary, zero fill
2654 * the unread part, and supply it.
2655 * Fall through and error the remainder
2656 * of the range, if any.
2657 */
2658 int fill, lsize;
2659
2660 fill = residual & (vm_page_size - 1);
2661 lsize = (size - residual) + fill;
2662 pvs_object_data_provided(vs, upl,
2663 offset, lsize);
2664 cnt -= lsize;
2665 offset += lsize;
2666 if (size -= lsize) {
2667 error = KERN_FAILURE;
2668 }
2669 }
2670 }
2671
2672 /*
2673 * If there was an error in any part of the range, tell
2674 * the VM. Deallocate the remainder of the buffer.
2675 * Note that error is explicitly checked again since
2676 * it can be modified above.
2677 */
2678 if (error != KERN_SUCCESS) {
2679 BS_STAT(ps->ps_bs,
2680 ps->ps_bs->bs_pages_in_fail +=
2681 atop(size));
2682 }
2683 }
2684 cnt -= size;
2685 offset += size;
2686
2687 } /* END while (cnt && (error == 0)) */
2688 return error;
2689 }
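
/*
 * Editorial sketch (not part of the original source): the two pieces of
 * arithmetic pvs_cluster_read() leans on, worked through with assumed
 * values (4 KB pages and a cluster shift of 2, i.e. 16 KB clusters).
 */
static void
example_cluster_read_math(void)
{
	const unsigned long page_size = 4096;	/* assumed vm_page_size */
	const unsigned int clshift = 2;		/* assumed vs->vs_clshift */

	/*
	 * 1. Replying for a hole: round the reply out to the next cluster
	 *    boundary, but never past the caller's remaining count.
	 */
	unsigned long clsize = page_size << clshift;		/* 0x4000 */
	unsigned long clmask = clsize - 1;			/* 0x3fff */
	unsigned long offset = 0x9000, cnt = 0x10000;
	unsigned long local_size = clsize - (offset & clmask);	/* 0x4000 - 0x1000 = 0x3000 */
	if (local_size > cnt)
		local_size = cnt;				/* MIN(local_size, cnt) */

	/*
	 * 2. A partial read: supply what was read, rounded up to a page
	 *    boundary, with the tail of the last page zero filled, as the
	 *    comments above describe.
	 */
	unsigned long size = 0x4000, residual = 0x1800;	/* read 10 KB of 16 KB */
	unsigned long fill = residual & (page_size - 1);	/* 0x800 */
	unsigned long lsize = (size - residual) + fill;		/* 0x2800 + 0x800 = 0x3000, page aligned */

	(void) local_size; (void) lsize;
}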
2690
2691 int vs_do_async_write = 1;
2692
2693 kern_return_t
2694 vs_cluster_write(
2695 vstruct_t vs,
2696 upl_t internal_upl,
2697 vm_offset_t offset,
2698 vm_size_t cnt,
2699 boolean_t dp_internal,
2700 int flags)
2701 {
2702 vm_offset_t actual_offset; /* Offset within paging segment */
2703 vm_offset_t size;
2704 vm_offset_t transfer_size;
2705 vm_offset_t subx_size;
2706 int error = 0;
2707 struct clmap clmap;
2708 paging_segment_t ps;
2709 struct vs_async *vsa;
2710 vm_map_copy_t copy;
2711 static char here[] = "vs_cluster_write";
2712
2713 upl_t upl;
2714 upl_page_info_t *page_list;
2715 upl_page_info_t pl[20];
2716 vm_offset_t mobj_base_addr;
2717 vm_offset_t mobj_target_addr;
2718 int mobj_size;
2719 int page_index;
2720 int list_size;
2721 int cl_size;
2722
2723
2724 ps = PAGING_SEGMENT_NULL;
2725
2726 if (!dp_internal) {
2727 int request_flags;
2728 int super_size;
2729 vm_offset_t upl_offset;
2730
2731 cl_size = (1 << vs->vs_clshift) * vm_page_size;
2732
2733 if (bs_low) {
2734 super_size = cl_size;
2735 request_flags = UPL_NOBLOCK |
2736 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2737 UPL_NO_SYNC | UPL_SET_INTERNAL;
2738 } else {
2739 super_size = VM_SUPER_CLUSTER;
2740 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2741 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2742 UPL_NO_SYNC | UPL_SET_INTERNAL;
2743 }
2744
2745
2746 upl_system_list_request((vm_object_t)
2747 vs->vs_control_port->ip_kobject,
2748 offset, cnt, super_size,
2749 &upl, NULL,
2750 0, request_flags);
2751
2752 mobj_base_addr = upl->offset;
2753 list_size = upl->size;
2754
2755 page_list = UPL_GET_INTERNAL_PAGE_LIST(upl);
2756 memcpy(pl, page_list,
2757 sizeof(upl_page_info_t) * (list_size/page_size));
2758
2759 /* Now parcel up the 64k transfer, do at most cluster size */
2760 /* at a time. */
2761 upl_offset = 0;
2762 page_index = 0;
2763 mobj_target_addr = mobj_base_addr;
2764
2765 for (transfer_size = list_size; transfer_size != 0;) {
2766 actual_offset = ps_clmap(vs, mobj_target_addr,
2767 &clmap, CL_ALLOC,
2768 transfer_size < cl_size ?
2769 transfer_size : cl_size, 0);
2770
2771 if (actual_offset == (vm_offset_t) -1) {
2772 for(;transfer_size != 0;) {
2773 if(UPL_PAGE_PRESENT(pl, page_index)) {
2774 uc_upl_abort_range(upl,
2775 upl_offset,
2776 transfer_size,
2777 UPL_ABORT_FREE_ON_EMPTY);
2778 break;
2779 }
2780 transfer_size-=page_size;
2781 upl_offset += vm_page_size;
2782 page_index++;
2783 }
2784 error = 1;
2785 break;
2786 }
2787 cnt = MIN(transfer_size,
2788 CLMAP_NPGS(clmap) * vm_page_size);
2789 ps = CLMAP_PS(clmap);
2790
2791 while (cnt > 0) {
2792 /* attempt to send entire cluster */
2793 subx_size = 0;
2794
2795 while (cnt > 0) {
2796 /* do the biggest contiguous transfer of dirty */
2797 /* pages */
2798 if (UPL_DIRTY_PAGE(pl, page_index) ||
2799 UPL_PRECIOUS_PAGE(pl, page_index)){
2800 page_index++;
2801 subx_size += vm_page_size;
2802 cnt -= vm_page_size;
2803 } else {
2804 if (subx_size == 0) {
2805 actual_offset += vm_page_size;
2806 mobj_target_addr += vm_page_size;
2807
2808 if(UPL_PAGE_PRESENT(pl, page_index)) {
2809 uc_upl_commit_range(upl,
2810 upl_offset,
2811 vm_page_size,
2812 TRUE, pl);
2813 }
2814
2815 upl_offset += vm_page_size;
2816 transfer_size -= vm_page_size;
2817 page_index++;
2818 cnt -= vm_page_size;
2819 } else {
2820 break;
2821 }
2822 }
2823 }
2824 if (subx_size) {
2825
2826 error = ps_write_file(ps, upl, upl_offset,
2827 actual_offset, subx_size, flags);
2828 if (error) {
2829 actual_offset += subx_size;
2830 mobj_target_addr += subx_size;
2831 upl_offset += subx_size;
2832 transfer_size -= subx_size;
2833
2834 for(;transfer_size != 0;) {
2835 if(UPL_PAGE_PRESENT(pl, page_index)) {
2836 uc_upl_abort_range(upl,
2837 upl_offset,
2838 transfer_size,
2839 UPL_ABORT_FREE_ON_EMPTY);
2840 break;
2841 }
2842 transfer_size-=page_size;
2843 upl_offset += vm_page_size;
2844 page_index++;
2845 }
2846 break;
2847 }
2848
2849 ps_vs_write_complete(vs, mobj_target_addr,
2850 subx_size, error);
2851 }
2852 actual_offset += subx_size;
2853 mobj_target_addr += subx_size;
2854 upl_offset += subx_size;
2855
2856 transfer_size -= subx_size;
2857 subx_size = 0;
2858 }
2859 if (error)
2860 break;
2861 }
2862 } else {
2863 assert(cnt <= (vm_page_size << vs->vs_clshift));
cl_size = (1 << vs->vs_clshift) * vm_page_size;
2864 list_size = cnt;
2865
2866 page_index = 0;
2867 /* The caller provides a mapped_data which is derived */
2868 /* from a temporary object. The targeted pages are */
2869 /* guaranteed to be set at offset 0 in the mapped_data */
2870 /* The actual offset however must still be derived */
2871 /* from the offset in the vs in question */
2872 mobj_base_addr = offset;
2873 mobj_target_addr = mobj_base_addr;
2874
2875 for (transfer_size = list_size; transfer_size != 0;) {
2876 actual_offset = ps_clmap(vs, mobj_target_addr,
2877 &clmap, CL_ALLOC,
2878 transfer_size < cl_size ?
2879 transfer_size : cl_size, 0);
2880 if(actual_offset == (vm_offset_t) -1) {
2881 error = 1;
2882 break;
2883 }
2884 cnt = MIN(transfer_size,
2885 CLMAP_NPGS(clmap) * vm_page_size);
2886 ps = CLMAP_PS(clmap);
2887 /* Assume that the caller has given us contiguous */
2888 /* pages */
2889 if(cnt) {
2890 error = ps_write_file(ps, internal_upl,
2891 0, actual_offset,
2892 cnt, flags);
2893 if (error)
2894 break;
2895 ps_vs_write_complete(vs, mobj_target_addr,
2896 cnt, error);
2897 }
2898 if (error)
2899 break;
2900 actual_offset += cnt;
2901 mobj_target_addr += cnt;
2902 transfer_size -= cnt;
2903 cnt = 0;
2904
2905 if (error)
2906 break;
2907 }
2908 }
2909 if(error)
2910 return KERN_FAILURE;
2911 else
2912 return KERN_SUCCESS;
2913 }
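
/*
 * Editorial sketch (not part of the original source): how the 64 KB
 * super-cluster gathered above is parceled into at-most-cluster-size
 * writes, assuming 4 KB pages and a cluster shift of 2 (16 KB clusters).
 * The real loop also skips clean pages and aborts the remainder of the
 * UPL on error; this only shows the size slicing.
 */
static void
example_parcel_super_cluster(void)
{
	const unsigned long page_size = 4096;		/* assumed vm_page_size */
	const unsigned int clshift = 2;			/* assumed vs->vs_clshift */
	unsigned long cl_size = (1UL << clshift) * page_size;	/* 16 KB */
	unsigned long transfer_size = 0x10000;		/* 64 KB super-cluster */
	unsigned int passes = 0;

	while (transfer_size != 0) {
		/* each pass asks ps_clmap()/ps_write_file() for at most one cluster */
		unsigned long chunk = transfer_size < cl_size ? transfer_size : cl_size;

		transfer_size -= chunk;
		passes++;
	}
	/* passes == 4: four 16 KB clusters cover the 64 KB transfer */
	(void) passes;
}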
2914
2915 vm_size_t
2916 ps_vstruct_allocated_size(
2917 vstruct_t vs)
2918 {
2919 int num_pages;
2920 struct vs_map *vsmap;
2921 int i, j, k;
2922
2923 num_pages = 0;
2924 if (vs->vs_indirect) {
2925 /* loop on indirect maps */
2926 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
2927 vsmap = vs->vs_imap[i];
2928 if (vsmap == NULL)
2929 continue;
2930 /* loop on clusters in this indirect map */
2931 for (j = 0; j < CLMAP_ENTRIES; j++) {
2932 if (VSM_ISCLR(vsmap[j]) ||
2933 VSM_ISERR(vsmap[j]))
2934 continue;
2935 /* loop on pages in this cluster */
2936 for (k = 0; k < VSCLSIZE(vs); k++) {
2937 if ((VSM_BMAP(vsmap[j])) & (1 << k))
2938 num_pages++;
2939 }
2940 }
2941 }
2942 } else {
2943 vsmap = vs->vs_dmap;
2944 if (vsmap == NULL)
2945 return 0;
2946 /* loop on clusters in the direct map */
2947 for (j = 0; j < CLMAP_ENTRIES; j++) {
2948 if (VSM_ISCLR(vsmap[j]) ||
2949 VSM_ISERR(vsmap[j]))
2950 continue;
2951 /* loop on pages in this cluster */
2952 for (k = 0; k < VSCLSIZE(vs); k++) {
2953 if ((VSM_BMAP(vsmap[j])) & (1 << k))
2954 num_pages++;
2955 }
2956 }
2957 }
2958
2959 return ptoa(num_pages);
2960 }
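
/*
 * Editorial sketch (not part of the original source): the per-cluster
 * bit test used above, counting allocated pages in one cluster's bitmap.
 * The bitmap word and the pages-per-cluster count are assumed values
 * standing in for VSM_BMAP() and VSCLSIZE().
 */
static int
example_count_cluster_pages(void)
{
	unsigned int bmap = 0xb;		/* pages 0, 1 and 3 written */
	int pages_per_cluster = 4;		/* stand-in for VSCLSIZE(vs) */
	int k, num_pages = 0;

	for (k = 0; k < pages_per_cluster; k++)
		if (bmap & (1 << k))
			num_pages++;
	return num_pages;			/* 3 */
}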
2961
2962 size_t
2963 ps_vstruct_allocated_pages(
2964 vstruct_t vs,
2965 default_pager_page_t *pages,
2966 size_t pages_size)
2967 {
2968 int num_pages;
2969 struct vs_map *vsmap;
2970 vm_offset_t offset;
2971 int i, j, k;
2972
2973 num_pages = 0;
2974 offset = 0;
2975 if (vs->vs_indirect) {
2976 /* loop on indirect maps */
2977 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
2978 vsmap = vs->vs_imap[i];
2979 if (vsmap == NULL) {
2980 offset += (vm_page_size * CLMAP_ENTRIES *
2981 VSCLSIZE(vs));
2982 continue;
2983 }
2984 /* loop on clusters in this indirect map */
2985 for (j = 0; j < CLMAP_ENTRIES; j++) {
2986 if (VSM_ISCLR(vsmap[j]) ||
2987 VSM_ISERR(vsmap[j])) {
2988 offset += vm_page_size * VSCLSIZE(vs);
2989 continue;
2990 }
2991 /* loop on pages in this cluster */
2992 for (k = 0; k < VSCLSIZE(vs); k++) {
2993 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
2994 num_pages++;
2995 if (num_pages < pages_size)
2996 pages++->dpp_offset =
2997 offset;
2998 }
2999 offset += vm_page_size;
3000 }
3001 }
3002 }
3003 } else {
3004 vsmap = vs->vs_dmap;
3005 if (vsmap == NULL)
3006 return 0;
3007 /* loop on clusters in the direct map */
3008 for (j = 0; j < CLMAP_ENTRIES; j++) {
3009 if (VSM_ISCLR(vsmap[j]) ||
3010 VSM_ISERR(vsmap[j])) {
3011 offset += vm_page_size * VSCLSIZE(vs);
3012 continue;
3013 }
3014 /* loop on pages in this cluster */
3015 for (k = 0; k < VSCLSIZE(vs); k++) {
3016 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3017 num_pages++;
3018 if (num_pages < pages_size)
3019 pages++->dpp_offset = offset;
3020 }
3021 offset += vm_page_size;
3022 }
3023 }
3024 }
3025
3026 return num_pages;
3027 }
3028
3029
3030 kern_return_t
3031 ps_vstruct_transfer_from_segment(
3032 vstruct_t vs,
3033 paging_segment_t segment,
3034 #ifndef ubc_sync_working
3035 vm_object_t transfer_object)
3036 #else
3037 upl_t upl)
3038 #endif
3039 {
3040 struct vs_map *vsmap;
3041 struct vs_map old_vsmap;
3042 struct vs_map new_vsmap;
3043 int i, j, k;
3044
3045 VS_LOCK(vs); /* block all work on this vstruct */
3046 /* can't allow the normal multiple write */
3047 /* semantic because writes may conflict */
3048 vs->vs_xfer_pending = TRUE;
3049 vs_wait_for_sync_writers(vs);
3050 vs_start_write(vs);
3051 vs_wait_for_readers(vs);
3052 /* we will unlock the vs to allow other writes while transferring */
3053 /* and will be guaranteed of the persistence of the vs struct */
3054 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3055 /* vs_async_pending */
3056 /* OK we now have guaranteed no other parties are accessing this */
3057 /* vs. Now that we are also supporting simple lock versions of */
3058 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3059 /* our purpose in holding it before was the multiple write case */
3060 /* we now use the boolean xfer_pending to do that. We can use */
3061 /* a boolean instead of a count because we have guaranteed single */
3062 /* file access to this code in its caller */
3063 VS_UNLOCK(vs);
3064 vs_changed:
3065 if (vs->vs_indirect) {
3066 int vsmap_size;
3067 int clmap_off;
3068 /* loop on indirect maps */
3069 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3070 vsmap = vs->vs_imap[i];
3071 if (vsmap == NULL)
3072 continue;
3073 /* loop on clusters in this indirect map */
3074 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3075 VSCLSIZE(vs) * i);
3076 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3077 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3078 else
3079 vsmap_size = CLMAP_ENTRIES;
3080 for (j = 0; j < vsmap_size; j++) {
3081 if (VSM_ISCLR(vsmap[j]) ||
3082 VSM_ISERR(vsmap[j]) ||
3083 (VSM_PS(vsmap[j]) != segment))
3084 continue;
3085 if(vs_cluster_transfer(vs,
3086 (vm_page_size * (j << vs->vs_clshift))
3087 + clmap_off,
3088 vm_page_size << vs->vs_clshift,
3089 #ifndef ubc_sync_working
3090 transfer_object)
3091 #else
3092 upl)
3093 #endif
3094 != KERN_SUCCESS) {
3095 VS_LOCK(vs);
3096 vs->vs_xfer_pending = FALSE;
3097 VS_UNLOCK(vs);
3098 vs_finish_write(vs);
3099 return KERN_FAILURE;
3100 }
3101 /* allow other readers/writers during transfer*/
3102 VS_LOCK(vs);
3103 vs->vs_xfer_pending = FALSE;
3104 VS_UNLOCK(vs);
3105 vs_finish_write(vs);
3106 VS_LOCK(vs);
3107 vs->vs_xfer_pending = TRUE;
3108 VS_UNLOCK(vs);
3109 vs_wait_for_sync_writers(vs);
3110 vs_start_write(vs);
3111 vs_wait_for_readers(vs);
3112 if (!(vs->vs_indirect)) {
3113 goto vs_changed;
3114 }
3115 }
3116 }
3117 } else {
3118 vsmap = vs->vs_dmap;
3119 if (vsmap == NULL) {
3120 VS_LOCK(vs);
3121 vs->vs_xfer_pending = FALSE;
3122 VS_UNLOCK(vs);
3123 vs_finish_write(vs);
3124 return KERN_SUCCESS;
3125 }
3126 /* loop on clusters in the direct map */
3127 for (j = 0; j < vs->vs_size; j++) {
3128 if (VSM_ISCLR(vsmap[j]) ||
3129 VSM_ISERR(vsmap[j]) ||
3130 (VSM_PS(vsmap[j]) != segment))
3131 continue;
3132 if(vs_cluster_transfer(vs,
3133 vm_page_size * (j << vs->vs_clshift),
3134 vm_page_size << vs->vs_clshift,
3135 #ifndef ubc_sync_working
3136 transfer_object) != KERN_SUCCESS) {
3137 #else
3138 upl) != KERN_SUCCESS) {
3139 #endif
3140 VS_LOCK(vs);
3141 vs->vs_xfer_pending = FALSE;
3142 VS_UNLOCK(vs);
3143 vs_finish_write(vs);
3144 return KERN_FAILURE;
3145 }
3146 /* allow other readers/writers during transfer*/
3147 VS_LOCK(vs);
3148 vs->vs_xfer_pending = FALSE;
3149 VS_UNLOCK(vs);
3150 vs_finish_write(vs);
3151 VS_LOCK(vs);
3152 vs->vs_xfer_pending = TRUE;
3153 VS_UNLOCK(vs);
3154 vs_wait_for_sync_writers(vs);
3155 vs_start_write(vs);
3156 vs_wait_for_readers(vs);
3157 if (vs->vs_indirect) {
3158 goto vs_changed;
3159 }
3160 }
3161 }
3162
3163 VS_LOCK(vs);
3164 vs->vs_xfer_pending = FALSE;
3165 VS_UNLOCK(vs);
3166 vs_finish_write(vs);
3167 return KERN_SUCCESS;
3168 }
3169
3170
3171
3172 vs_map_t
3173 vs_get_map_entry(
3174 vstruct_t vs,
3175 vm_offset_t offset)
3176 {
3177 struct vs_map *vsmap;
3178 vm_offset_t cluster;
3179
3180 cluster = atop(offset) >> vs->vs_clshift;
3181 if (vs->vs_indirect) {
3182 long ind_block = cluster/CLMAP_ENTRIES;
3183
3184 /* Is the indirect block allocated? */
3185 vsmap = vs->vs_imap[ind_block];
3186 if(vsmap == (vs_map_t) NULL)
3187 return vsmap;
3188 } else
3189 vsmap = vs->vs_dmap;
3190 vsmap += cluster%CLMAP_ENTRIES;
3191 return vsmap;
3192 }
3193
3194 kern_return_t
3195 vs_cluster_transfer(
3196 vstruct_t vs,
3197 vm_offset_t offset,
3198 vm_size_t cnt,
3199 #ifndef ubc_sync_working
3200 vm_object_t transfer_object)
3201 #else
3202 upl_t upl)
3203 #endif
3204 {
3205 vm_offset_t actual_offset;
3206 paging_segment_t ps;
3207 struct clmap clmap;
3208 kern_return_t error = KERN_SUCCESS;
3209 int size, size_wanted, i;
3210 unsigned int residual;
3211 int unavail_size;
3212 default_pager_thread_t *dpt;
3213 boolean_t dealloc;
3214 struct vs_map *vsmap_ptr;
3215 struct vs_map read_vsmap;
3216 struct vs_map original_read_vsmap;
3217 struct vs_map write_vsmap;
3218 upl_t sync_upl;
3219 #ifndef ubc_sync_working
3220 upl_t upl;
3221 #endif
3222
3223 vm_offset_t ioaddr;
3224
3225 static char here[] = "vs_cluster_transfer";
3226
3227 /* vs_cluster_transfer reads in the pages of a cluster and
3228 * then writes these pages back to new backing store. The
3229 * segment the pages are being read from is assumed to have
3230 * been taken off-line and is no longer considered for new
3231 * space requests.
3232 */
3233
3234 /*
3235 * This loop will be executed once per cluster referenced.
3236 * Typically this means once, since it's unlikely that the
3237 * VM system will ask for anything spanning cluster boundaries.
3238 *
3239 * If there are holes in a cluster (in a paging segment), we stop
3240 * reading at the hole, then loop again, hoping to
3241 * find valid pages later in the cluster. This continues until
3242 * the entire range has been examined, and read, if present. The
3243 * pages are written as they are read. If a failure occurs after
3244 * some pages are written the unmap call at the bottom of the loop
3245 * recovers the backing store and the old backing store remains
3246 * in effect.
3247 */
3248
3249 /* uc_upl_map(kernel_map, upl, &ioaddr); */
3250
3251 VSM_CLR(write_vsmap);
3252 VSM_CLR(original_read_vsmap);
3253 /* grab the actual object's pages to sync with I/O */
3254 while (cnt && (error == KERN_SUCCESS)) {
3255 vsmap_ptr = vs_get_map_entry(vs, offset);
3256 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3257
3258 if (actual_offset == (vm_offset_t) -1) {
3259
3260 /*
3261 * Nothing left to write in this cluster. At least
3262 * set the write cluster information for any previous
3263 * write, and clear it for the next cluster, if there is one.
3264 */
3265 unsigned int local_size, clmask, clsize;
3266
3267 clsize = vm_page_size << vs->vs_clshift;
3268 clmask = clsize - 1;
3269 local_size = clsize - (offset & clmask);
3270 ASSERT(local_size);
3271 local_size = MIN(local_size, cnt);
3272
3273 /* This cluster has no data in it beyond what may */
3274 /* have been found on a previous iteration through */
3275 /* the loop "write_vsmap" */
3276 *vsmap_ptr = write_vsmap;
3277 VSM_CLR(write_vsmap);
3278 VSM_CLR(original_read_vsmap);
3279
3280 cnt -= local_size;
3281 offset += local_size;
3282 continue;
3283 }
3284
3285 /*
3286 * Count up contiguous available or unavailable
3287 * pages.
3288 */
3289 ps = CLMAP_PS(clmap);
3290 ASSERT(ps);
3291 size = 0;
3292 unavail_size = 0;
3293 for (i = 0;
3294 (size < cnt) && (unavail_size < cnt) &&
3295 (i < CLMAP_NPGS(clmap)); i++) {
3296 if (CLMAP_ISSET(clmap, i)) {
3297 if (unavail_size != 0)
3298 break;
3299 size += vm_page_size;
3300 BS_STAT(ps->ps_bs,
3301 ps->ps_bs->bs_pages_in++);
3302 } else {
3303 if (size != 0)
3304 break;
3305 unavail_size += vm_page_size;
3306 }
3307 }
3308
3309 if (size == 0) {
3310 ASSERT(unavail_size);
3311 cnt -= unavail_size;
3312 offset += unavail_size;
3313 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3314 == 0) {
3315 /* There is no more to transfer in this
3316 cluster
3317 */
3318 *vsmap_ptr = write_vsmap;
3319 VSM_CLR(write_vsmap);
3320 VSM_CLR(original_read_vsmap);
3321 }
3322 continue;
3323 }
3324
3325 if(VSM_ISCLR(original_read_vsmap))
3326 original_read_vsmap = *vsmap_ptr;
3327
3328 if(ps->ps_segtype == PS_PARTITION) {
3329 /*
3330 NEED TO BE WITH SYNC & NO COMMIT
3331 error = ps_read_device(ps, actual_offset, &buffer,
3332 size, &residual, flags);
3333 */
3334 } else {
3335 #ifndef ubc_sync_working
3336 error = vm_fault_list_request(transfer_object,
3337 (vm_object_offset_t) (actual_offset & ((vm_page_size << vs->vs_clshift) - 1)),
3338 size, &upl, NULL,
3339 0, UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
3340 | UPL_SET_INTERNAL);
3341 if (error == KERN_SUCCESS) {
3342 error = ps_read_file(ps, upl, actual_offset,
3343 size, &residual, 0);
3344 if(error)
3345 uc_upl_commit(upl, NULL);
3346 }
3347
3348 #else
3349 /* NEED TO BE WITH SYNC & NO COMMIT & NO RDAHEAD*/
3350 error = ps_read_file(ps, upl, actual_offset,
3351 size, &residual,
3352 (UPL_IOSYNC | UPL_NOCOMMIT | UPL_NORDAHEAD));
3353 #endif
3354 }
3355
3356 read_vsmap = *vsmap_ptr;
3357
3358
3359 /*
3360 * Adjust counts and put data in new BS. Optimize for the
3361 * common case, i.e. no error and/or partial data.
3362 * If there was an error, then we need to error the entire
3363 * range, even if some data was successfully read.
3364 *
3365 */
3366 if ((error == KERN_SUCCESS) && (residual == 0)) {
3367 /*
3368 * Got everything we asked for, supply the data to
3369 * the new BS. Note that as a side effect of supplying
3370 * the data, the buffer holding the supplied data is
3371 * deallocated from the pager's address space unless
3372 * the write is unsuccessful.
3373 */
3374
3375 /* note buffer will be cleaned up in all cases by */
3376 /* internal_cluster_write or if an error on write */
3377 /* the vm_map_copy_page_discard call */
3378 *vsmap_ptr = write_vsmap;
3379
3380 #ifndef ubc_sync_working
3381 error = vm_fault_list_request(transfer_object,
3382 (vm_object_offset_t) (actual_offset & ((vm_page_size << vs->vs_clshift) - 1)),
3383 size, &upl, NULL,
3384 0, UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
3385 | UPL_SET_INTERNAL);
3386 if(vs_cluster_write(vs, upl, offset,
3387 size, TRUE, 0) != KERN_SUCCESS) {
3388 uc_upl_commit(upl, NULL);
3389 #else
3390 if(vs_cluster_write(vs, upl, offset,
3391 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3392 #endif
3393 error = KERN_FAILURE;
3394 if(!(VSM_ISCLR(*vsmap_ptr))) {
3395 /* unmap the new backing store object */
3396 ps_clunmap(vs, offset, size);
3397 }
3398 /* original vsmap */
3399 *vsmap_ptr = original_read_vsmap;
3400 VSM_CLR(write_vsmap);
3401 } else {
3402 if((offset + size) &
3403 ((vm_page_size << vs->vs_clshift)
3404 - 1)) {
3405 /* There is more to transfer in this
3406 cluster
3407 */
3408 write_vsmap = *vsmap_ptr;
3409 *vsmap_ptr = read_vsmap;
3410 } else {
3411 /* discard the old backing object */
3412 write_vsmap = *vsmap_ptr;
3413 *vsmap_ptr = read_vsmap;
3414 ps_clunmap(vs, offset, size);
3415 *vsmap_ptr = write_vsmap;
3416 VSM_CLR(write_vsmap);
3417 VSM_CLR(original_read_vsmap);
3418 }
3419 }
3420 } else {
3421 size_wanted = size;
3422 if (error == KERN_SUCCESS) {
3423 if (residual == size) {
3424 /*
3425 * If a read operation returns no error
3426 * and no data moved, we turn it into
3427 * an error, assuming we're reading at
3428 * or beyond EOF.
3429 * Fall through and error the entire
3430 * range.
3431 */
3432 error = KERN_FAILURE;
3433 *vsmap_ptr = write_vsmap;
3434 if(!(VSM_ISCLR(*vsmap_ptr))) {
3435 /* unmap the new backing store object */
3436 ps_clunmap(vs, offset, size);
3437 }
3438 *vsmap_ptr = original_read_vsmap;
3439 VSM_CLR(write_vsmap);
3440 continue;
3441 } else {
3442 /*
3443 * Otherwise, we have a partial read.
3444 * This is also considered an error
3445 * for the purposes of cluster transfer
3446 */
3447 error = KERN_FAILURE;
3448 *vsmap_ptr = write_vsmap;
3449 if(!(VSM_ISCLR(*vsmap_ptr))) {
3450 /* unmap the new backing store object */
3451 ps_clunmap(vs, offset, size);
3452 }
3453 *vsmap_ptr = original_read_vsmap;
3454 VSM_CLR(write_vsmap);
3455 continue;
3456 }
3457 }
3458
3459 }
3460 cnt -= size;
3461 offset += size;
3462
3463 } /* END while (cnt && (error == 0)) */
3464 if(!VSM_ISCLR(write_vsmap))
3465 *vsmap_ptr = write_vsmap;
3466
3467 /* uc_upl_un_map(kernel_map, upl); */
3468 return error;
3469 }
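
/*
 * Editorial sketch (not part of the original source): the save/restore
 * ordering vs_cluster_transfer() relies on so that a failed rewrite
 * leaves the old backing store mapping in effect.  The map type and the
 * outcome flag here are stand-ins; the real function also carries a
 * separate write_vsmap across the pieces of a partially transferred
 * cluster.  Only the commit/back-out ordering is illustrated.
 */
struct example_map { int segment; };		/* stand-in for struct vs_map */

static void
example_transfer_recovery(struct example_map *entry, int write_ok)
{
	struct example_map original = *entry;	/* old backing store mapping */
	struct example_map rewritten;		/* produced by the rewrite */

	rewritten.segment = original.segment + 1;	/* pretend the pages moved */

	if (write_ok)
		*entry = rewritten;	/* commit: the new segment now backs the cluster */
	else
		*entry = original;	/* back out: the old segment still backs it */
}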
3470
3471 kern_return_t
3472 default_pager_add_file(MACH_PORT_FACE backing_store,
3473 int *vp,
3474 int record_size,
3475 long size)
3476 {
3477 backing_store_t bs;
3478 paging_segment_t ps;
3479 int i;
3480 int error;
3481 static char here[] = "default_pager_add_file";
3482
3483 if ((bs = backing_store_lookup(backing_store))
3484 == BACKING_STORE_NULL)
3485 return KERN_INVALID_ARGUMENT;
3486
3487 PSL_LOCK();
3488 for (i = 0; i <= paging_segment_max; i++) {
3489 ps = paging_segments[i];
3490 if (ps == PAGING_SEGMENT_NULL)
3491 continue;
3492 if (ps->ps_segtype != PS_FILE)
3493 continue;
3494
3495 /*
3496 * Check for overlap on same device.
3497 */
3498 if (ps->ps_vnode == (struct vnode *)vp) {
3499 PSL_UNLOCK();
3500 BS_UNLOCK(bs);
3501 return KERN_INVALID_ARGUMENT;
3502 }
3503 }
3504 PSL_UNLOCK();
3505
3506 /*
3507 * Set up the paging segment
3508 */
3509 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3510 if (ps == PAGING_SEGMENT_NULL) {
3511 BS_UNLOCK(bs);
3512 return KERN_RESOURCE_SHORTAGE;
3513 }
3514
3515 ps->ps_segtype = PS_FILE;
3516 ps->ps_vnode = (struct vnode *)vp;
3517 ps->ps_offset = 0;
3518 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3519 ps->ps_recnum = size;
3520 ps->ps_pgnum = size >> ps->ps_record_shift;
3521
3522 ps->ps_pgcount = ps->ps_pgnum;
3523 ps->ps_clshift = local_log2(bs->bs_clsize);
3524 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3525 ps->ps_hint = 0;
3526
3527 PS_LOCK_INIT(ps);
3528 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3529 if (!ps->ps_bmap) {
3530 kfree((vm_offset_t)ps, sizeof *ps);
3531 BS_UNLOCK(bs);
3532 return KERN_RESOURCE_SHORTAGE;
3533 }
3534 for (i = 0; i < ps->ps_ncls; i++) {
3535 clrbit(ps->ps_bmap, i);
3536 }
3537
3538 ps->ps_going_away = FALSE;
3539 ps->ps_bs = bs;
3540
3541 if ((error = ps_enter(ps)) != 0) {
3542 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3543 kfree((vm_offset_t)ps, sizeof *ps);
3544 BS_UNLOCK(bs);
3545 return KERN_RESOURCE_SHORTAGE;
3546 }
3547
3548 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3549 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3550 PSL_LOCK();
3551 dp_pages_free += ps->ps_pgcount;
3552 PSL_UNLOCK();
3553
3554 BS_UNLOCK(bs);
3555
3556 bs_more_space(ps->ps_clcount);
3557
3558 DEBUG(DEBUG_BS_INTERNAL,
3559 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3560 (int) vp, ps->ps_offset, size, record_size,
3561 ps->ps_record_shift, ps->ps_pgnum));
3562
3563 return KERN_SUCCESS;
3564 }
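
/*
 * Editorial sketch (not part of the original source): the sizing
 * arithmetic default_pager_add_file() performs, worked through with
 * assumed values -- 4 KB pages, 512-byte file records, a cluster size
 * of 4 pages, and a 64 MB paging file.
 */
static void
example_paging_file_sizing(void)
{
	const unsigned long page_size = 4096;	/* assumed vm_page_size */
	const unsigned long record_size = 512;	/* assumed file record size */
	const unsigned int clshift = 2;		/* assumed log2(bs_clsize) */
	unsigned long size = 131072;		/* file length in records (64 MB / 512) */

	unsigned int record_shift = 3;		/* local_log2(page_size / record_size) */
	unsigned long pgnum = size >> record_shift;	/* 16384 pages backed by the file */
	unsigned long clcount = pgnum >> clshift;	/* 4096 clusters of 4 pages */
	unsigned long pages_added = clcount << clshift;	/* 16384 pages credited to the backing store */

	(void) page_size; (void) record_size;
	(void) pages_added;
}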
3565
3566
3567
3568 kern_return_t ps_read_file(paging_segment_t, upl_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
3569
3570 kern_return_t
3571 ps_read_file(
3572 paging_segment_t ps,
3573 upl_t upl,
3574 vm_offset_t offset,
3575 unsigned int size,
3576 unsigned int *residualp,
3577 int flags)
3578 {
3579 vm_object_offset_t f_offset;
3580 int error = 0;
3581 int result;
3582 static char here[] = "ps_read_file";
3583
3584
3585 clustered_reads[atop(size)]++;
3586
3587 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3588
3589 /* for transfer case we need to pass uploffset and flags */
3590 error = vnode_pagein(ps->ps_vnode,
3591 upl, (vm_offset_t)0, f_offset, (vm_size_t)size, flags, NULL);
3592
3593 /* The vnode_pagein semantic is somewhat at odds with the existing */
3594 /* device_read semantic. Partial reads are not experienced at this */
3595 /* level. It is up to the bit map code and cluster read code to */
3596 /* check that requested data locations are actually backed, and the */
3597 /* pagein code to either read all of the requested data or return an */
3598 /* error. */
3599
3600 if (error)
3601 result = KERN_FAILURE;
3602 else {
3603 *residualp = 0;
3604 result = KERN_SUCCESS;
3605 }
3606 return result;
3607
3608 }
3609
3610 kern_return_t
3611 ps_write_file(
3612 paging_segment_t ps,
3613 upl_t upl,
3614 vm_offset_t upl_offset,
3615 vm_offset_t offset,
3616 unsigned int size,
3617 int flags)
3618 {
3619 vm_object_offset_t f_offset;
3620 kern_return_t result;
3621 static char here[] = "ps_write_file";
3622
3623 int error = 0;
3624
3625 clustered_writes[atop(size)]++;
3626 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3627
3628 if (vnode_pageout(ps->ps_vnode,
3629 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3630 result = KERN_FAILURE;
3631 else
3632 result = KERN_SUCCESS;
3633
3634 return result;
3635 }
3636
3637 kern_return_t
3638 default_pager_triggers(MACH_PORT_FACE default_pager,
3639 int hi_wat,
3640 int lo_wat,
3641 int flags,
3642 MACH_PORT_FACE trigger_port)
3643 {
3644
3645 if(flags & HI_WAT_ALERT) {
3646 if(min_pages_trigger_port)
3647 ipc_port_release_send(min_pages_trigger_port);
3648 min_pages_trigger_port = trigger_port;
3649 minimum_pages_remaining = hi_wat/vm_page_size;
3650 bs_low = FALSE;
3651 }
3652 if(flags & LO_WAT_ALERT) {
3653 if(max_pages_trigger_port)
3654 ipc_port_release_send(max_pages_trigger_port);
3655 max_pages_trigger_port = trigger_port;
3656 maximum_pages_free = lo_wat/vm_page_size;
3657 }
return KERN_SUCCESS;
3658 }