1/*
2 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25/*
26 * @OSF_COPYRIGHT@
27 */
28/*
29 * Mach Operating System
30 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
31 * All Rights Reserved.
32 *
33 * Permission to use, copy, modify and distribute this software and its
34 * documentation is hereby granted, provided that both the copyright
35 * notice and this permission notice appear in all copies of the
36 * software, derivative works or modified versions, and any portions
37 * thereof, and that both notices appear in supporting documentation.
38 *
39 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
40 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
41 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
42 *
43 * Carnegie Mellon requests users of this software to return to
44 *
45 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
46 * School of Computer Science
47 * Carnegie Mellon University
48 * Pittsburgh PA 15213-3890
49 *
50 * any improvements or extensions that they make and grant Carnegie Mellon
51 * the rights to redistribute these changes.
52 */
53
54/*
55 * Default Pager.
56 * Paging File Management.
57 */
58
59#include <mach/memory_object_control.h>
60#include <mach/memory_object_server.h>
61#include "default_pager_internal.h"
62#include <default_pager/default_pager_alerts.h>
63#include <ipc/ipc_port.h>
64#include <ipc/ipc_space.h>
65#include <kern/queue.h>
66#include <kern/counters.h>
67#include <kern/sched_prim.h>
68#include <vm/vm_kern.h>
69#include <vm/vm_pageout.h>
70/* CDY CDY */
71#include <vm/vm_map.h>
72
73/*
74 * ALLOC_STRIDE... the maximum number of bytes allocated from
75 * a swap file before moving on to the next swap file... if
76 * all swap files reside on a single disk, this value should
77 * be very large (this is the default assumption)... if the
78 * swap files are spread across multiple disks, then this value
79 * should be small (128 * 1024)...
80 *
81 * This should be determined dynamically in the future
82 */
83
84#define ALLOC_STRIDE (1024 * 1024 * 1024)
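/*
 * Worked example (illustrative; assumes 4 KB pages, vm_page_shift == 12,
 * and the default cluster shift of 2): each cluster covers 16 KB, so
 * ps_select_segment() below rotates to the next paging segment after
 * ALLOC_STRIDE >> (2 + 12) == 65536 clusters (1 GB) from the current one.
 */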
85int physical_transfer_cluster_count = 0;
86
87#define VM_SUPER_CLUSTER 0x40000
88#define VM_SUPER_PAGES 64
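/*
 * Relationship between the two constants above (illustrative; assumes the
 * usual 4 KB page size): 0x40000 bytes / 4096 bytes per page == 64 pages,
 * i.e. VM_SUPER_PAGES, so one "super cluster" transfer covers 256 KB.
 */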
89
90/*
91 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
92 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
93 */
94#define VSTRUCT_DEF_CLSHIFT 2
95int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
96int default_pager_clsize = 0;
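/*
 * Worked example of the shift encoding described above: a shift of 0 is
 * 1 page per cluster, 1 is 2 pages, and the default VSTRUCT_DEF_CLSHIFT
 * of 2 is 1 << 2 == 4 pages per cluster (16 KB with 4 KB pages).
 */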
97
98/* statistics */
99unsigned int clustered_writes[VM_SUPER_PAGES+1];
100unsigned int clustered_reads[VM_SUPER_PAGES+1];
101
102/*
103 * Globals used for asynchronous paging operations:
104 * vs_async_list: head of list of to-be-completed I/O ops
105 * async_num_queued: number of pages completed, but not yet
106 * processed by async thread.
107 * async_requests_out: number of pages of requests not completed.
108 */
109
110#if 0
111struct vs_async *vs_async_list;
112int async_num_queued;
113int async_requests_out;
114#endif
115
116
117#define VS_ASYNC_REUSE 1
118struct vs_async *vs_async_free_list;
119
120mutex_t default_pager_async_lock; /* Protects globals above */
121
122
123int vs_alloc_async_failed = 0; /* statistics */
124int vs_alloc_async_count = 0; /* statistics */
125struct vs_async *vs_alloc_async(void); /* forward */
126void vs_free_async(struct vs_async *vsa); /* forward */
127
128
129#define VS_ALLOC_ASYNC() vs_alloc_async()
130#define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
131
132#define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
133#define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
134#define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \
135 ETAP_IO_DEV_PAGEH)
136#define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
137/*
138 * Paging Space Hysteresis triggers and the target notification port
139 *
140 */
141
142unsigned int minimum_pages_remaining = 0;
143unsigned int maximum_pages_free = 0;
144ipc_port_t min_pages_trigger_port = NULL;
145ipc_port_t max_pages_trigger_port = NULL;
146
147boolean_t bs_low = FALSE;
148int backing_store_release_trigger_disable = 0;
149
150
151
152/*
153 * Object sizes are rounded up to the next power of 2,
154 * unless they are bigger than a given maximum size.
155 */
156vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
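/*
 * A minimal sketch of the doubling rule described above (illustrative
 * only, not used by this file; the one-page minimum is an assumption):
 */
#if 0
static vm_size_t
round_object_size(vm_size_t size)
{
	vm_size_t rounded = vm_page_size;	/* assume at least one page */

	if (size > max_doubled_size)
		return size;		/* larger than the cap: not doubled */
	while (rounded < size)
		rounded <<= 1;		/* round up to the next power of 2 */
	return rounded;
}
#endif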
157
158/*
159 * List of all backing store and segments.
160 */
161struct backing_store_list_head backing_store_list;
162paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
163mutex_t paging_segments_lock;
164int paging_segment_max = 0;
165int paging_segment_count = 0;
166int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
167
168
169/*
170 * Total pages free in system
171 * This differs from clusters committed/avail which is a measure of the
172 * over commitment of paging segments to backing store. An idea which is
173 * likely to be deprecated.
174 */
175unsigned int dp_pages_free = 0;
176unsigned int cluster_transfer_minimum = 100;
177
178kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */
179kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
180
181
182default_pager_thread_t *
183get_read_buffer()
184{
185 int i;
186
187 DPT_LOCK(dpt_lock);
188 while(TRUE) {
189 for (i=0; i<default_pager_internal_count; i++) {
190 if(dpt_array[i]->checked_out == FALSE) {
191 dpt_array[i]->checked_out = TRUE;
192 DPT_UNLOCK(dpt_lock);
193 return dpt_array[i];
194 }
195 }
196 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
197 }
198}
199
200void
201bs_initialize(void)
202{
203 int i;
204
205 /*
206 * List of all backing store.
207 */
208 BSL_LOCK_INIT();
209 queue_init(&backing_store_list.bsl_queue);
210 PSL_LOCK_INIT();
211
212 VS_ASYNC_LOCK_INIT();
213#if VS_ASYNC_REUSE
214 vs_async_free_list = NULL;
215#endif /* VS_ASYNC_REUSE */
216
217 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
218 clustered_writes[i] = 0;
219 clustered_reads[i] = 0;
220 }
221
222}
223
224/*
225 * When things do not quite work out...
226 */
227void bs_no_paging_space(boolean_t); /* forward */
228
229void
230bs_no_paging_space(
231 boolean_t out_of_memory)
232{
233
234 if (out_of_memory)
235 dprintf(("*** OUT OF MEMORY ***\n"));
236 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
237}
238
239void bs_more_space(int); /* forward */
240void bs_commit(int); /* forward */
241
242boolean_t user_warned = FALSE;
243unsigned int clusters_committed = 0;
244unsigned int clusters_available = 0;
245unsigned int clusters_committed_peak = 0;
246
247void
248bs_more_space(
249 int nclusters)
250{
251 BSL_LOCK();
252 /*
253 * Account for new paging space.
254 */
255 clusters_available += nclusters;
256
257 if (clusters_available >= clusters_committed) {
258 if (verbose && user_warned) {
259 printf("%s%s - %d excess clusters now.\n",
260 my_name,
261 "paging space is OK now",
262 clusters_available - clusters_committed);
263 user_warned = FALSE;
264 clusters_committed_peak = 0;
265 }
266 } else {
267 if (verbose && user_warned) {
268 printf("%s%s - still short of %d clusters.\n",
269 my_name,
270 "WARNING: paging space over-committed",
271 clusters_committed - clusters_available);
272 clusters_committed_peak -= nclusters;
273 }
274 }
275 BSL_UNLOCK();
276
277 return;
278}
279
280void
281bs_commit(
282 int nclusters)
283{
284 BSL_LOCK();
285 clusters_committed += nclusters;
286 if (clusters_committed > clusters_available) {
287 if (verbose && !user_warned) {
288 user_warned = TRUE;
289 printf("%s%s - short of %d clusters.\n",
290 my_name,
291 "WARNING: paging space over-committed",
292 clusters_committed - clusters_available);
293 }
294 if (clusters_committed > clusters_committed_peak) {
295 clusters_committed_peak = clusters_committed;
296 }
297 } else {
298 if (verbose && user_warned) {
299 printf("%s%s - was short of up to %d clusters.\n",
300 my_name,
301 "paging space is OK now",
302 clusters_committed_peak - clusters_available);
303 user_warned = FALSE;
304 clusters_committed_peak = 0;
305 }
306 }
307 BSL_UNLOCK();
308
309 return;
310}
311
312int default_pager_info_verbose = 1;
313
314void
315bs_global_info(
316 vm_size_t *totalp,
317 vm_size_t *freep)
318{
319 vm_size_t pages_total, pages_free;
320 paging_segment_t ps;
321 int i;
322
323 PSL_LOCK();
324 pages_total = pages_free = 0;
325 for (i = 0; i <= paging_segment_max; i++) {
326 ps = paging_segments[i];
327 if (ps == PAGING_SEGMENT_NULL)
328 continue;
329
330 /*
331 * no need to lock: by the time this data
332 * gets back to any remote requestor it
333 * will be obsolete anyways
334 */
335 pages_total += ps->ps_pgnum;
336 pages_free += ps->ps_clcount << ps->ps_clshift;
337 DEBUG(DEBUG_BS_INTERNAL,
338 ("segment #%d: %d total, %d free\n",
339 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
340 }
341 *totalp = pages_total;
342 *freep = pages_free;
343 if (verbose && user_warned && default_pager_info_verbose) {
344 if (clusters_available < clusters_committed) {
345 printf("%s %d clusters committed, %d available.\n",
346 my_name,
347 clusters_committed,
348 clusters_available);
349 }
350 }
351 PSL_UNLOCK();
352}
353
354backing_store_t backing_store_alloc(void); /* forward */
355
356backing_store_t
357backing_store_alloc(void)
358{
359 backing_store_t bs;
360
361 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
362 if (bs == BACKING_STORE_NULL)
363 panic("backing_store_alloc: no memory");
364
365 BS_LOCK_INIT(bs);
366 bs->bs_port = MACH_PORT_NULL;
367 bs->bs_priority = 0;
368 bs->bs_clsize = 0;
369 bs->bs_pages_total = 0;
370 bs->bs_pages_in = 0;
371 bs->bs_pages_in_fail = 0;
372 bs->bs_pages_out = 0;
373 bs->bs_pages_out_fail = 0;
374
375 return bs;
376}
377
378backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
379
380/* Even in both the component space and external versions of this pager, */
381/* backing_store_lookup will be called from tasks in the application space */
382backing_store_t
383backing_store_lookup(
384 MACH_PORT_FACE port)
385{
386 backing_store_t bs;
387
388/*
389 The port is currently backed with a vs structure in the alias field;
390 we could create an ISBS alias and a port_is_bs call, but frankly
391 I see no reason for the test: the bs->port == port check below
392 will work properly on junk entries.
393
394 if ((port == MACH_PORT_NULL) || port_is_vs(port))
395*/
396 if ((port == MACH_PORT_NULL))
397 return BACKING_STORE_NULL;
398
399 BSL_LOCK();
400 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
401 bs_links) {
402 BS_LOCK(bs);
403 if (bs->bs_port == port) {
404 BSL_UNLOCK();
405 /* Success, return it locked. */
406 return bs;
407 }
408 BS_UNLOCK(bs);
409 }
410 BSL_UNLOCK();
411 return BACKING_STORE_NULL;
412}
413
414void backing_store_add(backing_store_t); /* forward */
415
416void
417backing_store_add(
418 backing_store_t bs)
419{
420 MACH_PORT_FACE port = bs->bs_port;
421 MACH_PORT_FACE pset = default_pager_default_set;
422 kern_return_t kr = KERN_SUCCESS;
423
424 if (kr != KERN_SUCCESS)
425 panic("backing_store_add: add to set");
426
427}
428
429/*
430 * Set up default page shift, but only if not already
431 * set and argument is within range.
432 */
433boolean_t
434bs_set_default_clsize(unsigned int npages)
435{
436 switch(npages){
437 case 1:
438 case 2:
439 case 4:
440 case 8:
441 if (default_pager_clsize == 0) /* if not yet set */
442 vstruct_def_clshift = local_log2(npages);
443 return(TRUE);
444 }
445 return(FALSE);
446}
447
448int bs_get_global_clsize(int clsize); /* forward */
449
450int
451bs_get_global_clsize(
452 int clsize)
453{
454 int i;
455 memory_object_default_t dmm;
456 kern_return_t kr;
457
458 /*
459 * Only allow setting of cluster size once. If called
460 * with no cluster size (default), we use the compiled-in default
461 * for the duration. The same cluster size is used for all
462 * paging segments.
463 */
464 if (default_pager_clsize == 0) {
465 /*
466 * Keep cluster size in bit shift because it's quicker
467 * arithmetic, and easier to keep at a power of 2.
468 */
469 if (clsize != NO_CLSIZE) {
470 for (i = 0; (1 << i) < clsize; i++);
471 if (i > MAX_CLUSTER_SHIFT)
472 i = MAX_CLUSTER_SHIFT;
473 vstruct_def_clshift = i;
474 }
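/*
 * Worked example for the loop above (illustrative): a caller passing
 * clsize == 3 leaves the loop at i == 2, since (1 << 2) >= 3, so the
 * cluster size is rounded up to the next power of two (4 pages);
 * larger requests are clamped to MAX_CLUSTER_SHIFT.
 */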
475 default_pager_clsize = (1 << vstruct_def_clshift);
476
477 /*
478 * Let the user know the new (and definitive) cluster size.
479 */
480 if (verbose)
481 printf("%scluster size = %d page%s\n",
482 my_name, default_pager_clsize,
483 (default_pager_clsize == 1) ? "" : "s");
484
485 /*
486 * Let the kernel know too, in case it hasn't used the
487 * default value provided in main() yet.
488 */
489 dmm = default_pager_object;
490 clsize = default_pager_clsize * vm_page_size; /* in bytes */
491 kr = host_default_memory_manager(host_priv_self(),
492 &dmm,
493 clsize);
494 memory_object_default_deallocate(dmm);
495
496 if (kr != KERN_SUCCESS) {
497 panic("bs_get_global_cl_size:host_default_memory_manager");
498 }
499 if (dmm != default_pager_object) {
500 panic("bs_get_global_cl_size:there is another default pager");
501 }
502 }
503 ASSERT(default_pager_clsize > 0 &&
504 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
505
506 return default_pager_clsize;
507}
508
509kern_return_t
510default_pager_backing_store_create(
511 memory_object_default_t pager,
512 int priority,
513 int clsize, /* in bytes */
514 MACH_PORT_FACE *backing_store)
515{
516 backing_store_t bs;
517 MACH_PORT_FACE port;
518 kern_return_t kr;
519 struct vstruct_alias *alias_struct;
520
521 if (pager != default_pager_object)
522 return KERN_INVALID_ARGUMENT;
523
524 bs = backing_store_alloc();
525 port = ipc_port_alloc_kernel();
526 ipc_port_make_send(port);
527 assert (port != IP_NULL);
528
529 DEBUG(DEBUG_BS_EXTERNAL,
530 ("priority=%d clsize=%d bs_port=0x%x\n",
531 priority, clsize, (int) backing_store));
532
533 alias_struct = (struct vstruct_alias *)
534 kalloc(sizeof (struct vstruct_alias));
535 if(alias_struct != NULL) {
536 alias_struct->vs = (struct vstruct *)bs;
537 alias_struct->name = ISVS;
538 port->alias = (int) alias_struct;
539 }
540 else {
541 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
542 kfree((vm_offset_t)bs, sizeof (struct backing_store));
543 return KERN_RESOURCE_SHORTAGE;
544 }
545
546 bs->bs_port = port;
547 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
548 priority = BS_MAXPRI;
549 else if (priority == BS_NOPRI)
550 priority = BS_MAXPRI;
551 else
552 priority = BS_MINPRI;
553 bs->bs_priority = priority;
554
555 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
556
557 BSL_LOCK();
558 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
559 bs_links);
560 BSL_UNLOCK();
561
562 backing_store_add(bs);
563
564 *backing_store = port;
565 return KERN_SUCCESS;
566}
567
568kern_return_t
569default_pager_backing_store_info(
570 MACH_PORT_FACE backing_store,
571 backing_store_flavor_t flavour,
572 backing_store_info_t info,
573 mach_msg_type_number_t *size)
574{
575 backing_store_t bs;
576 backing_store_basic_info_t basic;
577 int i;
578 paging_segment_t ps;
579
580 if (flavour != BACKING_STORE_BASIC_INFO ||
581 *size < BACKING_STORE_BASIC_INFO_COUNT)
582 return KERN_INVALID_ARGUMENT;
583
584 basic = (backing_store_basic_info_t)info;
585 *size = BACKING_STORE_BASIC_INFO_COUNT;
586
587 VSTATS_LOCK(&global_stats.gs_lock);
588 basic->pageout_calls = global_stats.gs_pageout_calls;
589 basic->pagein_calls = global_stats.gs_pagein_calls;
590 basic->pages_in = global_stats.gs_pages_in;
591 basic->pages_out = global_stats.gs_pages_out;
592 basic->pages_unavail = global_stats.gs_pages_unavail;
593 basic->pages_init = global_stats.gs_pages_init;
594 basic->pages_init_writes= global_stats.gs_pages_init_writes;
595 VSTATS_UNLOCK(&global_stats.gs_lock);
596
597 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
598 return KERN_INVALID_ARGUMENT;
599
600 basic->bs_pages_total = bs->bs_pages_total;
601 PSL_LOCK();
602 bs->bs_pages_free = 0;
603 for (i = 0; i <= paging_segment_max; i++) {
604 ps = paging_segments[i];
605 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
606 PS_LOCK(ps);
607 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
608 PS_UNLOCK(ps);
609 }
610 }
611 PSL_UNLOCK();
612 basic->bs_pages_free = bs->bs_pages_free;
613 basic->bs_pages_in = bs->bs_pages_in;
614 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
615 basic->bs_pages_out = bs->bs_pages_out;
616 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
617
618 basic->bs_priority = bs->bs_priority;
619 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
620
621 BS_UNLOCK(bs);
622
623 return KERN_SUCCESS;
624}
625
626int ps_delete(paging_segment_t); /* forward */
627
628int
629ps_delete(
630 paging_segment_t ps)
631{
632 vstruct_t vs;
633 kern_return_t error = KERN_SUCCESS;
634 int vs_count;
635
636 VSL_LOCK(); /* get the lock on the list of vs's */
637
638 /* The lock relationship and sequence are fairly complicated; */
639 /* this code looks at a live list, locking and unlocking the list */
640 /* as it traverses it. It depends on the locking behavior of */
641 /* default_pager_no_senders. no_senders always locks the vstruct */
642 /* targeted for removal before locking the vstruct list. However */
643 /* it will remove that member of the list without locking its */
644 /* neighbors. We can be sure when we hold a lock on a vstruct */
645 /* it cannot be removed from the list but we must hold the list */
646 /* lock to be sure that its pointers to its neighbors are valid. */
647 /* Also, we can hold off destruction of a vstruct when the list */
648 /* lock and the vs locks are not being held by bumping the */
649 /* vs_async_pending count. */
650
651
652 while(backing_store_release_trigger_disable != 0) {
653 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
654 }
655
656 /* we will choose instead to hold a send right */
657 vs_count = vstruct_list.vsl_count;
658 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
659 if(vs == (vstruct_t)&vstruct_list) {
660 VSL_UNLOCK();
661 return KERN_SUCCESS;
662 }
663 VS_LOCK(vs);
664 vs_async_wait(vs); /* wait for any pending async writes */
665 if ((vs_count != 0) && (vs != NULL))
666 vs->vs_async_pending += 1; /* hold parties calling */
667 /* vs_async_wait */
668 VS_UNLOCK(vs);
669 VSL_UNLOCK();
670 while((vs_count != 0) && (vs != NULL)) {
671 /* We take the count of AMO's before beginning the */
672 /* transfer of the target segment. */
673 /* We are guaranteed that the target segment cannot get */
674 /* more users. We also know that queue entries are */
675 /* made at the back of the list. If some of the entries */
676 /* we would check disappear while we are traversing the */
677 /* list then we will either check new entries which */
678 /* do not have any backing store in the target segment */
679 /* or re-check old entries. This might not be optimal */
680 /* but it will always be correct. The alternative is to */
681 /* take a snapshot of the list. */
682 vstruct_t next_vs;
683
684 if(dp_pages_free < cluster_transfer_minimum)
685 error = KERN_FAILURE;
686 else {
687 vm_object_t transfer_object;
688 int count;
689 upl_t upl;
690
691 transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
692 count = 0;
693 error = vm_object_upl_request(transfer_object,
694 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
695 &upl, NULL, &count,
696 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
697 | UPL_SET_INTERNAL);
698 if(error == KERN_SUCCESS) {
699 error = ps_vstruct_transfer_from_segment(
700 vs, ps, upl);
701 upl_commit(upl, NULL);
702 upl_deallocate(upl);
703 } else {
704 error = KERN_FAILURE;
705 }
706 vm_object_deallocate(transfer_object);
707 }
708 if(error) {
709 VS_LOCK(vs);
710 vs->vs_async_pending -= 1; /* release vs_async_wait */
711 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
712 vs->vs_waiting_async = FALSE;
713 VS_UNLOCK(vs);
714 thread_wakeup(&vs->vs_async_pending);
715 } else {
716 VS_UNLOCK(vs);
717 }
718 return KERN_FAILURE;
719 }
720
721 VSL_LOCK();
722
723 while(backing_store_release_trigger_disable != 0) {
724 VSL_SLEEP(&backing_store_release_trigger_disable,
725 THREAD_UNINT);
726 }
727
728 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
729 if((next_vs != (vstruct_t)&vstruct_list) &&
730 (vs != next_vs) && (vs_count != 1)) {
731 VS_LOCK(next_vs);
732 vs_async_wait(next_vs); /* wait for any */
733 /* pending async writes */
734 next_vs->vs_async_pending += 1; /* hold parties */
735 /* calling vs_async_wait */
736 VS_UNLOCK(next_vs);
737 }
738 VSL_UNLOCK();
739 VS_LOCK(vs);
740 vs->vs_async_pending -= 1;
741 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
742 vs->vs_waiting_async = FALSE;
743 VS_UNLOCK(vs);
744 thread_wakeup(&vs->vs_async_pending);
745 } else {
746 VS_UNLOCK(vs);
747 }
748 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
749 vs = NULL;
750 else
751 vs = next_vs;
752 vs_count--;
753 }
754 return KERN_SUCCESS;
755}
756
757
758kern_return_t
759default_pager_backing_store_delete(
760 MACH_PORT_FACE backing_store)
761{
762 backing_store_t bs;
763 int i;
764 paging_segment_t ps;
765 int error;
766 int interim_pages_removed = 0;
767 kern_return_t kr;
768
769 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
770 return KERN_INVALID_ARGUMENT;
771
772#if 0
773 /* not implemented */
774 BS_UNLOCK(bs);
775 return KERN_FAILURE;
776#endif
777
778 restart:
779 PSL_LOCK();
780 error = KERN_SUCCESS;
781 for (i = 0; i <= paging_segment_max; i++) {
782 ps = paging_segments[i];
783 if (ps != PAGING_SEGMENT_NULL &&
784 ps->ps_bs == bs &&
785 ! ps->ps_going_away) {
786 PS_LOCK(ps);
787 /* disable access to this segment */
788 ps->ps_going_away = TRUE;
789 PS_UNLOCK(ps);
790 /*
791 * The "ps" segment is "off-line" now,
792 * we can try and delete it...
793 */
794 if(dp_pages_free < (cluster_transfer_minimum
795 + ps->ps_pgcount)) {
796 error = KERN_FAILURE;
797 PSL_UNLOCK();
798 }
799 else {
800 /* remove all pages associated with the */
801 /* segment from the list of free pages */
802 /* when transfer is through, all target */
803 /* segment pages will appear to be free */
804
805 dp_pages_free -= ps->ps_pgcount;
806 interim_pages_removed += ps->ps_pgcount;
807 PSL_UNLOCK();
808 error = ps_delete(ps);
809 }
810 if (error != KERN_SUCCESS) {
811 /*
812 * We couldn't delete the segment,
813 * probably because there's not enough
814 * virtual memory left.
815 * Re-enable all the segments.
816 */
817 PSL_LOCK();
818 break;
819 }
820 goto restart;
821 }
822 }
823
824 if (error != KERN_SUCCESS) {
825 for (i = 0; i <= paging_segment_max; i++) {
826 ps = paging_segments[i];
827 if (ps != PAGING_SEGMENT_NULL &&
828 ps->ps_bs == bs &&
829 ps->ps_going_away) {
830 PS_LOCK(ps);
831 /* re-enable access to this segment */
832 ps->ps_going_away = FALSE;
833 PS_UNLOCK(ps);
834 }
835 }
836 dp_pages_free += interim_pages_removed;
837 PSL_UNLOCK();
838 BS_UNLOCK(bs);
839 return error;
840 }
841
842 for (i = 0; i <= paging_segment_max; i++) {
843 ps = paging_segments[i];
844 if (ps != PAGING_SEGMENT_NULL &&
845 ps->ps_bs == bs) {
846 if(ps->ps_going_away) {
847 paging_segments[i] = PAGING_SEGMENT_NULL;
848 paging_segment_count--;
849 PS_LOCK(ps);
850 kfree((vm_offset_t)ps->ps_bmap,
851 RMAPSIZE(ps->ps_ncls));
852 kfree((vm_offset_t)ps, sizeof *ps);
853 }
854 }
855 }
856
857 /* Scan the entire ps array separately to make certain we find the */
858 /* proper paging_segment_max */
859 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
860 if(paging_segments[i] != PAGING_SEGMENT_NULL)
861 paging_segment_max = i;
862 }
863
864 PSL_UNLOCK();
865
866 /*
867 * All the segments have been deleted.
868 * We can remove the backing store.
869 */
870
871 /*
872 * Disable lookups of this backing store.
873 */
874 if((void *)bs->bs_port->alias != NULL)
875 kfree((vm_offset_t) bs->bs_port->alias,
876 sizeof (struct vstruct_alias));
877 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
878 bs->bs_port = MACH_PORT_NULL;
879 BS_UNLOCK(bs);
880
881 /*
882 * Remove backing store from backing_store list.
883 */
884 BSL_LOCK();
885 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
886 bs_links);
887 BSL_UNLOCK();
888
889 /*
890 * Free the backing store structure.
891 */
892 kfree((vm_offset_t)bs, sizeof *bs);
893
894 return KERN_SUCCESS;
895}
896
897int ps_enter(paging_segment_t); /* forward */
898
899int
900ps_enter(
901 paging_segment_t ps)
902{
903 int i;
904
905 PSL_LOCK();
906
907 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
908 if (paging_segments[i] == PAGING_SEGMENT_NULL)
909 break;
910 }
911
912 if (i < MAX_NUM_PAGING_SEGMENTS) {
913 paging_segments[i] = ps;
914 if (i > paging_segment_max)
915 paging_segment_max = i;
916 paging_segment_count++;
917 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
918 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
919 ps_select_array[ps->ps_bs->bs_priority] = 0;
920 i = 0;
921 } else {
922 PSL_UNLOCK();
923 return KERN_RESOURCE_SHORTAGE;
924 }
925
926 PSL_UNLOCK();
927 return i;
928}
929
930#ifdef DEVICE_PAGING
931kern_return_t
932default_pager_add_segment(
933 MACH_PORT_FACE backing_store,
934 MACH_PORT_FACE device,
935 recnum_t offset,
936 recnum_t count,
937 int record_size)
938{
939 backing_store_t bs;
940 paging_segment_t ps;
941 int i;
942 int error;
943
944 if ((bs = backing_store_lookup(backing_store))
945 == BACKING_STORE_NULL)
946 return KERN_INVALID_ARGUMENT;
947
948 PSL_LOCK();
949 for (i = 0; i <= paging_segment_max; i++) {
950 ps = paging_segments[i];
951 if (ps == PAGING_SEGMENT_NULL)
952 continue;
953
954 /*
955 * Check for overlap on same device.
956 */
957 if (!(ps->ps_device != device
958 || offset >= ps->ps_offset + ps->ps_recnum
959 || offset + count <= ps->ps_offset)) {
960 PSL_UNLOCK();
961 BS_UNLOCK(bs);
962 return KERN_INVALID_ARGUMENT;
963 }
964 }
965 PSL_UNLOCK();
966
967 /*
968 * Set up the paging segment
969 */
970 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
971 if (ps == PAGING_SEGMENT_NULL) {
972 BS_UNLOCK(bs);
973 return KERN_RESOURCE_SHORTAGE;
974 }
975
976 ps->ps_segtype = PS_PARTITION;
977 ps->ps_device = device;
978 ps->ps_offset = offset;
979 ps->ps_record_shift = local_log2(vm_page_size / record_size);
980 ps->ps_recnum = count;
981 ps->ps_pgnum = count >> ps->ps_record_shift;
982
983 ps->ps_pgcount = ps->ps_pgnum;
984 ps->ps_clshift = local_log2(bs->bs_clsize);
985 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
986 ps->ps_hint = 0;
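/*
 * Illustrative numbers for the fields above (hypothetical device; assumes
 * 4 KB pages): with record_size == 512, ps_record_shift is 3 (8 records
 * per page), so a partition of 1048576 records holds 131072 pages; with a
 * cluster shift of 2 that is 32768 four-page clusters.
 */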
987
988 PS_LOCK_INIT(ps);
989 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
990 if (!ps->ps_bmap) {
991 kfree((vm_offset_t)ps, sizeof *ps);
992 BS_UNLOCK(bs);
993 return KERN_RESOURCE_SHORTAGE;
994 }
995 for (i = 0; i < ps->ps_ncls; i++) {
996 clrbit(ps->ps_bmap, i);
997 }
998
999 ps->ps_going_away = FALSE;
1000 ps->ps_bs = bs;
1001
1002 if ((error = ps_enter(ps)) != 0) {
1003 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1004 kfree((vm_offset_t)ps, sizeof *ps);
1005 BS_UNLOCK(bs);
1006 return KERN_RESOURCE_SHORTAGE;
1007 }
1008
1009 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1010 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1011 BS_UNLOCK(bs);
1012
1013 PSL_LOCK();
1014 dp_pages_free += ps->ps_pgcount;
1015 PSL_UNLOCK();
1016
1017 bs_more_space(ps->ps_clcount);
1018
1019 DEBUG(DEBUG_BS_INTERNAL,
1020 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1021 device, offset, count, record_size,
1022 ps->ps_record_shift, ps->ps_pgnum));
1023
1024 return KERN_SUCCESS;
1025}
1026
1027boolean_t
1028bs_add_device(
1029 char *dev_name,
1030 MACH_PORT_FACE master)
1031{
1032 security_token_t null_security_token = {
1033 { 0, 0 }
1034 };
1035 MACH_PORT_FACE device;
1036 int info[DEV_GET_SIZE_COUNT];
1037 mach_msg_type_number_t info_count;
1038 MACH_PORT_FACE bs = MACH_PORT_NULL;
1039 unsigned int rec_size;
1040 recnum_t count;
1041 int clsize;
1042 MACH_PORT_FACE reply_port;
1043
1044 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1045 null_security_token, dev_name, &device))
1046 return FALSE;
1047
1048 info_count = DEV_GET_SIZE_COUNT;
1049 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1050 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1051 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1052 clsize = bs_get_global_clsize(0);
1053 if (!default_pager_backing_store_create(
1054 default_pager_object,
1055 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1056 (clsize * vm_page_size),
1057 &bs)) {
1058 if (!default_pager_add_segment(bs, device,
1059 0, count, rec_size)) {
1060 return TRUE;
1061 }
1062 ipc_port_release_receive(bs);
1063 }
1064 }
1065
1066 ipc_port_release_send(device);
1067 return FALSE;
1068}
1069#endif /* DEVICE_PAGING */
1070
1071#if VS_ASYNC_REUSE
1072
1073struct vs_async *
1074vs_alloc_async(void)
1075{
1076 struct vs_async *vsa;
1077 MACH_PORT_FACE reply_port;
1078 kern_return_t kr;
1079
1080 VS_ASYNC_LOCK();
1081 if (vs_async_free_list == NULL) {
1082 VS_ASYNC_UNLOCK();
1083 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1084 if (vsa != NULL) {
1085 /*
1086 * Try allocating a reply port named after the
1087 * address of the vs_async structure.
1088 */
1089 struct vstruct_alias *alias_struct;
1090
1091 reply_port = ipc_port_alloc_kernel();
1092 alias_struct = (struct vstruct_alias *)
1093 kalloc(sizeof (struct vstruct_alias));
1094 if(alias_struct != NULL) {
1095 alias_struct->vs = (struct vstruct *)vsa;
1096 alias_struct->name = ISVS;
1097 reply_port->alias = (int) alias_struct;
1098 vsa->reply_port = reply_port;
1099 vs_alloc_async_count++;
1100 }
1101 else {
1102 vs_alloc_async_failed++;
1103 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1104 (reply_port));
1105 kfree((vm_offset_t)vsa,
1106 sizeof (struct vs_async));
1107 vsa = NULL;
1108 }
1109 }
1110 } else {
1111 vsa = vs_async_free_list;
1112 vs_async_free_list = vs_async_free_list->vsa_next;
1113 VS_ASYNC_UNLOCK();
1114 }
1115
1116 return vsa;
1117}
1118
1119void
1120vs_free_async(
1121 struct vs_async *vsa)
1122{
1123 VS_ASYNC_LOCK();
1124 vsa->vsa_next = vs_async_free_list;
1125 vs_async_free_list = vsa;
1126 VS_ASYNC_UNLOCK();
1127}
1128
1129#else /* VS_ASYNC_REUSE */
1130
1131struct vs_async *
1132vs_alloc_async(void)
1133{
1134 struct vs_async *vsa;
1135 MACH_PORT_FACE reply_port;
1136 kern_return_t kr;
1137
1138 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1139 if (vsa != NULL) {
1140 /*
1141 * Try allocating a reply port named after the
1142 * address of the vs_async structure.
1143 */
1144 reply_port = ipc_port_alloc_kernel();
1145 alias_struct = (vstruct_alias *)
1146 kalloc(sizeof (struct vstruct_alias));
1147 if(alias_struct != NULL) {
1148 alias_struct->vs = reply_port;
1149 alias_struct->name = ISVS;
1150 reply_port->alias = (int) vsa;
1151 vsa->reply_port = reply_port;
1152 vs_alloc_async_count++;
1153 }
1154 else {
1155 vs_alloc_async_failed++;
1156 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1157 (reply_port));
1158 kfree((vm_offset_t) vsa,
1159 sizeof (struct vs_async));
1160 vsa = NULL;
1161 }
1162 }
1163
1164 return vsa;
1165}
1166
1167void
1168vs_free_async(
1169 struct vs_async *vsa)
1170{
1171 MACH_PORT_FACE reply_port;
1172 kern_return_t kr;
1173
1174 reply_port = vsa->reply_port;
1175 kfree((vm_offset_t) reply_port->alias, sizeof (struct vstruct_alias));
1176 kfree((vm_offset_t) vsa, sizeof (struct vs_async));
1177 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1178#if 0
1179 VS_ASYNC_LOCK();
1180 vs_alloc_async_count--;
1181 VS_ASYNC_UNLOCK();
1182#endif
1183}
1184
1185#endif /* VS_ASYNC_REUSE */
1186
1187zone_t vstruct_zone;
1188
1189vstruct_t
1190ps_vstruct_create(
1191 vm_size_t size)
1192{
1193 vstruct_t vs;
1194 int i;
1195
1196 vs = (vstruct_t) zalloc(vstruct_zone);
1197 if (vs == VSTRUCT_NULL) {
1198 return VSTRUCT_NULL;
1199 }
1200
1201 VS_LOCK_INIT(vs);
1202
1203 /*
1204 * The following fields will be provided later.
1205 */
1206 vs->vs_mem_obj = NULL;
1207 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1208 vs->vs_references = 1;
1209 vs->vs_seqno = 0;
1210
1211#ifdef MACH_KERNEL
1212 vs->vs_waiting_seqno = FALSE;
1213 vs->vs_waiting_read = FALSE;
1214 vs->vs_waiting_write = FALSE;
1215 vs->vs_waiting_async = FALSE;
1216#else
1217 mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
1218 mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
1219 mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
1220 mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
1221 mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
1222#endif
1223
1224 vs->vs_readers = 0;
1225 vs->vs_writers = 0;
1226
1227 vs->vs_errors = 0;
1228
1229 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1230 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
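/*
 * Worked example for the size computation above (illustrative; assumes
 * 4 KB pages and a cluster shift of 2): a 1 MB object is 256 pages, so
 * vs_size == ((256 - 1) >> 2) + 1 == 64 clusters; the "- 1 ... + 1"
 * rounds a partial final cluster up instead of truncating it.
 */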
1231 vs->vs_async_pending = 0;
1232
1233 /*
1234 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1235 * depending on the size of the memory object.
1236 */
1237 if (INDIRECT_CLMAP(vs->vs_size)) {
1238 vs->vs_imap = (struct vs_map **)
1239 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1240 vs->vs_indirect = TRUE;
1241 } else {
1242 vs->vs_dmap = (struct vs_map *)
1243 kalloc(CLMAP_SIZE(vs->vs_size));
1244 vs->vs_indirect = FALSE;
1245 }
1246 vs->vs_xfer_pending = FALSE;
1247 DEBUG(DEBUG_VS_INTERNAL,
1248 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1249
1250 /*
1251 * Check to see that we got the space.
1252 */
1253 if (!vs->vs_dmap) {
1254 kfree((vm_offset_t)vs, sizeof *vs);
1255 return VSTRUCT_NULL;
1256 }
1257
1258 /*
1259 * Zero the indirect pointers, or clear the direct pointers.
1260 */
1261 if (vs->vs_indirect)
1262 memset(vs->vs_imap, 0,
1263 INDIRECT_CLMAP_SIZE(vs->vs_size));
1264 else
1265 for (i = 0; i < vs->vs_size; i++)
1266 VSM_CLR(vs->vs_dmap[i]);
1267
1268 VS_MAP_LOCK_INIT(vs);
1269
1270 bs_commit(vs->vs_size);
1271
1272 return vs;
1273}
1274
1275paging_segment_t ps_select_segment(int, int *); /* forward */
1276
1277paging_segment_t
1278ps_select_segment(
1279 int shift,
1280 int *psindex)
1281{
1282 paging_segment_t ps;
1283 int i;
1284 int j;
1285
1286 /*
1287 * Optimize case where there's only one segment.
1288 * paging_segment_max will index the one and only segment.
1289 */
1290
1291 PSL_LOCK();
1292 if (paging_segment_count == 1) {
1293 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1294 ipc_port_t trigger = IP_NULL;
1295
1296 ps = paging_segments[paging_segment_max];
1297 *psindex = paging_segment_max;
1298 PS_LOCK(ps);
1299 if (ps->ps_going_away) {
1300 /* this segment is being turned off */
1301 lps = PAGING_SEGMENT_NULL;
1302 } else {
1303 ASSERT(ps->ps_clshift >= shift);
1304 if (ps->ps_clcount) {
1305 ps->ps_clcount--;
1306 dp_pages_free -= 1 << ps->ps_clshift;
1307 if(min_pages_trigger_port &&
1308 (dp_pages_free < minimum_pages_remaining)) {
1309 trigger = min_pages_trigger_port;
1310 min_pages_trigger_port = NULL;
1311 bs_low = TRUE;
1312 }
1313 lps = ps;
1314 } else
1315 lps = PAGING_SEGMENT_NULL;
1316 }
1317 PS_UNLOCK(ps);
1318 PSL_UNLOCK();
1319
1320 if (trigger != IP_NULL) {
1321 default_pager_space_alert(trigger, HI_WAT_ALERT);
1322 ipc_port_release_send(trigger);
1323 }
1324 return lps;
1325 }
1326
1327 if (paging_segment_count == 0) {
1328 PSL_UNLOCK();
1329 return PAGING_SEGMENT_NULL;
1330 }
1331
1332 for (i = BS_MAXPRI;
1333 i >= BS_MINPRI; i--) {
1334 int start_index;
1335
1336 if ((ps_select_array[i] == BS_NOPRI) ||
1337 (ps_select_array[i] == BS_FULLPRI))
1338 continue;
1339 start_index = ps_select_array[i];
1340
1341 if(!(paging_segments[start_index])) {
1342 j = start_index+1;
1343 physical_transfer_cluster_count = 0;
1344 }
1345 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1346 (((paging_segments[start_index])->ps_clshift)
1347 + vm_page_shift))) {
1348 physical_transfer_cluster_count = 0;
1349 j = start_index + 1;
1350 } else {
1351 physical_transfer_cluster_count+=1;
1352 j = start_index;
1353 if(start_index == 0)
1354 start_index = paging_segment_max;
1355 else
1356 start_index = start_index - 1;
1357 }
1358
1359 while (1) {
1360 if (j > paging_segment_max)
1361 j = 0;
1362 if ((ps = paging_segments[j]) &&
1363 (ps->ps_bs->bs_priority == i)) {
1364 /*
1365 * Force the ps cluster size to be
1366 * >= that of the vstruct.
1367 */
1368 PS_LOCK(ps);
1369 if (ps->ps_going_away) {
1370 /* this segment is being turned off */
1371 } else if ((ps->ps_clcount) &&
1372 (ps->ps_clshift >= shift)) {
1373 ipc_port_t trigger = IP_NULL;
1374
1375 ps->ps_clcount--;
1376 dp_pages_free -= 1 << ps->ps_clshift;
1377 if(min_pages_trigger_port &&
1378 (dp_pages_free <
1379 minimum_pages_remaining)) {
1380 trigger = min_pages_trigger_port;
1381 min_pages_trigger_port = NULL;
1382 }
1383 PS_UNLOCK(ps);
1384 /*
1385 * found one, quit looking.
1386 */
1387 ps_select_array[i] = j;
1388 PSL_UNLOCK();
1389
1390 if (trigger != IP_NULL) {
1391 default_pager_space_alert(
1392 trigger,
1393 HI_WAT_ALERT);
1394 ipc_port_release_send(trigger);
1395 }
1396 *psindex = j;
1397 return ps;
1398 }
1399 PS_UNLOCK(ps);
1400 }
1401 if (j == start_index) {
1402 /*
1403 * none at this priority -- mark it full
1404 */
1405 ps_select_array[i] = BS_FULLPRI;
1406 break;
1407 }
1408 j++;
1409 }
1410 }
1411 PSL_UNLOCK();
1412 return PAGING_SEGMENT_NULL;
1413}
1414
1415vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1416
1417vm_offset_t
1418ps_allocate_cluster(
1419 vstruct_t vs,
1420 int *psindex,
1421 paging_segment_t use_ps)
1422{
1423 int byte_num;
1424 int bit_num = 0;
1425 paging_segment_t ps;
1426 vm_offset_t cluster;
1427 ipc_port_t trigger = IP_NULL;
1428
1429 /*
1430 * Find best paging segment.
1431 * ps_select_segment will decrement cluster count on ps.
1432 * Must pass cluster shift to find the most appropriate segment.
1433 */
1434 /* NOTE: The addition of paging segment delete capability threatened
1435 * to seriously complicate the treatment of paging segments in this
1436 * module and the ones that call it (notably ps_clmap), because of the
1437 * difficulty in assuring that the paging segment would continue to
1438 * exist between being unlocked and locked. This was
1439 * avoided because all calls to this module are based in either
1440 * dp_memory_object calls which rely on the vs lock, or by
1441 * the transfer function which is part of the segment delete path.
1442 * The transfer function which is part of paging segment delete is
1443 * protected from multiple callers by the backing store lock.
1444 * The paging segment delete function treats mappings to a paging
1445 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1446 * while data is transferred to the remaining segments. This is in
1447 * line with the view that incomplete or in-transition mappings between
1448 * data, a vstruct, and backing store are protected by the vs lock.
1449 * This and the ordering of the paging segment "going_away" bit setting
1450 * protects us.
1451 */
1452 if (use_ps != PAGING_SEGMENT_NULL) {
1453 ps = use_ps;
1454 PSL_LOCK();
1455 PS_LOCK(ps);
1456
1457 ASSERT(ps->ps_clcount != 0);
1458
1459 ps->ps_clcount--;
1460 dp_pages_free -= 1 << ps->ps_clshift;
1461 if(min_pages_trigger_port &&
1462 (dp_pages_free < minimum_pages_remaining)) {
1463 trigger = min_pages_trigger_port;
1464 min_pages_trigger_port = NULL;
1465 }
1466 PSL_UNLOCK();
1467 PS_UNLOCK(ps);
1468 if (trigger != IP_NULL) {
1469 default_pager_space_alert(trigger, HI_WAT_ALERT);
1470 ipc_port_release_send(trigger);
1471 }
1472
1473 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1474 PAGING_SEGMENT_NULL) {
1475#if 0
1476 bs_no_paging_space(TRUE);
1477#endif
1478#if 0
1479 if (verbose)
1480#endif
1481 dprintf(("no space in available paging segments; "
1482 "swapon suggested\n"));
1483 /* the count got off maybe, reset to zero */
1484 PSL_LOCK();
1485 dp_pages_free = 0;
1486 if(min_pages_trigger_port) {
1487 trigger = min_pages_trigger_port;
1488 min_pages_trigger_port = NULL;
1489 bs_low = TRUE;
1490 }
1491 PSL_UNLOCK();
1492 if (trigger != IP_NULL) {
1493 default_pager_space_alert(trigger, HI_WAT_ALERT);
1494 ipc_port_release_send(trigger);
1495 }
1496 return (vm_offset_t) -1;
1497 }
1498
1499 /*
1500 * Look for an available cluster. At the end of the loop,
1501 * byte_num is the byte offset and bit_num is the bit offset of the
1502 * first zero bit in the paging segment bitmap.
1503 */
1504 PS_LOCK(ps);
1505 byte_num = ps->ps_hint;
1506 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1507 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1508 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1509 if (isclr((ps->ps_bmap + byte_num), bit_num))
1510 break;
1511 }
1512 ASSERT(bit_num != NBBY);
1513 break;
1514 }
1515 }
1516 ps->ps_hint = byte_num;
1517 cluster = (byte_num*NBBY) + bit_num;
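/*
 * Worked example for the scan above (illustrative): if ps_bmap[byte_num]
 * held 0x07, bits 0 through 2 are already allocated, the inner loop stops
 * at bit_num == 3, and the chosen cluster is byte_num * NBBY + 3.
 */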
1518
1519 /* Space was reserved, so this must be true */
1520 ASSERT(cluster < ps->ps_ncls);
1521
1522 setbit(ps->ps_bmap, cluster);
1523 PS_UNLOCK(ps);
1524
1525 return cluster;
1526}
1527
1528void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1529
1530void
1531ps_deallocate_cluster(
1532 paging_segment_t ps,
1533 vm_offset_t cluster)
1534{
1535
1536 if (cluster >= (vm_offset_t) ps->ps_ncls)
1537 panic("ps_deallocate_cluster: Invalid cluster number");
1538
1539 /*
1540 * Lock the paging segment, clear the cluster's bitmap and increment the
1541 * number of free clusters.
1542 */
1543 PSL_LOCK();
1544 PS_LOCK(ps);
1545 clrbit(ps->ps_bmap, cluster);
1546 ++ps->ps_clcount;
1547 dp_pages_free += 1 << ps->ps_clshift;
1548 PSL_UNLOCK();
1549
1550 /*
1551 * Move the hint down to the freed cluster if it is
1552 * less than the current hint.
1553 */
1554 if ((cluster/NBBY) < ps->ps_hint) {
1555 ps->ps_hint = (cluster/NBBY);
1556 }
1557
1558 PS_UNLOCK(ps);
1559
1560 /*
1561 * If we're freeing space on a full priority, reset the array.
1562 */
1563 PSL_LOCK();
1564 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1565 ps_select_array[ps->ps_bs->bs_priority] = 0;
1566 PSL_UNLOCK();
1567
1568 return;
1569}
1570
1571void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1572
1573void
1574ps_dealloc_vsmap(
1575 struct vs_map *vsmap,
1576 vm_size_t size)
1577{
1578 int i;
1579 for (i = 0; i < size; i++)
1580 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1581 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1582 VSM_CLOFF(vsmap[i]));
1583}
1584
1585void
1586ps_vstruct_dealloc(
1587 vstruct_t vs)
1588{
1589 int i;
1590 spl_t s;
1591
1592 VS_MAP_LOCK(vs);
1593
1594 /*
1595 * If this is an indirect structure, then we walk through the valid
1596 * (non-zero) indirect pointers and deallocate the clusters
1597 * associated with each used map entry (via ps_dealloc_vsmap).
1598 * When all of the clusters in an indirect block have been
1599 * freed, we deallocate the block. When all of the indirect
1600 * blocks have been deallocated we deallocate the memory
1601 * holding the indirect pointers.
1602 */
1603 if (vs->vs_indirect) {
1604 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1605 if (vs->vs_imap[i] != NULL) {
1606 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1607 kfree((vm_offset_t)vs->vs_imap[i],
1608 CLMAP_THRESHOLD);
1609 }
1610 }
1611 kfree((vm_offset_t)vs->vs_imap,
1612 INDIRECT_CLMAP_SIZE(vs->vs_size));
1613 } else {
1614 /*
1615 * Direct map. Free used clusters, then memory.
1616 */
1617 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1618 kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1619 }
1620 VS_MAP_UNLOCK(vs);
1621
1622 bs_commit(- vs->vs_size);
1623
1624 zfree(vstruct_zone, (vm_offset_t)vs);
1625}
1626
1627int ps_map_extend(vstruct_t, int); /* forward */
1628
1629int ps_map_extend(
1630 vstruct_t vs,
1631 int new_size)
1632{
1633 struct vs_map **new_imap;
1634 struct vs_map *new_dmap = NULL;
1635 int newdsize;
1636 int i;
1637 void *old_map = NULL;
1638 int old_map_size = 0;
1639
1640 if (vs->vs_size >= new_size) {
1641 /*
1642 * Someone has already done the work.
1643 */
1644 return 0;
1645 }
1646
1647 /*
1648 * If the new size extends into the indirect range, then we have one
1649 * of two cases: we are going from indirect to indirect, or we are
1650 * going from direct to indirect. If we are going from indirect to
1651 * indirect, then it is possible that the new size will fit in the old
1652 * indirect map. If this is the case, then just reset the size of the
1653 * vstruct map and we are done. If the new size will not
1654 * fit into the old indirect map, then we have to allocate a new
1655 * indirect map and copy the old map pointers into this new map.
1656 *
1657 * If we are going from direct to indirect, then we have to allocate a
1658 * new indirect map and copy the old direct pages into the first
1659 * indirect page of the new map.
1660 * NOTE: allocating memory here is dangerous, as we're in the
1661 * pageout path.
1662 */
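/*
 * Illustrative scenario (hypothetical numbers, not taken from the real
 * CLMAP_ENTRIES value): if CLMAP_ENTRIES were 128, growing a 100-cluster
 * direct map to 300 clusters crosses into the indirect case; the 100 old
 * direct entries are copied into the first indirect block (new_imap[0])
 * and later indirect blocks are only allocated on demand by ps_clmap().
 */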
1663 if (INDIRECT_CLMAP(new_size)) {
1664 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1665
1666 /*
1667 * Get a new indirect map and zero it.
1668 */
1669 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1670 if (vs->vs_indirect &&
1671 (new_map_size == old_map_size)) {
1672 bs_commit(new_size - vs->vs_size);
1673 vs->vs_size = new_size;
1674 return 0;
1675 }
1676
1677 new_imap = (struct vs_map **)kalloc(new_map_size);
1678 if (new_imap == NULL) {
1679 return -1;
1680 }
1681 memset(new_imap, 0, new_map_size);
1682
1683 if (vs->vs_indirect) {
1684 /* Copy old entries into new map */
1685 memcpy(new_imap, vs->vs_imap, old_map_size);
1686 /* Arrange to free the old map */
1687 old_map = (void *) vs->vs_imap;
1688 newdsize = 0;
1689 } else { /* Old map was a direct map */
1690 /* Allocate an indirect page */
1691 if ((new_imap[0] = (struct vs_map *)
1692 kalloc(CLMAP_THRESHOLD)) == NULL) {
1693 kfree((vm_offset_t)new_imap, new_map_size);
1694 return -1;
1695 }
1696 new_dmap = new_imap[0];
1697 newdsize = CLMAP_ENTRIES;
1698 }
1699 } else {
1700 new_imap = NULL;
1701 newdsize = new_size;
1702 /*
1703 * If the new map is a direct map, then the old map must
1704 * also have been a direct map. All we have to do is
1705 * to allocate a new direct map, copy the old entries
1706 * into it and free the old map.
1707 */
1708 if ((new_dmap = (struct vs_map *)
1709 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1710 return -1;
1711 }
1712 }
1713 if (newdsize) {
1714
1715 /* Free the old map */
1716 old_map = (void *) vs->vs_dmap;
1717 old_map_size = CLMAP_SIZE(vs->vs_size);
1718
1719 /* Copy info from the old map into the new map */
1720 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1721
1722 /* Initialize the rest of the new map */
1723 for (i = vs->vs_size; i < newdsize; i++)
1724 VSM_CLR(new_dmap[i]);
1725 }
1726 if (new_imap) {
1727 vs->vs_imap = new_imap;
1728 vs->vs_indirect = TRUE;
1729 } else
1730 vs->vs_dmap = new_dmap;
1731 bs_commit(new_size - vs->vs_size);
1732 vs->vs_size = new_size;
1733 if (old_map)
1734 kfree((vm_offset_t)old_map, old_map_size);
1735 return 0;
1736}
1737
1738vm_offset_t
1739ps_clmap(
1740 vstruct_t vs,
1741 vm_offset_t offset,
1742 struct clmap *clmap,
1743 int flag,
1744 vm_size_t size,
1745 int error)
1746{
1747 vm_offset_t cluster; /* The cluster of offset. */
1748 vm_offset_t newcl; /* The new cluster allocated. */
1749 vm_offset_t newoff;
1750 int i;
1751 struct vs_map *vsmap;
1752
1753 VS_MAP_LOCK(vs);
1754
1755 ASSERT(vs->vs_dmap);
1756 cluster = atop_32(offset) >> vs->vs_clshift;
1757
1758 /*
1759 * Initialize cluster error value
1760 */
1761 clmap->cl_error = 0;
1762
1763 /*
1764 * If the object has grown, extend the page map.
1765 */
1766 if (cluster >= vs->vs_size) {
1767 if (flag == CL_FIND) {
1768 /* Do not allocate if just doing a lookup */
1769 VS_MAP_UNLOCK(vs);
1770 return (vm_offset_t) -1;
1771 }
1772 if (ps_map_extend(vs, cluster + 1)) {
1773 VS_MAP_UNLOCK(vs);
1774 return (vm_offset_t) -1;
1775 }
1776 }
1777
1778 /*
1779 * Look for the desired cluster. If the map is indirect, then we
1780 * have a two level lookup. First find the indirect block, then
1781 * find the actual cluster. If the indirect block has not yet
1782 * been allocated, then do so. If the cluster has not yet been
1783 * allocated, then do so.
1784 *
1785 * If any of the allocations fail, then return an error.
1786 * Don't allocate if just doing a lookup.
1787 */
1788 if (vs->vs_indirect) {
1789 long ind_block = cluster/CLMAP_ENTRIES;
1790
1791 /* Is the indirect block allocated? */
1792 vsmap = vs->vs_imap[ind_block];
1793 if (vsmap == NULL) {
1794 if (flag == CL_FIND) {
1795 VS_MAP_UNLOCK(vs);
1796 return (vm_offset_t) -1;
1797 }
1798
1799 /* Allocate the indirect block */
1800 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1801 if (vsmap == NULL) {
1802 VS_MAP_UNLOCK(vs);
1803 return (vm_offset_t) -1;
1804 }
1805 /* Initialize the cluster offsets */
1806 for (i = 0; i < CLMAP_ENTRIES; i++)
1807 VSM_CLR(vsmap[i]);
1808 vs->vs_imap[ind_block] = vsmap;
1809 }
1810 } else
1811 vsmap = vs->vs_dmap;
1812
1813 ASSERT(vsmap);
1814 vsmap += cluster%CLMAP_ENTRIES;
1815
1816 /*
1817 * At this point, vsmap points to the struct vs_map desired.
1818 *
1819 * Look in the map for the cluster, if there was an error on a
1820 * previous write, flag it and return. If it is not yet
1821 * allocated, then allocate it, if we're writing; if we're
1822 * doing a lookup and the cluster's not allocated, return error.
1823 */
1824 if (VSM_ISERR(*vsmap)) {
1825 clmap->cl_error = VSM_GETERR(*vsmap);
1826 VS_MAP_UNLOCK(vs);
1827 return (vm_offset_t) -1;
1828 } else if (VSM_ISCLR(*vsmap)) {
1829 int psindex;
1830
1831 if (flag == CL_FIND) {
1832 /*
1833 * If there's an error and the entry is clear, then
1834 * we've run out of swap space. Record the error
1835 * here and return.
1836 */
1837 if (error) {
1838 VSM_SETERR(*vsmap, error);
1839 }
1840 VS_MAP_UNLOCK(vs);
1841 return (vm_offset_t) -1;
1842 } else {
1843 /*
1844 * Attempt to allocate a cluster from the paging segment
1845 */
1846 newcl = ps_allocate_cluster(vs, &psindex,
1847 PAGING_SEGMENT_NULL);
1848 if (newcl == -1) {
1849 VS_MAP_UNLOCK(vs);
1850 return (vm_offset_t) -1;
1851 }
1852 VSM_CLR(*vsmap);
1853 VSM_SETCLOFF(*vsmap, newcl);
1854 VSM_SETPS(*vsmap, psindex);
1855 }
1856 } else
1857 newcl = VSM_CLOFF(*vsmap);
1858
1859 /*
1860 * Fill in pertinent fields of the clmap
1861 */
1862 clmap->cl_ps = VSM_PS(*vsmap);
1863 clmap->cl_numpages = VSCLSIZE(vs);
1864 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1865
1866 /*
1867 * Byte offset in paging segment is byte offset to cluster plus
1868 * byte offset within cluster. It looks ugly, but should be
1869 * relatively quick.
1870 */
1871 ASSERT(trunc_page(offset) == offset);
1872 newcl = ptoa_32(newcl) << vs->vs_clshift;
1873 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
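/*
 * Worked example for the two lines above (illustrative; assumes 4 KB
 * pages, vm_page_shift == 12, vs_clshift == 2, i.e. 16 KB clusters): if
 * the map entry points at paging-segment cluster 7 and offset == 0x5000,
 * then newcl == ptoa_32(7) << 2 == 0x1C000 and newoff == 0x5000 & 0x3FFF
 * == 0x1000, so the segment byte offset returned is 0x1D000.
 */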
1874 if (flag == CL_ALLOC) {
1875 /*
1876 * set bits in the allocation bitmap according to which
1877 * pages were requested. size is in bytes.
1878 */
55e303ae 1879 i = atop_32(newoff);
1c79356b
A
1880 while ((size > 0) && (i < VSCLSIZE(vs))) {
1881 VSM_SETALLOC(*vsmap, i);
1882 i++;
1883 size -= vm_page_size;
1884 }
1885 }
1886 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1887 if (newoff) {
1888 /*
1889 * Offset is not cluster aligned, so number of pages
1890 * and bitmaps must be adjusted
1891 */
1892 clmap->cl_numpages -= atop_32(newoff);
1893 CLMAP_SHIFT(clmap, vs);
1894 CLMAP_SHIFTALLOC(clmap, vs);
1895 }
1896
1897 /*
1898 *
1899 * The setting of valid bits and handling of write errors
1900 * must be done here, while we hold the lock on the map.
1901 * It logically should be done in ps_vs_write_complete().
1902 * The size and error information has been passed from
1903 * ps_vs_write_complete(). If the size parameter is non-zero,
1904 * then there is work to be done. If error is also non-zero,
1905 * then the error number is recorded in the cluster and the
1906 * entire cluster is in error.
1907 */
1908 if (size && flag == CL_FIND) {
1909 vm_offset_t off = (vm_offset_t) 0;
1910
1911 if (!error) {
1912 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1913 i++) {
1914 VSM_SETPG(*vsmap, i);
1915 size -= vm_page_size;
1916 }
1917 ASSERT(i <= VSCLSIZE(vs));
1918 } else {
1919 BS_STAT(clmap->cl_ps->ps_bs,
1920 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1921 atop_32(size));
1922 off = VSM_CLOFF(*vsmap);
1923 VSM_SETERR(*vsmap, error);
1924 }
1925 /*
1926 * Deallocate cluster if error, and no valid pages
1927 * already present.
1928 */
1929 if (off != (vm_offset_t) 0)
1930 ps_deallocate_cluster(clmap->cl_ps, off);
1931 VS_MAP_UNLOCK(vs);
1932 return (vm_offset_t) 0;
1933 } else
1934 VS_MAP_UNLOCK(vs);
1935
1936 DEBUG(DEBUG_VS_INTERNAL,
1937 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1938 newcl+newoff, (int) vs, (int) vsmap, flag));
1939 DEBUG(DEBUG_VS_INTERNAL,
1940 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1941 (int) clmap->cl_ps, clmap->cl_numpages,
1942 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1943
1944 return (newcl + newoff);
1945}
1946
1947void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1948
1949void
1950ps_clunmap(
1951 vstruct_t vs,
1952 vm_offset_t offset,
1953 vm_size_t length)
1954{
1955 vm_offset_t cluster; /* The cluster number of offset */
1956 struct vs_map *vsmap;
1957
1958 VS_MAP_LOCK(vs);
1959
1960 /*
1961 * Loop through all clusters in this range, freeing paging segment
1962 * clusters and map entries as encountered.
1963 */
1964 while (length > 0) {
1965 vm_offset_t newoff;
1966 int i;
1967
1968 cluster = atop_32(offset) >> vs->vs_clshift;
1969 if (vs->vs_indirect) /* indirect map */
1970 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
1971 else
1972 vsmap = vs->vs_dmap;
1973 if (vsmap == NULL) {
1974 VS_MAP_UNLOCK(vs);
1975 return;
1976 }
1977 vsmap += cluster%CLMAP_ENTRIES;
1978 if (VSM_ISCLR(*vsmap)) {
1979 length -= vm_page_size;
1980 offset += vm_page_size;
1981 continue;
1982 }
1983 /*
1984 * We've got a valid mapping. Clear it and deallocate
1985 * paging segment cluster pages.
1986 * Optimize for entire cluster clearing.
1987 */
1988 if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
1989 /*
1990 * Not cluster aligned.
1991 */
1992 ASSERT(trunc_page(newoff) == newoff);
55e303ae 1993 i = atop_32(newoff);
1c79356b
A
1994 } else
1995 i = 0;
1996 while ((i < VSCLSIZE(vs)) && (length > 0)) {
1997 VSM_CLRPG(*vsmap, i);
1998 VSM_CLRALLOC(*vsmap, i);
1999 length -= vm_page_size;
2000 offset += vm_page_size;
2001 i++;
2002 }
2003
2004 /*
2005 * If map entry is empty, clear and deallocate cluster.
2006 */
2007 if (!VSM_ALLOC(*vsmap)) {
2008 ps_deallocate_cluster(VSM_PS(*vsmap),
2009 VSM_CLOFF(*vsmap));
2010 VSM_CLR(*vsmap);
2011 }
2012 }
2013
2014 VS_MAP_UNLOCK(vs);
2015}
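/*
 * Illustrative sketch, not part of the original file: the final step of
 * ps_clunmap reduced to a plain bitmap operation.  Pages in the range
 * are cleared from the cluster's allocation bitmap, and the caller
 * deallocates the cluster once the bitmap drains to zero (the
 * VSM_CLRALLOC / VSM_ALLOC / ps_deallocate_cluster sequence above).
 * The names used here are hypothetical.
 */
static int
example_clear_pages_and_check_empty(
	unsigned int *alloc_bmap,	/* one bit per page in the cluster */
	int first_page,
	int npages)
{
	int i;

	for (i = first_page; i < first_page + npages; i++)
		*alloc_bmap &= ~(1U << i);

	return (*alloc_bmap == 0);	/* non-zero => caller frees the cluster */
}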
2016
2017void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2018
2019void
2020ps_vs_write_complete(
2021 vstruct_t vs,
2022 vm_offset_t offset,
2023 vm_size_t size,
2024 int error)
2025{
2026 struct clmap clmap;
2027
2028 /*
2029 * Get the struct vsmap for this cluster.
2030 * Use READ, even though it was written, because the
2031 * cluster MUST be present, unless there was an error
2032 * in the original ps_clmap (e.g. no space), in which
2033 * case, nothing happens.
2034 *
2035 * Must pass enough information to ps_clmap to allow it
2036 * to set the vs_map structure bitmap under lock.
2037 */
2038 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2039}
2040
2041void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2042
2043void
2044vs_cl_write_complete(
2045 vstruct_t vs,
2046 paging_segment_t ps,
2047 vm_offset_t offset,
2048 vm_offset_t addr,
2049 vm_size_t size,
2050 boolean_t async,
2051 int error)
2052{
1c79356b
A
2053 kern_return_t kr;
2054
2055 if (error) {
2056 /*
2057 * For internal objects, the error is recorded on a
2058 * per-cluster basis by ps_clmap() which is called
2059 * by ps_vs_write_complete() below.
2060 */
2061 dprintf(("write failed error = 0x%x\n", error));
2062 /* add upl_abort code here */
2063 } else
55e303ae 2064 GSTAT(global_stats.gs_pages_out += atop_32(size));
1c79356b
A
2065 /*
2066 * Notify the vstruct mapping code, so it can do its accounting.
2067 */
2068 ps_vs_write_complete(vs, offset, size, error);
2069
2070 if (async) {
2071 VS_LOCK(vs);
2072 ASSERT(vs->vs_async_pending > 0);
2073 vs->vs_async_pending -= size;
0b4e3aa0
A
2074 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2075 vs->vs_waiting_async = FALSE;
1c79356b
A
2076 VS_UNLOCK(vs);
2077 /* mutex_unlock(&vs->vs_waiting_async); */
0b4e3aa0 2078 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
2079 } else {
2080 VS_UNLOCK(vs);
2081 }
2082 }
2083}
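/*
 * Illustrative sketch, not part of the original file: the async
 * completion accounting performed above.  Each completion subtracts its
 * byte count from the pending total; the waiter is woken only when the
 * total drains to zero and somebody is actually waiting.  The caller is
 * assumed to hold the lock protecting the counters (VS_LOCK in the real
 * code); the wakeup callback stands in for thread_wakeup().
 */
static void
example_async_complete(
	unsigned long *pending,		/* bytes still in flight */
	int *waiting,			/* 1 if a thread is blocked on drain */
	unsigned long size,
	void (*wakeup)(void))
{
	*pending -= size;
	if (*pending == 0 && *waiting) {
		*waiting = 0;
		wakeup();
	}
}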
2084
2085#ifdef DEVICE_PAGING
2086kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2087
2088kern_return_t
2089device_write_reply(
2090 MACH_PORT_FACE reply_port,
2091 kern_return_t device_code,
2092 io_buf_len_t bytes_written)
2093{
2094 struct vs_async *vsa;
1c79356b
A
2095
2096 vsa = (struct vs_async *)
2097 ((struct vstruct_alias *)(reply_port->alias))->vs;
2098
2099 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2100 device_code = KERN_FAILURE;
2101 }
2102
2103 vsa->vsa_error = device_code;
2104
2105
2106 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2107 if(vsa->vsa_flags & VSA_TRANSFER) {
2108 /* revisit when async disk segments redone */
2109 if(vsa->vsa_error) {
2110 /* need to consider error condition. re-write data or */
2111 /* throw it away here. */
2112 vm_offset_t ioaddr;
2113 if(vm_map_copyout(kernel_map, &ioaddr,
2114 (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
2115 panic("vs_cluster_write: unable to copy source list\n");
2116 vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
2117 }
2118 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2119 vsa->vsa_size, vsa->vsa_error);
2120 } else {
2121 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2122 vsa->vsa_addr, vsa->vsa_size, TRUE,
2123 vsa->vsa_error);
2124 }
2125 VS_FREE_ASYNC(vsa);
2126
2127 return KERN_SUCCESS;
2128}
2129
2130kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2131kern_return_t
2132device_write_reply_inband(
2133 MACH_PORT_FACE reply_port,
2134 kern_return_t return_code,
2135 io_buf_len_t bytes_written)
2136{
2137 panic("device_write_reply_inband: illegal");
2138 return KERN_SUCCESS;
2139}
2140
2141kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2142kern_return_t
2143device_read_reply(
2144 MACH_PORT_FACE reply_port,
2145 kern_return_t return_code,
2146 io_buf_ptr_t data,
2147 mach_msg_type_number_t dataCnt)
2148{
2149 struct vs_async *vsa;
2150 vsa = (struct vs_async *)
2151 ((struct vstruct_alias *)(reply_port->alias))->vs;
2152 vsa->vsa_addr = (vm_offset_t)data;
2153 vsa->vsa_size = (vm_size_t)dataCnt;
2154 vsa->vsa_error = return_code;
2155 thread_wakeup(&vsa->vsa_lock);
2156 return KERN_SUCCESS;
2157}
2158
2159kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2160kern_return_t
2161device_read_reply_inband(
2162 MACH_PORT_FACE reply_port,
2163 kern_return_t return_code,
2164 io_buf_ptr_inband_t data,
2165 mach_msg_type_number_t dataCnt)
2166{
2167 panic("device_read_reply_inband: illegal");
2168 return KERN_SUCCESS;
2169}
2170
2171kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2172kern_return_t
2173device_read_reply_overwrite(
2174 MACH_PORT_FACE reply_port,
2175 kern_return_t return_code,
2176 io_buf_len_t bytes_read)
2177{
2178 panic("device_read_reply_overwrite: illegal\n");
2179 return KERN_SUCCESS;
2180}
2181
2182kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2183kern_return_t
2184device_open_reply(
2185 MACH_PORT_FACE reply_port,
2186 kern_return_t return_code,
2187 MACH_PORT_FACE device_port)
2188{
2189 panic("device_open_reply: illegal\n");
2190 return KERN_SUCCESS;
2191}
2192
2193kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2194
2195kern_return_t
2196ps_read_device(
2197 paging_segment_t ps,
2198 vm_offset_t offset,
2199 vm_offset_t *bufferp,
2200 unsigned int size,
2201 unsigned int *residualp,
2202 int flags)
2203{
2204 kern_return_t kr;
2205 recnum_t dev_offset;
2206 unsigned int bytes_wanted;
2207 unsigned int bytes_read;
2208 unsigned int total_read;
2209 vm_offset_t dev_buffer;
2210 vm_offset_t buf_ptr;
2211 unsigned int records_read;
1c79356b
A
2212 struct vs_async *vsa;
2213 mutex_t vs_waiting_read_reply;
2214
2215 device_t device;
2216 vm_map_copy_t device_data = NULL;
2217 default_pager_thread_t *dpt = NULL;
2218
2219 device = dev_port_lookup(ps->ps_device);
55e303ae 2220 clustered_reads[atop_32(size)]++;
1c79356b
A
2221
2222 dev_offset = (ps->ps_offset +
2223 (offset >> (vm_page_shift - ps->ps_record_shift)));
2224 bytes_wanted = size;
2225 total_read = 0;
2226 *bufferp = (vm_offset_t)NULL;
2227
2228 do {
2229 vsa = VS_ALLOC_ASYNC();
2230 if (vsa) {
2231 vsa->vsa_vs = NULL;
2232 vsa->vsa_addr = 0;
2233 vsa->vsa_offset = 0;
2234 vsa->vsa_size = 0;
2235 vsa->vsa_ps = NULL;
2236 }
2237 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2238 ip_lock(vsa->reply_port);
2239 vsa->reply_port->ip_sorights++;
2240 ip_reference(vsa->reply_port);
2241 ip_unlock(vsa->reply_port);
2242 kr = ds_device_read_common(device,
2243 vsa->reply_port,
2244 (mach_msg_type_name_t)
2245 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2246 (dev_mode_t) 0,
2247 dev_offset,
2248 bytes_wanted,
2249 (IO_READ | IO_CALL),
2250 (io_buf_ptr_t *) &dev_buffer,
2251 (mach_msg_type_number_t *) &bytes_read);
2252 if(kr == MIG_NO_REPLY) {
2253 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
9bccf70c 2254 thread_block(THREAD_CONTINUE_NULL);
1c79356b
A
2255
2256 dev_buffer = vsa->vsa_addr;
2257 bytes_read = (unsigned int)vsa->vsa_size;
2258 kr = vsa->vsa_error;
2259 }
2260 VS_FREE_ASYNC(vsa);
2261 if (kr != KERN_SUCCESS || bytes_read == 0) {
2262 break;
2263 }
2264 total_read += bytes_read;
2265
2266 /*
2267 * If we got the entire range, use the returned dev_buffer.
2268 */
2269 if (bytes_read == size) {
2270 *bufferp = (vm_offset_t)dev_buffer;
2271 break;
2272 }
2273
2274#if 1
2275 dprintf(("read only %d bytes out of %d\n",
2276 bytes_read, bytes_wanted));
2277#endif
2278 if(dpt == NULL) {
2279 dpt = get_read_buffer();
2280 buf_ptr = dpt->dpt_buffer;
2281 *bufferp = (vm_offset_t)buf_ptr;
2282 }
2283 /*
2284 * Otherwise, copy the data into the provided buffer (*bufferp)
2285 * and append the rest of the range as it comes in.
2286 */
2287 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2288 buf_ptr += bytes_read;
2289 bytes_wanted -= bytes_read;
2290 records_read = (bytes_read >>
2291 (vm_page_shift - ps->ps_record_shift));
2292 dev_offset += records_read;
2293 DEBUG(DEBUG_VS_INTERNAL,
2294 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2295 dev_buffer, bytes_read));
2296 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2297 != KERN_SUCCESS)
2298 Panic("dealloc buf");
2299 } while (bytes_wanted);
2300
2301 *residualp = size - total_read;
2302 if((dev_buffer != *bufferp) && (total_read != 0)) {
2303 vm_offset_t temp_buffer;
2304 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2305 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2306 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2307 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2308 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2309 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2310 (vm_map_copy_t *)&device_data, FALSE))
2311 panic("ps_read_device: cannot copyin locally provided buffer\n");
2312 }
2313 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2314 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2315 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2316 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2317 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2318 (vm_map_copy_t *)&device_data, FALSE))
2319 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2320 }
2321 else {
2322 device_data = NULL;
2323 }
2324 *bufferp = (vm_offset_t)device_data;
2325
2326 if(dpt != NULL) {
2327 /* Free the receive buffer */
2328 dpt->checked_out = 0;
2329 thread_wakeup(&dpt_array);
2330 }
2331 return KERN_SUCCESS;
2332}
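/*
 * Illustrative sketch, not part of the original file: the shape of the
 * partial-read accumulation loop in ps_read_device, reduced to a generic
 * helper.  example_read_fn and its arguments are hypothetical; the real
 * code issues ds_device_read_common and copies each short chunk into a
 * buffer obtained from get_read_buffer().
 */
typedef int (*example_read_fn)(void *cookie, unsigned int offset,
			       void *buf, unsigned int want,
			       unsigned int *got);

static int
example_read_fully(
	example_read_fn rd,
	void *cookie,
	unsigned int offset,
	void *buffer,
	unsigned int size,
	unsigned int *residualp)	/* bytes NOT read, as in ps_read_device */
{
	unsigned int total_read = 0;

	while (total_read < size) {
		unsigned int got = 0;
		int err = rd(cookie, offset + total_read,
			     (char *)buffer + total_read,
			     size - total_read, &got);

		if (err != 0 || got == 0)	/* error or end of device: stop */
			break;
		total_read += got;		/* append the chunk, keep going */
	}
	*residualp = size - total_read;
	return 0;
}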
2333
2334kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2335
2336kern_return_t
2337ps_write_device(
2338 paging_segment_t ps,
2339 vm_offset_t offset,
2340 vm_offset_t addr,
2341 unsigned int size,
2342 struct vs_async *vsa)
2343{
2344 recnum_t dev_offset;
2345 io_buf_len_t bytes_to_write, bytes_written;
2346 recnum_t records_written;
2347 kern_return_t kr;
2348 MACH_PORT_FACE reply_port;
1c79356b
A
2349
2350
2351
55e303ae 2352 clustered_writes[atop_32(size)]++;
1c79356b
A
2353
2354 dev_offset = (ps->ps_offset +
2355 (offset >> (vm_page_shift - ps->ps_record_shift)));
2356 bytes_to_write = size;
2357
2358 if (vsa) {
2359 /*
2360 * Asynchronous write.
2361 */
2362 reply_port = vsa->reply_port;
2363 ip_lock(reply_port);
2364 reply_port->ip_sorights++;
2365 ip_reference(reply_port);
2366 ip_unlock(reply_port);
2367 {
2368 device_t device;
2369 device = dev_port_lookup(ps->ps_device);
2370
2371 vsa->vsa_addr = addr;
2372 kr=ds_device_write_common(device,
2373 reply_port,
2374 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2375 (dev_mode_t) 0,
2376 dev_offset,
2377 (io_buf_ptr_t) addr,
2378 size,
2379 (IO_WRITE | IO_CALL),
2380 &bytes_written);
2381 }
2382 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2383 if (verbose)
2384 dprintf(("%s0x%x, addr=0x%x,"
2385 "size=0x%x,offset=0x%x\n",
2386 "device_write_request returned ",
2387 kr, addr, size, offset));
2388 BS_STAT(ps->ps_bs,
55e303ae 2389 ps->ps_bs->bs_pages_out_fail += atop_32(size));
1c79356b
A
2390 /* do the completion notification to free resources */
2391 device_write_reply(reply_port, kr, 0);
2392 return PAGER_ERROR;
2393 }
2394 } else do {
2395 /*
2396 * Synchronous write.
2397 */
2398 {
2399 device_t device;
2400 device = dev_port_lookup(ps->ps_device);
2401 kr=ds_device_write_common(device,
2402 IP_NULL, 0,
2403 (dev_mode_t) 0,
2404 dev_offset,
2405 (io_buf_ptr_t) addr,
2406 size,
2407 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2408 &bytes_written);
2409 }
2410 if (kr != KERN_SUCCESS) {
2411 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2412 "device_write returned ",
2413 kr, addr, size, offset));
2414 BS_STAT(ps->ps_bs,
55e303ae 2415 ps->ps_bs->bs_pages_out_fail += atop_32(size));
1c79356b
A
2416 return PAGER_ERROR;
2417 }
2418 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2419 Panic("fragmented write");
2420 records_written = (bytes_written >>
2421 (vm_page_shift - ps->ps_record_shift));
2422 dev_offset += records_written;
2423#if 1
2424 if (bytes_written != bytes_to_write) {
2425 dprintf(("wrote only %d bytes out of %d\n",
2426 bytes_written, bytes_to_write));
2427 }
2428#endif
2429 bytes_to_write -= bytes_written;
2430 addr += bytes_written;
2431 } while (bytes_to_write > 0);
2432
2433 return PAGER_SUCCESS;
2434}
2435
2436
2437#else /* !DEVICE_PAGING */
2438
2439kern_return_t
2440ps_read_device(
2441 paging_segment_t ps,
2442 vm_offset_t offset,
2443 vm_offset_t *bufferp,
2444 unsigned int size,
2445 unsigned int *residualp,
2446 int flags)
2447{
2448 panic("ps_read_device not supported");
2449}
2450
2451ps_write_device(
2452 paging_segment_t ps,
2453 vm_offset_t offset,
2454 vm_offset_t addr,
2455 unsigned int size,
2456 struct vs_async *vsa)
2457{
2458 panic("ps_write_device not supported");
2459}
2460
2461#endif /* DEVICE_PAGING */
2462void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
2463
2464void
2465pvs_object_data_provided(
2466 vstruct_t vs,
2467 upl_t upl,
2468 vm_offset_t offset,
2469 vm_size_t size)
2470{
1c79356b
A
2471
2472 DEBUG(DEBUG_VS_INTERNAL,
2473 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2474 upl, offset, size));
2475
2476 ASSERT(size > 0);
55e303ae 2477 GSTAT(global_stats.gs_pages_in += atop_32(size));
1c79356b
A
2478
2479
2480#if USE_PRECIOUS
2481 ps_clunmap(vs, offset, size);
2482#endif /* USE_PRECIOUS */
2483
2484}
2485
2486kern_return_t
2487pvs_cluster_read(
2488 vstruct_t vs,
0b4e3aa0 2489 vm_offset_t vs_offset,
1c79356b
A
2490 vm_size_t cnt)
2491{
1c79356b
A
2492 upl_t upl;
2493 kern_return_t error = KERN_SUCCESS;
0b4e3aa0 2494 int size;
1c79356b
A
2495 unsigned int residual;
2496 unsigned int request_flags;
0b4e3aa0
A
2497 int seg_index;
2498 int pages_in_cl;
2499 int cl_size;
2500 int cl_mask;
2501 int cl_index;
2502 int xfer_size;
2503 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2504 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2505 struct clmap clmap;
2506
2507 pages_in_cl = 1 << vs->vs_clshift;
2508 cl_size = pages_in_cl * vm_page_size;
2509 cl_mask = cl_size - 1;
1c79356b
A
2510
2511 /*
0b4e3aa0
A
2512 * This loop will be executed multiple times until the entire
2513 * request has been satisfied... if the request spans cluster
2514 * boundaries, the clusters will be checked for logical continuity;
2515 * if contiguous, the I/O request will span multiple clusters, otherwise
2516 * it will be broken up into the minimal set of I/Os
1c79356b 2517 *
0b4e3aa0
A
2518 * If there are holes in a request (either unallocated pages in a paging
2519 * segment or an unallocated paging segment), we stop
1c79356b
A
2520 * reading at the hole, inform the VM of any data read, inform
2521 * the VM of an unavailable range, then loop again, hoping to
0b4e3aa0 2522 * find valid pages later in the requested range. This continues until
1c79356b
A
2523 * the entire range has been examined, and read, if present.
2524 */
2525
2526#if USE_PRECIOUS
9bccf70c 2527 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
1c79356b 2528#else
9bccf70c 2529 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
1c79356b
A
2530#endif
2531 while (cnt && (error == KERN_SUCCESS)) {
0b4e3aa0
A
2532 int ps_info_valid;
2533 int page_list_count;
1c79356b 2534
d12e1678
A
2535 if((vs_offset & cl_mask) &&
2536 (cnt > (VM_SUPER_CLUSTER -
2537 (vs_offset & cl_mask)))) {
2538 size = VM_SUPER_CLUSTER;
2539 size -= vs_offset & cl_mask;
2540 } else if (cnt > VM_SUPER_CLUSTER) {
0b4e3aa0 2541 size = VM_SUPER_CLUSTER;
d12e1678 2542 } else {
0b4e3aa0 2543 size = cnt;
d12e1678 2544 }
0b4e3aa0 2545 cnt -= size;
1c79356b 2546
0b4e3aa0
A
2547 ps_info_valid = 0;
2548 seg_index = 0;
1c79356b 2549
0b4e3aa0
A
2550 while (size > 0 && error == KERN_SUCCESS) {
2551 int abort_size;
2552 int failed_size;
2553 int beg_pseg;
2554 int beg_indx;
2555 vm_offset_t cur_offset;
1c79356b 2556
0b4e3aa0
A
2557
2558 if ( !ps_info_valid) {
2559 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2560 psp[seg_index] = CLMAP_PS(clmap);
2561 ps_info_valid = 1;
1c79356b 2562 }
0b4e3aa0
A
2563 /*
2564 * skip over unallocated physical segments
2565 */
2566 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2567 abort_size = cl_size - (vs_offset & cl_mask);
2568 abort_size = MIN(abort_size, size);
2569
2570 page_list_count = 0;
2571 memory_object_super_upl_request(
2572 vs->vs_control,
2573 (memory_object_offset_t)vs_offset,
2574 abort_size, abort_size,
2575 &upl, NULL, &page_list_count,
2576 request_flags);
1c79356b 2577
0b4e3aa0
A
2578 if (clmap.cl_error) {
2579 upl_abort(upl, UPL_ABORT_ERROR);
2580 } else {
2581 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2582 }
2583 upl_deallocate(upl);
1c79356b 2584
0b4e3aa0
A
2585 size -= abort_size;
2586 vs_offset += abort_size;
1c79356b 2587
0b4e3aa0
A
2588 seg_index++;
2589 ps_info_valid = 0;
2590 continue;
1c79356b 2591 }
0b4e3aa0
A
2592 cl_index = (vs_offset & cl_mask) / vm_page_size;
2593
2594 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2595 /*
2596 * skip over unallocated pages
2597 */
2598 if (CLMAP_ISSET(clmap, cl_index))
2599 break;
2600 abort_size += vm_page_size;
2601 }
2602 if (abort_size) {
2603 /*
2604 * Let VM system know about holes in clusters.
2605 */
55e303ae 2606 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
0b4e3aa0
A
2607
2608 page_list_count = 0;
2609 memory_object_super_upl_request(
2610 vs->vs_control,
2611 (memory_object_offset_t)vs_offset,
2612 abort_size, abort_size,
2613 &upl, NULL, &page_list_count,
1c79356b 2614 request_flags);
1c79356b 2615
0b4e3aa0
A
2616 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2617 upl_deallocate(upl);
1c79356b 2618
0b4e3aa0
A
2619 size -= abort_size;
2620 vs_offset += abort_size;
2621
2622 if (cl_index == pages_in_cl) {
2623 /*
2624 * if we're at the end of this physical cluster
2625 * then bump to the next one and continue looking
2626 */
2627 seg_index++;
2628 ps_info_valid = 0;
2629 continue;
2630 }
2631 if (size == 0)
2632 break;
2633 }
1c79356b 2634 /*
0b4e3aa0
A
2635 * remember the starting point of the first allocated page
2636 * for the I/O we're about to issue
1c79356b 2637 */
0b4e3aa0
A
2638 beg_pseg = seg_index;
2639 beg_indx = cl_index;
2640 cur_offset = vs_offset;
2641
2642 /*
2643 * calculate the size of the I/O that we can do...
2644 * this may span multiple physical segments if
2645 * they are contiguous
2646 */
2647 for (xfer_size = 0; xfer_size < size; ) {
2648
d12e1678
A
2649 while (cl_index < pages_in_cl
2650 && xfer_size < size) {
0b4e3aa0 2651 /*
55e303ae 2652 * accumulate allocated pages within
d12e1678 2653 * a physical segment
1c79356b 2654 */
0b4e3aa0
A
2655 if (CLMAP_ISSET(clmap, cl_index)) {
2656 xfer_size += vm_page_size;
2657 cur_offset += vm_page_size;
2658 cl_index++;
2659
2660 BS_STAT(psp[seg_index]->ps_bs,
2661 psp[seg_index]->ps_bs->bs_pages_in++);
2662 } else
2663 break;
2664 }
d12e1678
A
2665 if (cl_index < pages_in_cl
2666 || xfer_size >= size) {
0b4e3aa0 2667 /*
55e303ae 2668 * we've hit an unallocated page or
d12e1678
A
2669 * the end of this request... go fire
2670 * the I/O
1c79356b 2671 */
0b4e3aa0
A
2672 break;
2673 }
2674 /*
d12e1678 2675 * we've hit the end of the current physical
55e303ae 2676 * segment and there's more to do, so try
d12e1678 2677 * moving to the next one
0b4e3aa0
A
2678 */
2679 seg_index++;
2680
d12e1678 2681 ps_offset[seg_index] =
55e303ae
A
2682 ps_clmap(vs,
2683 cur_offset & ~cl_mask,
d12e1678
A
2684 &clmap, CL_FIND, 0, 0);
2685 psp[seg_index] = CLMAP_PS(clmap);
0b4e3aa0
A
2686 ps_info_valid = 1;
2687
2688 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2689 /*
55e303ae
A
2690 * if the physical segment we're about
2691 * to step into is not contiguous to
2692 * the one we're currently in, or it's
d12e1678 2693 * in a different paging file, or
0b4e3aa0
A
2694 * it hasn't been allocated....
2695 * we stop here and generate the I/O
2696 */
2697 break;
1c79356b 2698 }
0b4e3aa0 2699 /*
d12e1678 2700 * start with first page of the next physical
55e303ae 2701 * segment
0b4e3aa0
A
2702 */
2703 cl_index = 0;
1c79356b 2704 }
0b4e3aa0
A
2705 if (xfer_size) {
2706 /*
2707 * we have a contiguous range of allocated pages
2708 * to read from
2709 */
2710 page_list_count = 0;
2711 memory_object_super_upl_request(vs->vs_control,
d12e1678
A
2712 (memory_object_offset_t)vs_offset,
2713 xfer_size, xfer_size,
2714 &upl, NULL, &page_list_count,
2715 request_flags | UPL_SET_INTERNAL);
0b4e3aa0 2716
55e303ae 2717 error = ps_read_file(psp[beg_pseg],
d12e1678 2718 upl, (vm_offset_t) 0,
55e303ae
A
2719 ps_offset[beg_pseg] +
2720 (beg_indx * vm_page_size),
d12e1678 2721 xfer_size, &residual, 0);
0b4e3aa0
A
2722 } else
2723 continue;
1c79356b 2724
0b4e3aa0
A
2725 failed_size = 0;
2726
2727 /*
55e303ae 2728 * Adjust counts and send response to VM. Optimize
d12e1678 2729 * for the common case, i.e. no error and/or partial
55e303ae 2730 * data. If there was an error, then we need to error
d12e1678 2731 * the entire range, even if some data was successfully
55e303ae 2732 * read. If there was a partial read we may supply some
0b4e3aa0 2733 * data and may error some as well. In all cases the
55e303ae
A
2734 * VM must receive some notification for every page
2735 * in the range.
0b4e3aa0
A
2736 */
2737 if ((error == KERN_SUCCESS) && (residual == 0)) {
2738 /*
d12e1678 2739 * Got everything we asked for, supply the data
55e303ae
A
2740 * to the VM. Note that as a side effect of
2741 * supplying the data, the buffer holding the
2742 * supplied data is deallocated from the pager's
2743 * address space.
0b4e3aa0 2744 */
d12e1678
A
2745 pvs_object_data_provided(
2746 vs, upl, vs_offset, xfer_size);
0b4e3aa0
A
2747 } else {
2748 failed_size = xfer_size;
2749
2750 if (error == KERN_SUCCESS) {
2751 if (residual == xfer_size) {
d12e1678
A
2752 /*
2753 * If a read operation returns no error
2754 * and no data moved, we turn it into
2755 * an error, assuming we're reading at
2756 * or beyond EOF.
2757 * Fall through and error the entire
2758 * range.
2759 */
0b4e3aa0
A
2760 error = KERN_FAILURE;
2761 } else {
d12e1678
A
2762 /*
2763 * Otherwise, we have partial read. If
2764 * the part read is a integral number
2765 * of pages supply it. Otherwise round
2766 * it up to a page boundary, zero fill
2767 * the unread part, and supply it.
2768 * Fall through and error the remainder
2769 * of the range, if any.
2770 */
0b4e3aa0
A
2771 int fill, lsize;
2772
d12e1678
A
2773 fill = residual
2774 & ~vm_page_size;
55e303ae 2775 lsize = (xfer_size - residual)
d12e1678
A
2776 + fill;
2777 pvs_object_data_provided(
55e303ae 2778 vs, upl,
d12e1678 2779 vs_offset, lsize);
0b4e3aa0
A
2780
2781 if (lsize < xfer_size) {
d12e1678
A
2782 failed_size =
2783 xfer_size - lsize;
0b4e3aa0
A
2784 error = KERN_FAILURE;
2785 }
2786 }
2787 }
2788 }
1c79356b
A
2789 /*
2790 * If there was an error in any part of the range, tell
d12e1678 2791 * the VM. Note that error is explicitly checked again
55e303ae 2792 * since it can be modified above.
1c79356b
A
2793 */
2794 if (error != KERN_SUCCESS) {
0b4e3aa0 2795 BS_STAT(psp[beg_pseg]->ps_bs,
d12e1678 2796 psp[beg_pseg]->ps_bs->bs_pages_in_fail
55e303ae 2797 += atop_32(failed_size));
1c79356b 2798 }
0b4e3aa0
A
2799 size -= xfer_size;
2800 vs_offset += xfer_size;
1c79356b 2801 }
1c79356b
A
2802
2803 } /* END while (cnt && (error == 0)) */
2804 return error;
2805}
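/*
 * Illustrative sketch, not part of the original file: the contiguity
 * test pvs_cluster_read (and vs_cluster_write below) uses to decide
 * whether two adjacent logical clusters can be covered by a single I/O.
 * They merge only if both live in the same paging segment and their
 * on-segment offsets differ by exactly one cluster's worth of bytes.
 * example_clusters_contiguous is a hypothetical name.
 */
static int
example_clusters_contiguous(
	void *prev_ps, unsigned int prev_ps_offset,
	void *next_ps, unsigned int next_ps_offset,
	unsigned int cl_size)
{
	return (prev_ps == next_ps) &&
	       (next_ps_offset == prev_ps_offset + cl_size);
}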
2806
2807int vs_do_async_write = 1;
2808
2809kern_return_t
2810vs_cluster_write(
2811 vstruct_t vs,
2812 upl_t internal_upl,
2813 vm_offset_t offset,
2814 vm_size_t cnt,
2815 boolean_t dp_internal,
2816 int flags)
2817{
1c79356b
A
2818 vm_offset_t size;
2819 vm_offset_t transfer_size;
1c79356b
A
2820 int error = 0;
2821 struct clmap clmap;
0b4e3aa0
A
2822
2823 vm_offset_t actual_offset; /* Offset within paging segment */
1c79356b 2824 paging_segment_t ps;
0b4e3aa0
A
2825 vm_offset_t subx_size;
2826 vm_offset_t mobj_base_addr;
2827 vm_offset_t mobj_target_addr;
2828 int mobj_size;
2829
1c79356b
A
2830 struct vs_async *vsa;
2831 vm_map_copy_t copy;
1c79356b
A
2832
2833 upl_t upl;
0b4e3aa0 2834 upl_page_info_t *pl;
1c79356b
A
2835 int page_index;
2836 int list_size;
55e303ae 2837 int pages_in_cl;
1c79356b 2838 int cl_size;
55e303ae
A
2839 int base_index;
2840 int seg_size;
2841
2842 pages_in_cl = 1 << vs->vs_clshift;
2843 cl_size = pages_in_cl * vm_page_size;
1c79356b 2844
1c79356b 2845 if (!dp_internal) {
0b4e3aa0 2846 int page_list_count;
1c79356b
A
2847 int request_flags;
2848 int super_size;
0b4e3aa0
A
2849 int first_dirty;
2850 int num_dirty;
2851 int num_of_pages;
2852 int seg_index;
1c79356b 2853 vm_offset_t upl_offset;
0b4e3aa0 2854 vm_offset_t seg_offset;
55e303ae
A
2855 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2856 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
0b4e3aa0 2857
1c79356b 2858
1c79356b
A
2859 if (bs_low) {
2860 super_size = cl_size;
0b4e3aa0 2861
1c79356b
A
2862 request_flags = UPL_NOBLOCK |
2863 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2864 UPL_NO_SYNC | UPL_SET_INTERNAL;
2865 } else {
2866 super_size = VM_SUPER_CLUSTER;
0b4e3aa0 2867
1c79356b
A
2868 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2869 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2870 UPL_NO_SYNC | UPL_SET_INTERNAL;
2871 }
2872
0b4e3aa0
A
2873 page_list_count = 0;
2874 memory_object_super_upl_request(vs->vs_control,
2875 (memory_object_offset_t)offset,
2876 cnt, super_size,
2877 &upl, NULL, &page_list_count,
55e303ae 2878 request_flags | UPL_FOR_PAGEOUT);
1c79356b 2879
0b4e3aa0 2880 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 2881
55e303ae
A
2882 seg_size = cl_size - (upl->offset % cl_size);
2883 upl_offset = upl->offset & ~(cl_size - 1);
2884
d12e1678
A
2885 for (seg_index = 0, transfer_size = upl->size;
2886 transfer_size > 0; ) {
d12e1678 2887 ps_offset[seg_index] =
55e303ae
A
2888 ps_clmap(vs,
2889 upl_offset,
2890 &clmap, CL_ALLOC,
2891 cl_size, 0);
1c79356b 2892
0b4e3aa0
A
2893 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2894 upl_abort(upl, 0);
2895 upl_deallocate(upl);
2896
2897 return KERN_FAILURE;
1c79356b 2898
0b4e3aa0
A
2899 }
2900 psp[seg_index] = CLMAP_PS(clmap);
1c79356b 2901
55e303ae
A
2902 if (transfer_size > seg_size) {
2903 transfer_size -= seg_size;
2904 upl_offset += cl_size;
2905 seg_size = cl_size;
0b4e3aa0
A
2906 seg_index++;
2907 } else
2908 transfer_size = 0;
2909 }
55e303ae
A
2910 /*
2911 * Ignore any non-present pages at the end of the
2912 * UPL.
2913 */
2914 for (page_index = upl->size / vm_page_size; page_index > 0;)
2915 if (UPL_PAGE_PRESENT(pl, --page_index))
2916 break;
2917 num_of_pages = page_index + 1;
2918
2919 base_index = (upl->offset % cl_size) / PAGE_SIZE;
2920
2921 for (page_index = 0; page_index < num_of_pages; ) {
0b4e3aa0
A
2922 /*
2923 * skip over non-dirty pages
2924 */
2925 for ( ; page_index < num_of_pages; page_index++) {
55e303ae 2926 if (UPL_DIRTY_PAGE(pl, page_index)
d12e1678 2927 || UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
2928 /*
2929 * this is a page we need to write
55e303ae 2930 * go see if we can buddy it up with
d12e1678 2931 * others that are contiguous to it
0b4e3aa0
A
2932 */
2933 break;
2934 /*
d12e1678 2935 * if the page is not dirty, but present, we
55e303ae 2936 * need to commit it... This is an unusual
d12e1678 2937 * case since we only asked for dirty pages
0b4e3aa0
A
2938 */
2939 if (UPL_PAGE_PRESENT(pl, page_index)) {
2940 boolean_t empty = FALSE;
2941 upl_commit_range(upl,
2942 page_index * vm_page_size,
2943 vm_page_size,
2944 UPL_COMMIT_NOTIFY_EMPTY,
2945 pl,
d52fe63f 2946 page_list_count,
0b4e3aa0 2947 &empty);
55e303ae
A
2948 if (empty) {
2949 assert(page_index ==
2950 num_of_pages - 1);
0b4e3aa0 2951 upl_deallocate(upl);
55e303ae 2952 }
1c79356b 2953 }
1c79356b 2954 }
0b4e3aa0
A
2955 if (page_index == num_of_pages)
2956 /*
2957 * no more pages to look at, we're out of here
2958 */
2959 break;
1c79356b 2960
0b4e3aa0 2961 /*
55e303ae
A
2962 * gather up contiguous dirty pages... we have at
2963 * least 1, otherwise we would have bailed above;
0b4e3aa0
A
2964 * make sure that each physical segment that we step
2965 * into is contiguous to the one we're currently in
2966 * if it's not, we have to stop and write what we have
2967 */
55e303ae 2968 for (first_dirty = page_index;
d12e1678 2969 page_index < num_of_pages; ) {
55e303ae 2970 if ( !UPL_DIRTY_PAGE(pl, page_index)
d12e1678 2971 && !UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
2972 break;
2973 page_index++;
2974 /*
2975 * if we just looked at the last page in the UPL
2976 * we don't need to check for physical segment
2977 * continuity
2978 */
2979 if (page_index < num_of_pages) {
2980 int cur_seg;
2981 int nxt_seg;
2982
55e303ae
A
2983 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
2984 nxt_seg = (base_index + page_index)/pages_in_cl;
0b4e3aa0
A
2985
2986 if (cur_seg != nxt_seg) {
2987 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
55e303ae
A
2988 /*
2989 * if the segment we're about
2990 * to step into is not
2991 * contiguous to the one we're
2992 * currently in, or it's in a
d12e1678 2993 * different paging file....
55e303ae 2994 * we stop here and generate
d12e1678
A
2995 * the I/O
2996 */
0b4e3aa0 2997 break;
1c79356b 2998 }
1c79356b 2999 }
0b4e3aa0
A
3000 }
3001 num_dirty = page_index - first_dirty;
1c79356b 3002
0b4e3aa0
A
3003 if (num_dirty) {
3004 upl_offset = first_dirty * vm_page_size;
0b4e3aa0
A
3005 transfer_size = num_dirty * vm_page_size;
3006
d12e1678 3007 while (transfer_size) {
1c79356b 3008
d12e1678 3009 if ((seg_size = cl_size -
55e303ae 3010 ((upl->offset + upl_offset) % cl_size))
d12e1678
A
3011 > transfer_size)
3012 seg_size = transfer_size;
0b4e3aa0 3013
d12e1678
A
3014 ps_vs_write_complete(vs,
3015 upl->offset + upl_offset,
3016 seg_size, error);
0b4e3aa0 3017
d12e1678
A
3018 transfer_size -= seg_size;
3019 upl_offset += seg_size;
0b4e3aa0 3020 }
d12e1678
A
3021 upl_offset = first_dirty * vm_page_size;
3022 transfer_size = num_dirty * vm_page_size;
55e303ae
A
3023
3024 seg_index = (base_index + first_dirty) / pages_in_cl;
3025 seg_offset = (upl->offset + upl_offset) % cl_size;
3026
d12e1678
A
3027 error = ps_write_file(psp[seg_index],
3028 upl, upl_offset,
3029 ps_offset[seg_index]
3030 + seg_offset,
3031 transfer_size, flags);
55e303ae 3032 } else {
0b4e3aa0
A
3033 boolean_t empty = FALSE;
3034 upl_abort_range(upl,
3035 first_dirty * vm_page_size,
3036 num_dirty * vm_page_size,
3037 UPL_ABORT_NOTIFY_EMPTY,
3038 &empty);
55e303ae
A
3039 if (empty) {
3040 assert(page_index == num_of_pages);
0b4e3aa0 3041 upl_deallocate(upl);
55e303ae 3042 }
1c79356b 3043 }
1c79356b 3044 }
0b4e3aa0 3045
1c79356b
A
3046 } else {
3047 assert(cnt <= (vm_page_size << vs->vs_clshift));
3048 list_size = cnt;
3049
3050 page_index = 0;
3051 /* The caller provides a mapped_data which is derived */
3052 /* from a temporary object. The targeted pages are */
3053 /* guaranteed to be set at offset 0 in the mapped_data */
3054 /* The actual offset however must still be derived */
3055 /* from the offset in the vs in question */
3056 mobj_base_addr = offset;
3057 mobj_target_addr = mobj_base_addr;
3058
3059 for (transfer_size = list_size; transfer_size != 0;) {
3060 actual_offset = ps_clmap(vs, mobj_target_addr,
3061 &clmap, CL_ALLOC,
3062 transfer_size < cl_size ?
3063 transfer_size : cl_size, 0);
3064 if(actual_offset == (vm_offset_t) -1) {
3065 error = 1;
3066 break;
3067 }
3068 cnt = MIN(transfer_size,
3069 CLMAP_NPGS(clmap) * vm_page_size);
3070 ps = CLMAP_PS(clmap);
3071 /* Assume that the caller has given us contiguous */
3072 /* pages */
3073 if(cnt) {
d12e1678
A
3074 ps_vs_write_complete(vs, mobj_target_addr,
3075 cnt, error);
1c79356b
A
3076 error = ps_write_file(ps, internal_upl,
3077 0, actual_offset,
3078 cnt, flags);
3079 if (error)
3080 break;
55e303ae 3081 }
1c79356b
A
3082 if (error)
3083 break;
3084 actual_offset += cnt;
3085 mobj_target_addr += cnt;
3086 transfer_size -= cnt;
3087 cnt = 0;
3088
3089 if (error)
3090 break;
3091 }
3092 }
3093 if(error)
3094 return KERN_FAILURE;
3095 else
3096 return KERN_SUCCESS;
3097}
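/*
 * Illustrative sketch, not part of the original file: the dirty-run
 * gathering in vs_cluster_write, reduced to a walk over a per-page flag
 * array.  The page_dirty[] flags stand in for the UPL_DIRTY_PAGE /
 * UPL_PRECIOUS_PAGE tests; the caller writes the returned run as a
 * single transfer (subject to the segment-contiguity check).
 */
static int
example_gather_dirty_run(
	const unsigned char *page_dirty,	/* one flag per page in the UPL */
	int num_pages,
	int start)				/* index of the first dirty page */
{
	int idx = start;

	while (idx < num_pages && page_dirty[idx])
		idx++;
	return idx - start;			/* length of the contiguous dirty run */
}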
3098
3099vm_size_t
3100ps_vstruct_allocated_size(
3101 vstruct_t vs)
3102{
3103 int num_pages;
3104 struct vs_map *vsmap;
3105 int i, j, k;
3106
3107 num_pages = 0;
3108 if (vs->vs_indirect) {
3109 /* loop on indirect maps */
3110 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3111 vsmap = vs->vs_imap[i];
3112 if (vsmap == NULL)
3113 continue;
3114 /* loop on clusters in this indirect map */
3115 for (j = 0; j < CLMAP_ENTRIES; j++) {
3116 if (VSM_ISCLR(vsmap[j]) ||
3117 VSM_ISERR(vsmap[j]))
3118 continue;
3119 /* loop on pages in this cluster */
3120 for (k = 0; k < VSCLSIZE(vs); k++) {
3121 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3122 num_pages++;
3123 }
3124 }
3125 }
3126 } else {
3127 vsmap = vs->vs_dmap;
3128 if (vsmap == NULL)
3129 return 0;
3130 /* loop on clusters in the direct map */
3131 for (j = 0; j < CLMAP_ENTRIES; j++) {
3132 if (VSM_ISCLR(vsmap[j]) ||
3133 VSM_ISERR(vsmap[j]))
3134 continue;
3135 /* loop on pages in this cluster */
3136 for (k = 0; k < VSCLSIZE(vs); k++) {
3137 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3138 num_pages++;
3139 }
3140 }
3141 }
3142
55e303ae 3143 return ptoa_32(num_pages);
1c79356b
A
3144}
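/*
 * Illustrative sketch, not part of the original file: the per-cluster
 * page count above is simply a popcount of the cluster's page bitmap.
 * example_count_cluster_pages is a hypothetical helper; the real code
 * tests (VSM_BMAP(vsmap[j]) & (1 << k)) for k in [0, VSCLSIZE(vs)).
 */
static int
example_count_cluster_pages(unsigned int bmap, int pages_per_cluster)
{
	int k, count = 0;

	for (k = 0; k < pages_per_cluster; k++)
		if (bmap & (1U << k))
			count++;
	return count;
}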
3145
3146size_t
3147ps_vstruct_allocated_pages(
3148 vstruct_t vs,
3149 default_pager_page_t *pages,
3150 size_t pages_size)
3151{
3152 int num_pages;
3153 struct vs_map *vsmap;
3154 vm_offset_t offset;
3155 int i, j, k;
3156
3157 num_pages = 0;
3158 offset = 0;
3159 if (vs->vs_indirect) {
3160 /* loop on indirect maps */
3161 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3162 vsmap = vs->vs_imap[i];
3163 if (vsmap == NULL) {
3164 offset += (vm_page_size * CLMAP_ENTRIES *
3165 VSCLSIZE(vs));
3166 continue;
3167 }
3168 /* loop on clusters in this indirect map */
3169 for (j = 0; j < CLMAP_ENTRIES; j++) {
3170 if (VSM_ISCLR(vsmap[j]) ||
3171 VSM_ISERR(vsmap[j])) {
3172 offset += vm_page_size * VSCLSIZE(vs);
3173 continue;
3174 }
3175 /* loop on pages in this cluster */
3176 for (k = 0; k < VSCLSIZE(vs); k++) {
3177 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3178 num_pages++;
3179 if (num_pages < pages_size)
3180 pages++->dpp_offset =
3181 offset;
3182 }
3183 offset += vm_page_size;
3184 }
3185 }
3186 }
3187 } else {
3188 vsmap = vs->vs_dmap;
3189 if (vsmap == NULL)
3190 return 0;
3191 /* loop on clusters in the direct map */
3192 for (j = 0; j < CLMAP_ENTRIES; j++) {
3193 if (VSM_ISCLR(vsmap[j]) ||
3194 VSM_ISERR(vsmap[j])) {
3195 offset += vm_page_size * VSCLSIZE(vs);
3196 continue;
3197 }
3198 /* loop on pages in this cluster */
3199 for (k = 0; k < VSCLSIZE(vs); k++) {
3200 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3201 num_pages++;
3202 if (num_pages < pages_size)
3203 pages++->dpp_offset = offset;
3204 }
3205 offset += vm_page_size;
3206 }
3207 }
3208 }
3209
3210 return num_pages;
3211}
3212
3213
3214kern_return_t
3215ps_vstruct_transfer_from_segment(
3216 vstruct_t vs,
3217 paging_segment_t segment,
1c79356b 3218 upl_t upl)
1c79356b
A
3219{
3220 struct vs_map *vsmap;
3221 struct vs_map old_vsmap;
3222 struct vs_map new_vsmap;
3223 int i, j, k;
3224
3225 VS_LOCK(vs); /* block all work on this vstruct */
3226 /* can't allow the normal multiple write */
3227 /* semantic because writes may conflict */
3228 vs->vs_xfer_pending = TRUE;
3229 vs_wait_for_sync_writers(vs);
3230 vs_start_write(vs);
3231 vs_wait_for_readers(vs);
3232 /* we will unlock the vs to allow other writes while transferring */
3233 /* and will be guaranteed of the persistence of the vs struct */
3234 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3235 /* vs_async_pending */
3236 /* OK we now have guaranteed no other parties are accessing this */
3237 /* vs. Now that we are also supporting simple lock versions of */
3238 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3239 /* our purpose in holding it before was the multiple write case */
3240 /* we now use the boolean xfer_pending to do that. We can use */
3241 /* a boolean instead of a count because we have guaranteed single */
3242 /* file access to this code in its caller */
3243 VS_UNLOCK(vs);
3244vs_changed:
3245 if (vs->vs_indirect) {
3246 int vsmap_size;
3247 int clmap_off;
3248 /* loop on indirect maps */
3249 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3250 vsmap = vs->vs_imap[i];
3251 if (vsmap == NULL)
3252 continue;
3253 /* loop on clusters in this indirect map */
3254 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3255 VSCLSIZE(vs) * i);
3256 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3257 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3258 else
3259 vsmap_size = CLMAP_ENTRIES;
3260 for (j = 0; j < vsmap_size; j++) {
3261 if (VSM_ISCLR(vsmap[j]) ||
3262 VSM_ISERR(vsmap[j]) ||
3263 (VSM_PS(vsmap[j]) != segment))
3264 continue;
3265 if(vs_cluster_transfer(vs,
3266 (vm_page_size * (j << vs->vs_clshift))
3267 + clmap_off,
3268 vm_page_size << vs->vs_clshift,
1c79356b 3269 upl)
1c79356b
A
3270 != KERN_SUCCESS) {
3271 VS_LOCK(vs);
3272 vs->vs_xfer_pending = FALSE;
3273 VS_UNLOCK(vs);
3274 vs_finish_write(vs);
3275 return KERN_FAILURE;
3276 }
3277 /* allow other readers/writers during transfer*/
3278 VS_LOCK(vs);
3279 vs->vs_xfer_pending = FALSE;
3280 VS_UNLOCK(vs);
3281 vs_finish_write(vs);
3282 VS_LOCK(vs);
3283 vs->vs_xfer_pending = TRUE;
1c79356b
A
3284 vs_wait_for_sync_writers(vs);
3285 vs_start_write(vs);
3286 vs_wait_for_readers(vs);
0b4e3aa0 3287 VS_UNLOCK(vs);
1c79356b
A
3288 if (!(vs->vs_indirect)) {
3289 goto vs_changed;
3290 }
3291 }
3292 }
3293 } else {
3294 vsmap = vs->vs_dmap;
3295 if (vsmap == NULL) {
3296 VS_LOCK(vs);
3297 vs->vs_xfer_pending = FALSE;
3298 VS_UNLOCK(vs);
3299 vs_finish_write(vs);
3300 return KERN_SUCCESS;
3301 }
3302 /* loop on clusters in the direct map */
3303 for (j = 0; j < vs->vs_size; j++) {
3304 if (VSM_ISCLR(vsmap[j]) ||
3305 VSM_ISERR(vsmap[j]) ||
3306 (VSM_PS(vsmap[j]) != segment))
3307 continue;
3308 if(vs_cluster_transfer(vs,
3309 vm_page_size * (j << vs->vs_clshift),
3310 vm_page_size << vs->vs_clshift,
1c79356b 3311 upl) != KERN_SUCCESS) {
1c79356b
A
3312 VS_LOCK(vs);
3313 vs->vs_xfer_pending = FALSE;
3314 VS_UNLOCK(vs);
3315 vs_finish_write(vs);
3316 return KERN_FAILURE;
3317 }
3318 /* allow other readers/writers during transfer*/
3319 VS_LOCK(vs);
3320 vs->vs_xfer_pending = FALSE;
3321 VS_UNLOCK(vs);
3322 vs_finish_write(vs);
3323 VS_LOCK(vs);
3324 vs->vs_xfer_pending = TRUE;
3325 VS_UNLOCK(vs);
3326 vs_wait_for_sync_writers(vs);
3327 vs_start_write(vs);
3328 vs_wait_for_readers(vs);
3329 if (vs->vs_indirect) {
3330 goto vs_changed;
3331 }
3332 }
3333 }
3334
3335 VS_LOCK(vs);
3336 vs->vs_xfer_pending = FALSE;
3337 VS_UNLOCK(vs);
3338 vs_finish_write(vs);
3339 return KERN_SUCCESS;
3340}
3341
3342
3343
3344vs_map_t
3345vs_get_map_entry(
3346 vstruct_t vs,
3347 vm_offset_t offset)
3348{
3349 struct vs_map *vsmap;
3350 vm_offset_t cluster;
3351
55e303ae 3352 cluster = atop_32(offset) >> vs->vs_clshift;
1c79356b
A
3353 if (vs->vs_indirect) {
3354 long ind_block = cluster/CLMAP_ENTRIES;
3355
3356 /* Is the indirect block allocated? */
3357 vsmap = vs->vs_imap[ind_block];
3358 if(vsmap == (vs_map_t) NULL)
3359 return vsmap;
3360 } else
3361 vsmap = vs->vs_dmap;
3362 vsmap += cluster%CLMAP_ENTRIES;
3363 return vsmap;
3364}
3365
3366kern_return_t
3367vs_cluster_transfer(
3368 vstruct_t vs,
3369 vm_offset_t offset,
3370 vm_size_t cnt,
1c79356b 3371 upl_t upl)
1c79356b
A
3372{
3373 vm_offset_t actual_offset;
3374 paging_segment_t ps;
3375 struct clmap clmap;
3376 kern_return_t error = KERN_SUCCESS;
3377 int size, size_wanted, i;
3378 unsigned int residual;
3379 int unavail_size;
3380 default_pager_thread_t *dpt;
3381 boolean_t dealloc;
3382 struct vs_map *vsmap_ptr;
3383 struct vs_map read_vsmap;
3384 struct vs_map original_read_vsmap;
3385 struct vs_map write_vsmap;
3386 upl_t sync_upl;
1c79356b
A
3387 vm_offset_t ioaddr;
3388
1c79356b
A
3389 /* vs_cluster_transfer reads in the pages of a cluster and
3390 * then writes these pages back to new backing store. The
3391 * segment the pages are being read from is assumed to have
3392 * been taken off-line and is no longer considered for new
3393 * space requests.
3394 */
3395
3396 /*
3397 * This loop will be executed once per cluster referenced.
3398 * Typically this means once, since it's unlikely that the
3399 * VM system will ask for anything spanning cluster boundaries.
3400 *
3401 * If there are holes in a cluster (in a paging segment), we stop
3402 * reading at the hole, then loop again, hoping to
3403 * find valid pages later in the cluster. This continues until
3404 * the entire range has been examined, and read, if present. The
3405 * pages are written as they are read. If a failure occurs after
3406 * some pages are written the unmap call at the bottom of the loop
3407 * recovers the backing store and the old backing store remains
3408 * in effect.
3409 */
3410
1c79356b
A
3411 VSM_CLR(write_vsmap);
3412 VSM_CLR(original_read_vsmap);
3413 /* grab the actual object's pages to sync with I/O */
3414 while (cnt && (error == KERN_SUCCESS)) {
3415 vsmap_ptr = vs_get_map_entry(vs, offset);
3416 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3417
3418 if (actual_offset == (vm_offset_t) -1) {
3419
3420 /*
3421 * Nothing left to write in this cluster; at least
3422 * set the write cluster information for any previous
3423 * write, and clear it for the next cluster, if there is one.
3424 */
3425 unsigned int local_size, clmask, clsize;
3426
3427 clsize = vm_page_size << vs->vs_clshift;
3428 clmask = clsize - 1;
3429 local_size = clsize - (offset & clmask);
3430 ASSERT(local_size);
3431 local_size = MIN(local_size, cnt);
3432
3433 /* This cluster has no data in it beyond what may */
3434 /* have been found on a previous iteration through */
3435 /* the loop "write_vsmap" */
3436 *vsmap_ptr = write_vsmap;
3437 VSM_CLR(write_vsmap);
3438 VSM_CLR(original_read_vsmap);
3439
3440 cnt -= local_size;
3441 offset += local_size;
3442 continue;
3443 }
3444
3445 /*
3446 * Count up contiguous available or unavailable
3447 * pages.
3448 */
3449 ps = CLMAP_PS(clmap);
3450 ASSERT(ps);
3451 size = 0;
3452 unavail_size = 0;
3453 for (i = 0;
3454 (size < cnt) && (unavail_size < cnt) &&
3455 (i < CLMAP_NPGS(clmap)); i++) {
3456 if (CLMAP_ISSET(clmap, i)) {
3457 if (unavail_size != 0)
3458 break;
3459 size += vm_page_size;
3460 BS_STAT(ps->ps_bs,
3461 ps->ps_bs->bs_pages_in++);
3462 } else {
3463 if (size != 0)
3464 break;
3465 unavail_size += vm_page_size;
3466 }
3467 }
3468
3469 if (size == 0) {
3470 ASSERT(unavail_size);
3471 cnt -= unavail_size;
3472 offset += unavail_size;
3473 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3474 == 0) {
3475 /* There is no more to transfer in this
3476 cluster
3477 */
3478 *vsmap_ptr = write_vsmap;
3479 VSM_CLR(write_vsmap);
3480 VSM_CLR(original_read_vsmap);
3481 }
3482 continue;
3483 }
3484
3485 if(VSM_ISCLR(original_read_vsmap))
3486 original_read_vsmap = *vsmap_ptr;
3487
3488 if(ps->ps_segtype == PS_PARTITION) {
3489/*
9bccf70c 3490 NEED TO ISSUE WITH SYNC & NO COMMIT
1c79356b
A
3491 error = ps_read_device(ps, actual_offset, &buffer,
3492 size, &residual, flags);
3493*/
3494 } else {
9bccf70c 3495 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
0b4e3aa0 3496 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
1c79356b 3497 size, &residual,
9bccf70c 3498 (UPL_IOSYNC | UPL_NOCOMMIT));
1c79356b
A
3499 }
3500
3501 read_vsmap = *vsmap_ptr;
3502
3503
3504 /*
3505 * Adjust counts and put data in new BS. Optimize for the
3506 * common case, i.e. no error and/or partial data.
3507 * If there was an error, then we need to error the entire
3508 * range, even if some data was successfully read.
3509 *
3510 */
3511 if ((error == KERN_SUCCESS) && (residual == 0)) {
0b4e3aa0
A
3512 int page_list_count = 0;
3513
1c79356b
A
3514 /*
3515 * Got everything we asked for, supply the data to
3516 * the new BS. Note that as a side effect of supplying
3517 * the data, the buffer holding the supplied data is
3518 * deallocated from the pager's address space unless
3519 * the write is unsuccessful.
3520 */
3521
3522 /* note: the buffer will be cleaned up in all cases, either by */
3523 /* internal_cluster_write or, if an error occurs on the write, */
3524 /* by the vm_map_copy_page_discard call */
3525 *vsmap_ptr = write_vsmap;
3526
1c79356b
A
3527 if(vs_cluster_write(vs, upl, offset,
3528 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
1c79356b
A
3529 error = KERN_FAILURE;
3530 if(!(VSM_ISCLR(*vsmap_ptr))) {
3531 /* unmap the new backing store object */
3532 ps_clunmap(vs, offset, size);
3533 }
3534 /* original vsmap */
3535 *vsmap_ptr = original_read_vsmap;
3536 VSM_CLR(write_vsmap);
3537 } else {
3538 if((offset + size) &
3539 ((vm_page_size << vs->vs_clshift)
3540 - 1)) {
3541 /* There is more to transfer in this
3542 cluster
3543 */
3544 write_vsmap = *vsmap_ptr;
3545 *vsmap_ptr = read_vsmap;
3546 } else {
3547 /* discard the old backing object */
3548 write_vsmap = *vsmap_ptr;
3549 *vsmap_ptr = read_vsmap;
3550 ps_clunmap(vs, offset, size);
3551 *vsmap_ptr = write_vsmap;
3552 VSM_CLR(write_vsmap);
3553 VSM_CLR(original_read_vsmap);
3554 }
3555 }
3556 } else {
3557 size_wanted = size;
3558 if (error == KERN_SUCCESS) {
3559 if (residual == size) {
3560 /*
3561 * If a read operation returns no error
3562 * and no data moved, we turn it into
3563 * an error, assuming we're reading at
3564 * or beyond EOF.
3565 * Fall through and error the entire
3566 * range.
3567 */
3568 error = KERN_FAILURE;
3569 *vsmap_ptr = write_vsmap;
3570 if(!(VSM_ISCLR(*vsmap_ptr))) {
3571 /* unmap the new backing store object */
3572 ps_clunmap(vs, offset, size);
3573 }
3574 *vsmap_ptr = original_read_vsmap;
3575 VSM_CLR(write_vsmap);
3576 continue;
3577 } else {
3578 /*
3579 * Otherwise, we have a partial read.
3580 * This is also considered an error
3581 * for the purposes of cluster transfer
3582 */
3583 error = KERN_FAILURE;
3584 *vsmap_ptr = write_vsmap;
3585 if(!(VSM_ISCLR(*vsmap_ptr))) {
3586 /* unmap the new backing store object */
3587 ps_clunmap(vs, offset, size);
3588 }
3589 *vsmap_ptr = original_read_vsmap;
3590 VSM_CLR(write_vsmap);
3591 continue;
3592 }
3593 }
3594
3595 }
3596 cnt -= size;
3597 offset += size;
3598
3599 } /* END while (cnt && (error == 0)) */
3600 if(!VSM_ISCLR(write_vsmap))
3601 *vsmap_ptr = write_vsmap;
3602
1c79356b
A
3603 return error;
3604}
3605
3606kern_return_t
3607default_pager_add_file(MACH_PORT_FACE backing_store,
3608 int *vp,
3609 int record_size,
3610 long size)
3611{
3612 backing_store_t bs;
3613 paging_segment_t ps;
3614 int i;
3615 int error;
1c79356b
A
3616
3617 if ((bs = backing_store_lookup(backing_store))
3618 == BACKING_STORE_NULL)
3619 return KERN_INVALID_ARGUMENT;
3620
3621 PSL_LOCK();
3622 for (i = 0; i <= paging_segment_max; i++) {
3623 ps = paging_segments[i];
3624 if (ps == PAGING_SEGMENT_NULL)
3625 continue;
3626 if (ps->ps_segtype != PS_FILE)
3627 continue;
3628
3629 /*
3630 * Check for overlap on same device.
3631 */
3632 if (ps->ps_vnode == (struct vnode *)vp) {
3633 PSL_UNLOCK();
3634 BS_UNLOCK(bs);
3635 return KERN_INVALID_ARGUMENT;
3636 }
3637 }
3638 PSL_UNLOCK();
3639
3640 /*
3641 * Set up the paging segment
3642 */
3643 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3644 if (ps == PAGING_SEGMENT_NULL) {
3645 BS_UNLOCK(bs);
3646 return KERN_RESOURCE_SHORTAGE;
3647 }
3648
3649 ps->ps_segtype = PS_FILE;
3650 ps->ps_vnode = (struct vnode *)vp;
3651 ps->ps_offset = 0;
3652 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3653 ps->ps_recnum = size;
3654 ps->ps_pgnum = size >> ps->ps_record_shift;
3655
3656 ps->ps_pgcount = ps->ps_pgnum;
3657 ps->ps_clshift = local_log2(bs->bs_clsize);
3658 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3659 ps->ps_hint = 0;
3660
3661 PS_LOCK_INIT(ps);
3662 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3663 if (!ps->ps_bmap) {
3664 kfree((vm_offset_t)ps, sizeof *ps);
3665 BS_UNLOCK(bs);
3666 return KERN_RESOURCE_SHORTAGE;
3667 }
3668 for (i = 0; i < ps->ps_ncls; i++) {
3669 clrbit(ps->ps_bmap, i);
3670 }
3671
3672 ps->ps_going_away = FALSE;
3673 ps->ps_bs = bs;
3674
3675 if ((error = ps_enter(ps)) != 0) {
3676 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3677 kfree((vm_offset_t)ps, sizeof *ps);
3678 BS_UNLOCK(bs);
3679 return KERN_RESOURCE_SHORTAGE;
3680 }
3681
3682 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3683 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3684 PSL_LOCK();
3685 dp_pages_free += ps->ps_pgcount;
3686 PSL_UNLOCK();
3687
3688 BS_UNLOCK(bs);
3689
3690 bs_more_space(ps->ps_clcount);
3691
3692 DEBUG(DEBUG_BS_INTERNAL,
3693 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3694 device, offset, size, record_size,
3695 ps->ps_record_shift, ps->ps_pgnum));
3696
3697 return KERN_SUCCESS;
3698}
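/*
 * Illustrative sketch, not part of the original file: the sizing
 * arithmetic performed by default_pager_add_file, with assumed example
 * values (4096-byte pages, 512-byte device records, 4-page clusters,
 * i.e. clshift == 2).  example_log2 and the struct are hypothetical.
 */
struct example_seg_sizes {
	int	record_shift;	/* log2(records per page) */
	long	pgnum;		/* pages backed by the file */
	long	clcount;	/* whole clusters available */
};

static int
example_log2(unsigned int n)	/* n assumed to be a power of two */
{
	int s = 0;

	while (n > 1) {
		n >>= 1;
		s++;
	}
	return s;
}

static struct example_seg_sizes
example_size_segment(
	unsigned int page_size,
	unsigned int record_size,
	long recnum,		/* number of device records in the file */
	int clshift)
{
	struct example_seg_sizes s;

	s.record_shift = example_log2(page_size / record_size);	/* 4096/512 -> 3 */
	s.pgnum = recnum >> s.record_shift;	/* records -> pages */
	s.clcount = s.pgnum >> clshift;		/* pages -> whole clusters */
	return s;
}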
3699
3700
3701
1c79356b
A
3702kern_return_t
3703ps_read_file(
3704 paging_segment_t ps,
3705 upl_t upl,
0b4e3aa0 3706 vm_offset_t upl_offset,
1c79356b
A
3707 vm_offset_t offset,
3708 unsigned int size,
3709 unsigned int *residualp,
3710 int flags)
3711{
3712 vm_object_offset_t f_offset;
3713 int error = 0;
3714 int result;
1c79356b
A
3715
3716
55e303ae 3717 clustered_reads[atop_32(size)]++;
1c79356b
A
3718
3719 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3720
3721 /* for transfer case we need to pass uploffset and flags */
3722 error = vnode_pagein(ps->ps_vnode,
0b4e3aa0 3723 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
1c79356b
A
3724
3725 /* The vnode_pagein semantic is somewhat at odds with the existing */
3726 /* device_read semantic. Partial reads are not experienced at this */
3727 /* level. It is up to the bit map code and cluster read code to */
3728 /* check that requested data locations are actually backed, and the */
3729 /* pagein code to either read all of the requested data or return an */
3730 /* error. */
3731
3732 if (error)
3733 result = KERN_FAILURE;
3734 else {
3735 *residualp = 0;
3736 result = KERN_SUCCESS;
3737 }
3738 return result;
1c79356b
A
3739}
3740
3741kern_return_t
3742ps_write_file(
3743 paging_segment_t ps,
3744 upl_t upl,
3745 vm_offset_t upl_offset,
3746 vm_offset_t offset,
3747 unsigned int size,
3748 int flags)
3749{
3750 vm_object_offset_t f_offset;
3751 kern_return_t result;
1c79356b
A
3752
3753 int error = 0;
3754
55e303ae 3755 clustered_writes[atop_32(size)]++;
1c79356b
A
3756 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3757
3758 if (vnode_pageout(ps->ps_vnode,
3759 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3760 result = KERN_FAILURE;
3761 else
3762 result = KERN_SUCCESS;
3763
3764 return result;
3765}
3766
3767kern_return_t
3768default_pager_triggers(MACH_PORT_FACE default_pager,
3769 int hi_wat,
3770 int lo_wat,
3771 int flags,
3772 MACH_PORT_FACE trigger_port)
3773{
0b4e3aa0
A
3774 MACH_PORT_FACE release;
3775 kern_return_t kr;
1c79356b 3776
0b4e3aa0
A
3777 PSL_LOCK();
3778 if (flags == HI_WAT_ALERT) {
3779 release = min_pages_trigger_port;
1c79356b
A
3780 min_pages_trigger_port = trigger_port;
3781 minimum_pages_remaining = hi_wat/vm_page_size;
3782 bs_low = FALSE;
0b4e3aa0
A
3783 kr = KERN_SUCCESS;
3784 } else if (flags == LO_WAT_ALERT) {
3785 release = max_pages_trigger_port;
1c79356b
A
3786 max_pages_trigger_port = trigger_port;
3787 maximum_pages_free = lo_wat/vm_page_size;
0b4e3aa0
A
3788 kr = KERN_SUCCESS;
3789 } else {
3790 release = trigger_port;
3791 kr = KERN_INVALID_ARGUMENT;
1c79356b 3792 }
0b4e3aa0
A
3793 PSL_UNLOCK();
3794
3795 if (IP_VALID(release))
3796 ipc_port_release_send(release);
3797
3798 return kr;
1c79356b 3799}
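/*
 * Illustrative sketch, not part of the original file: the watermarks
 * passed to default_pager_triggers arrive as byte counts and are stored
 * as page counts (the hi_wat/vm_page_size and lo_wat/vm_page_size
 * divisions above).  example_bytes_to_pages is a hypothetical helper.
 */
static unsigned int
example_bytes_to_pages(unsigned int bytes, unsigned int page_size)
{
	return bytes / page_size;	/* e.g. 64 MB with 4K pages -> 16384 pages */
}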
55e303ae
A
3800
3801/*
3802 * Monitor the amount of available backing store vs. the amount of
3803 * required backing store, notify a listener (if present) when
3804 * backing store may safely be removed.
3805 *
3806 * We attempt to avoid the situation where backing store is
3807 * discarded en masse, as this can lead to thrashing as the
3808 * backing store is compacted.
3809 */
3810
3811#define PF_INTERVAL 3 /* time between free level checks */
3812#define PF_LATENCY 10 /* number of intervals before release */
3813
3814static int dp_pages_free_low_count = 0;
3815
3816void
3817default_pager_backing_store_monitor(thread_call_param_t p1, thread_call_param_t p2)
3818{
3819 unsigned long long average;
3820 ipc_port_t trigger;
3821 uint64_t deadline;
3822
3823 /*
3824 * We determine whether it will be safe to release some
3825 * backing store by watching the free page level. If
3826 * it remains below the maximum_pages_free threshold for
3827 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3828 * then we deem it safe.
3829 *
3830 * Note that this establishes a maximum rate at which backing
3831 * store will be released, as each notification (currently)
3832 * only results in a single backing store object being
3833 * released.
3834 */
3835 if (dp_pages_free > maximum_pages_free) {
3836 dp_pages_free_low_count++;
3837 } else {
3838 dp_pages_free_low_count = 0;
3839 }
3840
3841 /* decide whether to send notification */
3842 trigger = IP_NULL;
3843 if (max_pages_trigger_port &&
3844 (backing_store_release_trigger_disable == 0) &&
3845 (dp_pages_free_low_count > PF_LATENCY)) {
3846 trigger = max_pages_trigger_port;
3847 max_pages_trigger_port = NULL;
3848 }
3849
3850 /* send notification */
3851 if (trigger != IP_NULL) {
3852 VSL_LOCK();
3853 if(backing_store_release_trigger_disable != 0) {
3854 assert_wait((event_t)
3855 &backing_store_release_trigger_disable,
3856 THREAD_UNINT);
3857 VSL_UNLOCK();
3858 thread_block(THREAD_CONTINUE_NULL);
3859 } else {
3860 VSL_UNLOCK();
3861 }
3862 default_pager_space_alert(trigger, LO_WAT_ALERT);
3863 ipc_port_release_send(trigger);
3864 dp_pages_free_low_count = 0;
3865 }
3866
3867 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
3868 thread_call_func_delayed(default_pager_backing_store_monitor, NULL, deadline);
3869}
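/*
 * Illustrative sketch, not part of the original file: the PF_LATENCY
 * hysteresis above in miniature.  A notification is sent only after the
 * "safe to release" condition has held for more than 'latency'
 * consecutive checks; any unsafe sample resets the counter, and the
 * counter is reset again once a notification fires (as the real code
 * does with dp_pages_free_low_count).  example_should_notify is a
 * hypothetical name.
 */
static int
example_should_notify(int condition_safe, int *consecutive, int latency)
{
	if (condition_safe)
		(*consecutive)++;
	else
		*consecutive = 0;

	if (*consecutive > latency) {
		*consecutive = 0;
		return 1;	/* fire the notification */
	}
	return 0;
}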