/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

/*
 * Default Pager.
 *	Paging File Management.
 */

#include <mach/memory_object_control.h>
#include <mach/memory_object_server.h>
#include "default_pager_internal.h"
#include <default_pager/default_pager_alerts.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>
#include <kern/queue.h>
#include <kern/counters.h>
#include <kern/sched_prim.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
/* CDY CDY */
#include <vm/vm_map.h>

/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, then this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future
 */

#define ALLOC_STRIDE  (1024 * 1024 * 1024)
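/*
 * For illustration only (assuming 4K pages and the default cluster shift
 * of 2, i.e. 16K clusters): ps_select_segment advances to the next segment
 * in a priority class only after ALLOC_STRIDE >> (ps_clshift + vm_page_shift)
 * = 65536 cluster allocations, i.e. roughly 1GB, from the current segment.
 */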
int physical_transfer_cluster_count = 0;

#define VM_SUPER_CLUSTER	0x40000
#define VM_SUPER_PAGES		64

/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 */
#define VSTRUCT_DEF_CLSHIFT	2
int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
int default_pager_clsize = 0;

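/*
 * Worked example (assuming 4K pages): with the default clshift of 2 a
 * cluster is 1 << 2 = 4 pages, i.e. 16K of backing store, and
 * VM_SUPER_CLUSTER (0x40000 = 256K) spans VM_SUPER_PAGES (64) pages.
 */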
/* statistics */
unsigned int clustered_writes[VM_SUPER_PAGES+1];
unsigned int clustered_reads[VM_SUPER_PAGES+1];

/*
 * Globals used for asynchronous paging operations:
 *	vs_async_list:	head of list of to-be-completed I/O ops
 *	async_num_queued: number of pages completed, but not yet
 *		processed by async thread.
 *	async_requests_out: number of pages of requests not completed.
 */

#if 0
struct vs_async *vs_async_list;
int	async_num_queued;
int	async_requests_out;
#endif


#define VS_ASYNC_REUSE 1
struct vs_async *vs_async_free_list;

mutex_t default_pager_async_lock;	/* Protects globals above */


int vs_alloc_async_failed = 0;			/* statistics */
int vs_alloc_async_count = 0;			/* statistics */
struct vs_async *vs_alloc_async(void);		/* forward */
void vs_free_async(struct vs_async *vsa);	/* forward */


#define VS_ALLOC_ASYNC()	vs_alloc_async()
#define VS_FREE_ASYNC(vsa)	vs_free_async(vsa)

#define VS_ASYNC_LOCK()		mutex_lock(&default_pager_async_lock)
#define VS_ASYNC_UNLOCK()	mutex_unlock(&default_pager_async_lock)
#define VS_ASYNC_LOCK_INIT()	mutex_init(&default_pager_async_lock, \
					   ETAP_IO_DEV_PAGEH)
#define VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)
/*
 * Paging Space Hysteresis triggers and the target notification port
 *
 */

unsigned int	minimum_pages_remaining = 0;
unsigned int	maximum_pages_free = 0;
ipc_port_t	min_pages_trigger_port = NULL;
ipc_port_t	max_pages_trigger_port = NULL;

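/*
 * Summary of the trigger protocol implemented below: when dp_pages_free
 * drops under minimum_pages_remaining a HI_WAT_ALERT is sent on
 * min_pages_trigger_port, and when it climbs past maximum_pages_free a
 * LO_WAT_ALERT is sent on max_pages_trigger_port.
 */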
boolean_t	bs_low = FALSE;
int		backing_store_release_trigger_disable = 0;



/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */

/*
 * List of all backing store and segments.
 */
struct backing_store_list_head backing_store_list;
paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
mutex_t		 paging_segments_lock;
int		 paging_segment_max = 0;
int		 paging_segment_count = 0;
int		 ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };


/*
 * Total pages free in system
 * This differs from clusters committed/avail which is a measure of the
 * over-commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 */
unsigned int	dp_pages_free = 0;
unsigned int	cluster_transfer_minimum = 100;

kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int);	/* forward */
kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int);	/* forward */


default_pager_thread_t *
get_read_buffer()
{
	int	i;

	DPT_LOCK(dpt_lock);
	while(TRUE) {
		for (i=0; i<default_pager_internal_count; i++) {
			if(dpt_array[i]->checked_out == FALSE) {
				dpt_array[i]->checked_out = TRUE;
				DPT_UNLOCK(dpt_lock);
				return dpt_array[i];
			}
		}
		DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
	}
}

void
bs_initialize(void)
{
	int	i;

	/*
	 * List of all backing store.
	 */
	BSL_LOCK_INIT();
	queue_init(&backing_store_list.bsl_queue);
	PSL_LOCK_INIT();

	VS_ASYNC_LOCK_INIT();
#if	VS_ASYNC_REUSE
	vs_async_free_list = NULL;
#endif	/* VS_ASYNC_REUSE */

	for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
		clustered_writes[i] = 0;
		clustered_reads[i] = 0;
	}

}

/*
 * When things do not quite work out...
 */
void bs_no_paging_space(boolean_t);	/* forward */

void
bs_no_paging_space(
	boolean_t out_of_memory)
{

	if (out_of_memory)
		dprintf(("*** OUT OF MEMORY ***\n"));
	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
}

void bs_more_space(int);	/* forward */
void bs_commit(int);		/* forward */

boolean_t user_warned = FALSE;
unsigned int clusters_committed = 0;
unsigned int clusters_available = 0;
unsigned int clusters_committed_peak = 0;

void
bs_more_space(
	int	nclusters)
{
	BSL_LOCK();
	/*
	 * Account for new paging space.
	 */
	clusters_available += nclusters;

	if (clusters_available >= clusters_committed) {
		if (verbose && user_warned) {
			printf("%s%s - %d excess clusters now.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_available - clusters_committed);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - still short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
			clusters_committed_peak -= nclusters;
		}
	}
	BSL_UNLOCK();

	return;
}

void
bs_commit(
	int	nclusters)
{
	BSL_LOCK();
	clusters_committed += nclusters;
	if (clusters_committed > clusters_available) {
		if (verbose && !user_warned) {
			user_warned = TRUE;
			printf("%s%s - short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
		}
		if (clusters_committed > clusters_committed_peak) {
			clusters_committed_peak = clusters_committed;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - was short of up to %d clusters.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_committed_peak - clusters_available);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	}
	BSL_UNLOCK();

	return;
}

int default_pager_info_verbose = 1;

void
bs_global_info(
	vm_size_t	*totalp,
	vm_size_t	*freep)
{
	vm_size_t	pages_total, pages_free;
	paging_segment_t ps;
	int		i;

	PSL_LOCK();
	pages_total = pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;

		/*
		 * no need to lock: by the time this data
		 * gets back to any remote requestor it
		 * will be obsolete anyway
		 */
		pages_total += ps->ps_pgnum;
		pages_free += ps->ps_clcount << ps->ps_clshift;
		DEBUG(DEBUG_BS_INTERNAL,
		      ("segment #%d: %d total, %d free\n",
		       i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
	}
	*totalp = pages_total;
	*freep = pages_free;
	if (verbose && user_warned && default_pager_info_verbose) {
		if (clusters_available < clusters_committed) {
			printf("%s %d clusters committed, %d available.\n",
			       my_name,
			       clusters_committed,
			       clusters_available);
		}
	}
	PSL_UNLOCK();
}

backing_store_t backing_store_alloc(void);	/* forward */

backing_store_t
backing_store_alloc(void)
{
	backing_store_t	bs;

	bs = (backing_store_t) kalloc(sizeof (struct backing_store));
	if (bs == BACKING_STORE_NULL)
		panic("backing_store_alloc: no memory");

	BS_LOCK_INIT(bs);
	bs->bs_port = MACH_PORT_NULL;
	bs->bs_priority = 0;
	bs->bs_clsize = 0;
	bs->bs_pages_total = 0;
	bs->bs_pages_in = 0;
	bs->bs_pages_in_fail = 0;
	bs->bs_pages_out = 0;
	bs->bs_pages_out_fail = 0;

	return bs;
}

backing_store_t backing_store_lookup(MACH_PORT_FACE);	/* forward */

/* Even in both the component space and external versions of this pager, */
/* backing_store_lookup will be called from tasks in the application space */
backing_store_t
backing_store_lookup(
	MACH_PORT_FACE	port)
{
	backing_store_t	bs;

/*
	The port is currently backed with a vs structure in the alias field.
	We could create an ISBS alias and a port_is_bs call, but frankly
	I see no reason for the test; the bs->bs_port == port check below
	will work properly on junk entries.

	if ((port == MACH_PORT_NULL) || port_is_vs(port))
*/
	if ((port == MACH_PORT_NULL))
		return BACKING_STORE_NULL;

	BSL_LOCK();
	queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
		      bs_links) {
		BS_LOCK(bs);
		if (bs->bs_port == port) {
			BSL_UNLOCK();
			/* Success, return it locked. */
			return bs;
		}
		BS_UNLOCK(bs);
	}
	BSL_UNLOCK();
	return BACKING_STORE_NULL;
}

void backing_store_add(backing_store_t);	/* forward */

void
backing_store_add(
	backing_store_t	bs)
{
	MACH_PORT_FACE	port = bs->bs_port;
	MACH_PORT_FACE	pset = default_pager_default_set;
	kern_return_t	kr = KERN_SUCCESS;

	if (kr != KERN_SUCCESS)
		panic("backing_store_add: add to set");

}

/*
 * Set up default page shift, but only if not already
 * set and argument is within range.
 */
boolean_t
bs_set_default_clsize(unsigned int npages)
{
	switch(npages){
	case 1:
	case 2:
	case 4:
	case 8:
		if (default_pager_clsize == 0)	/* if not yet set */
			vstruct_def_clshift = local_log2(npages);
		return(TRUE);
	}
	return(FALSE);
}

int bs_get_global_clsize(int clsize);	/* forward */

int
bs_get_global_clsize(
	int	clsize)
{
	int			i;
	memory_object_default_t	dmm;
	kern_return_t		kr;

	/*
	 * Only allow setting of cluster size once. If called
	 * with no cluster size (default), we use the compiled-in default
	 * for the duration. The same cluster size is used for all
	 * paging segments.
	 */
	if (default_pager_clsize == 0) {
		/*
		 * Keep cluster size in bit shift because it's quicker
		 * arithmetic, and easier to keep at a power of 2.
		 */
		if (clsize != NO_CLSIZE) {
			for (i = 0; (1 << i) < clsize; i++);
			if (i > MAX_CLUSTER_SHIFT)
				i = MAX_CLUSTER_SHIFT;
			vstruct_def_clshift = i;
		}
		default_pager_clsize = (1 << vstruct_def_clshift);

		/*
		 * Let the user know the new (and definitive) cluster size.
		 */
		if (verbose)
			printf("%scluster size = %d page%s\n",
			       my_name, default_pager_clsize,
			       (default_pager_clsize == 1) ? "" : "s");

		/*
		 * Let the kernel know too, in case it hasn't used the
		 * default value provided in main() yet.
		 */
		dmm = default_pager_object;
		clsize = default_pager_clsize * vm_page_size;	/* in bytes */
		kr = host_default_memory_manager(host_priv_self(),
						 &dmm,
						 clsize);
		memory_object_default_deallocate(dmm);

		if (kr != KERN_SUCCESS) {
			panic("bs_get_global_cl_size:host_default_memory_manager");
		}
		if (dmm != default_pager_object) {
			panic("bs_get_global_cl_size:there is another default pager");
		}
	}
	ASSERT(default_pager_clsize > 0 &&
	       (default_pager_clsize & (default_pager_clsize - 1)) == 0);

	return default_pager_clsize;
}

kern_return_t
default_pager_backing_store_create(
	memory_object_default_t	pager,
	int			priority,
	int			clsize,		/* in bytes */
	MACH_PORT_FACE		*backing_store)
{
	backing_store_t	bs;
	MACH_PORT_FACE	port;
	kern_return_t	kr;
	struct vstruct_alias *alias_struct;

	if (pager != default_pager_object)
		return KERN_INVALID_ARGUMENT;

	bs = backing_store_alloc();
	port = ipc_port_alloc_kernel();
	ipc_port_make_send(port);
	assert (port != IP_NULL);

	DEBUG(DEBUG_BS_EXTERNAL,
	      ("priority=%d clsize=%d bs_port=0x%x\n",
	       priority, clsize, (int) backing_store));

	alias_struct = (struct vstruct_alias *)
		kalloc(sizeof (struct vstruct_alias));
	if(alias_struct != NULL) {
		alias_struct->vs = (struct vstruct *)bs;
		alias_struct->name = ISVS;
		port->alias = (int) alias_struct;
	}
	else {
		ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
		kfree((vm_offset_t)bs, sizeof (struct backing_store));
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_port = port;
	if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
		priority = BS_MAXPRI;
	else if (priority == BS_NOPRI)
		priority = BS_MAXPRI;
	else
		priority = BS_MINPRI;
	bs->bs_priority = priority;

	bs->bs_clsize = bs_get_global_clsize(atop(clsize));

	BSL_LOCK();
	queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
		    bs_links);
	BSL_UNLOCK();

	backing_store_add(bs);

	*backing_store = port;
	return KERN_SUCCESS;
}

kern_return_t
default_pager_backing_store_info(
	MACH_PORT_FACE		backing_store,
	backing_store_flavor_t	flavour,
	backing_store_info_t	info,
	mach_msg_type_number_t	*size)
{
	backing_store_t			bs;
	backing_store_basic_info_t	basic;
	int				i;
	paging_segment_t		ps;

	if (flavour != BACKING_STORE_BASIC_INFO ||
	    *size < BACKING_STORE_BASIC_INFO_COUNT)
		return KERN_INVALID_ARGUMENT;

	basic = (backing_store_basic_info_t)info;
	*size = BACKING_STORE_BASIC_INFO_COUNT;

	VSTATS_LOCK(&global_stats.gs_lock);
	basic->pageout_calls = global_stats.gs_pageout_calls;
	basic->pagein_calls = global_stats.gs_pagein_calls;
	basic->pages_in = global_stats.gs_pages_in;
	basic->pages_out = global_stats.gs_pages_out;
	basic->pages_unavail = global_stats.gs_pages_unavail;
	basic->pages_init = global_stats.gs_pages_init;
	basic->pages_init_writes = global_stats.gs_pages_init_writes;
	VSTATS_UNLOCK(&global_stats.gs_lock);

	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	basic->bs_pages_total = bs->bs_pages_total;
	PSL_LOCK();
	bs->bs_pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
			PS_LOCK(ps);
			bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
			PS_UNLOCK(ps);
		}
	}
	PSL_UNLOCK();
	basic->bs_pages_free = bs->bs_pages_free;
	basic->bs_pages_in = bs->bs_pages_in;
	basic->bs_pages_in_fail = bs->bs_pages_in_fail;
	basic->bs_pages_out = bs->bs_pages_out;
	basic->bs_pages_out_fail = bs->bs_pages_out_fail;

	basic->bs_priority = bs->bs_priority;
	basic->bs_clsize = ptoa(bs->bs_clsize);	/* in bytes */

	BS_UNLOCK(bs);

	return KERN_SUCCESS;
}

int ps_delete(paging_segment_t);	/* forward */

int
ps_delete(
	paging_segment_t	ps)
{
	vstruct_t	vs;
	kern_return_t	error = KERN_SUCCESS;
	int		vs_count;

	VSL_LOCK();	/* get the lock on the list of vs's */

	/* The lock relationship and sequence are fairly complicated. */
	/* This code looks at a live list, locking and unlocking the list */
	/* as it traverses it.  It depends on the locking behavior of */
	/* default_pager_no_senders.  no_senders always locks the vstruct */
	/* targeted for removal before locking the vstruct list.  However */
	/* it will remove that member of the list without locking its */
	/* neighbors.  We can be sure when we hold a lock on a vstruct */
	/* it cannot be removed from the list but we must hold the list */
	/* lock to be sure that its pointers to its neighbors are valid. */
	/* Also, we can hold off destruction of a vstruct when the list */
	/* lock and the vs locks are not being held by bumping the */
	/* vs_async_pending count. */


	while(backing_store_release_trigger_disable != 0) {
		VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
	}

	/* we will choose instead to hold a send right */
	vs_count = vstruct_list.vsl_count;
	vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
	if(vs == (vstruct_t)&vstruct_list) {
		VSL_UNLOCK();
		return KERN_SUCCESS;
	}
	VS_LOCK(vs);
	vs_async_wait(vs);	/* wait for any pending async writes */
	if ((vs_count != 0) && (vs != NULL))
		vs->vs_async_pending += 1;	/* hold parties calling */
						/* vs_async_wait */
	VS_UNLOCK(vs);
	VSL_UNLOCK();
	while((vs_count != 0) && (vs != NULL)) {
		/* We take the count of AMO's before beginning the */
		/* transfer of the target segment. */
		/* We are guaranteed that the target segment cannot get */
		/* more users.  We also know that queue entries are */
		/* made at the back of the list.  If some of the entries */
		/* we would check disappear while we are traversing the */
		/* list then we will either check new entries which */
		/* do not have any backing store in the target segment */
		/* or re-check old entries.  This might not be optimal */
		/* but it will always be correct.  The alternative is to */
		/* take a snapshot of the list. */
		vstruct_t	next_vs;

		if(dp_pages_free < cluster_transfer_minimum)
			error = KERN_FAILURE;
		else {
			vm_object_t	transfer_object;
			int		count;
			upl_t		upl;

			transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
			count = 0;
			error = vm_object_upl_request(transfer_object,
				(vm_object_offset_t)0, VM_SUPER_CLUSTER,
				&upl, NULL, &count,
				UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
					    | UPL_SET_INTERNAL);
			if(error == KERN_SUCCESS) {
				error = ps_vstruct_transfer_from_segment(
						vs, ps, upl);
				upl_commit(upl, NULL);
				upl_deallocate(upl);
			} else {
				error = KERN_FAILURE;
			}
			vm_object_deallocate(transfer_object);
		}
		if(error) {
			VS_LOCK(vs);
			vs->vs_async_pending -= 1;  /* release vs_async_wait */
			if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
				vs->vs_waiting_async = FALSE;
				VS_UNLOCK(vs);
				thread_wakeup(&vs->vs_async_pending);
			} else {
				VS_UNLOCK(vs);
			}
			return KERN_FAILURE;
		}

		VSL_LOCK();

		while(backing_store_release_trigger_disable != 0) {
			VSL_SLEEP(&backing_store_release_trigger_disable,
				  THREAD_UNINT);
		}

		next_vs = (vstruct_t) queue_next(&(vs->vs_links));
		if((next_vs != (vstruct_t)&vstruct_list) &&
		   (vs != next_vs) && (vs_count != 1)) {
			VS_LOCK(next_vs);
			vs_async_wait(next_vs);	/* wait for any */
						/* pending async writes */
			next_vs->vs_async_pending += 1;	/* hold parties */
							/* calling vs_async_wait */
			VS_UNLOCK(next_vs);
		}
		VSL_UNLOCK();
		VS_LOCK(vs);
		vs->vs_async_pending -= 1;
		if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
			vs->vs_waiting_async = FALSE;
			VS_UNLOCK(vs);
			thread_wakeup(&vs->vs_async_pending);
		} else {
			VS_UNLOCK(vs);
		}
		if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
			vs = NULL;
		else
			vs = next_vs;
		vs_count--;
	}
	return KERN_SUCCESS;
}


kern_return_t
default_pager_backing_store_delete(
	MACH_PORT_FACE backing_store)
{
	backing_store_t		bs;
	int			i;
	paging_segment_t	ps;
	int			error;
	int			interim_pages_removed = 0;
	kern_return_t		kr;

	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

#if 0
	/* not implemented */
	BS_UNLOCK(bs);
	return KERN_FAILURE;
#endif

 restart:
	PSL_LOCK();
	error = KERN_SUCCESS;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL &&
		    ps->ps_bs == bs &&
		    ! ps->ps_going_away) {
			PS_LOCK(ps);
			/* disable access to this segment */
			ps->ps_going_away = TRUE;
			PS_UNLOCK(ps);
			/*
			 * The "ps" segment is "off-line" now,
			 * we can try and delete it...
			 */
			if(dp_pages_free < (cluster_transfer_minimum
					    + ps->ps_pgcount)) {
				error = KERN_FAILURE;
				PSL_UNLOCK();
			}
			else {
				/* remove all pages associated with the */
				/* segment from the list of free pages */
				/* when transfer is through, all target */
				/* segment pages will appear to be free */

				dp_pages_free -= ps->ps_pgcount;
				interim_pages_removed += ps->ps_pgcount;
				PSL_UNLOCK();
				error = ps_delete(ps);
			}
			if (error != KERN_SUCCESS) {
				/*
				 * We couldn't delete the segment,
				 * probably because there's not enough
				 * virtual memory left.
				 * Re-enable all the segments.
				 */
				PSL_LOCK();
				break;
			}
			goto restart;
		}
	}

	if (error != KERN_SUCCESS) {
		for (i = 0; i <= paging_segment_max; i++) {
			ps = paging_segments[i];
			if (ps != PAGING_SEGMENT_NULL &&
			    ps->ps_bs == bs &&
			    ps->ps_going_away) {
				PS_LOCK(ps);
				/* re-enable access to this segment */
				ps->ps_going_away = FALSE;
				PS_UNLOCK(ps);
			}
		}
		dp_pages_free += interim_pages_removed;
		PSL_UNLOCK();
		BS_UNLOCK(bs);
		return error;
	}

	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL &&
		    ps->ps_bs == bs) {
			if(ps->ps_going_away) {
				paging_segments[i] = PAGING_SEGMENT_NULL;
				paging_segment_count--;
				PS_LOCK(ps);
				kfree((vm_offset_t)ps->ps_bmap,
				      RMAPSIZE(ps->ps_ncls));
				kfree((vm_offset_t)ps, sizeof *ps);
			}
		}
	}

	/* Scan the entire ps array separately to make certain we find the */
	/* proper paging_segment_max */
	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
		if(paging_segments[i] != PAGING_SEGMENT_NULL)
			paging_segment_max = i;
	}

	PSL_UNLOCK();

	/*
	 * All the segments have been deleted.
	 * We can remove the backing store.
	 */

	/*
	 * Disable lookups of this backing store.
	 */
	if((void *)bs->bs_port->alias != NULL)
		kfree((vm_offset_t) bs->bs_port->alias,
		      sizeof (struct vstruct_alias));
	ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
	bs->bs_port = MACH_PORT_NULL;
	BS_UNLOCK(bs);

	/*
	 * Remove backing store from backing_store list.
	 */
	BSL_LOCK();
	queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
		     bs_links);
	BSL_UNLOCK();

	/*
	 * Free the backing store structure.
	 */
	kfree((vm_offset_t)bs, sizeof *bs);

	return KERN_SUCCESS;
}

int ps_enter(paging_segment_t);	/* forward */

int
ps_enter(
	paging_segment_t ps)
{
	int i;

	PSL_LOCK();

	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
		if (paging_segments[i] == PAGING_SEGMENT_NULL)
			break;
	}

	if (i < MAX_NUM_PAGING_SEGMENTS) {
		paging_segments[i] = ps;
		if (i > paging_segment_max)
			paging_segment_max = i;
		paging_segment_count++;
		if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
		    (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
			ps_select_array[ps->ps_bs->bs_priority] = 0;
		i = 0;
	} else {
		PSL_UNLOCK();
		return KERN_RESOURCE_SHORTAGE;
	}

	PSL_UNLOCK();
	return i;
}

#ifdef DEVICE_PAGING
kern_return_t
default_pager_add_segment(
	MACH_PORT_FACE	backing_store,
	MACH_PORT_FACE	device,
	recnum_t	offset,
	recnum_t	count,
	int		record_size)
{
	backing_store_t		bs;
	paging_segment_t	ps;
	int			i;
	int			error;

	if ((bs = backing_store_lookup(backing_store))
	    == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	PSL_LOCK();
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;

		/*
		 * Check for overlap on same device.
		 */
		if (!(ps->ps_device != device
		      || offset >= ps->ps_offset + ps->ps_recnum
		      || offset + count <= ps->ps_offset)) {
			PSL_UNLOCK();
			BS_UNLOCK(bs);
			return KERN_INVALID_ARGUMENT;
		}
	}
	PSL_UNLOCK();

	/*
	 * Set up the paging segment
	 */
	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	if (ps == PAGING_SEGMENT_NULL) {
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}

	ps->ps_segtype = PS_PARTITION;
	ps->ps_device = device;
	ps->ps_offset = offset;
	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	ps->ps_recnum = count;
	ps->ps_pgnum = count >> ps->ps_record_shift;

	ps->ps_pgcount = ps->ps_pgnum;
	ps->ps_clshift = local_log2(bs->bs_clsize);
	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
	ps->ps_hint = 0;

	PS_LOCK_INIT(ps);
	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	if (!ps->ps_bmap) {
		kfree((vm_offset_t)ps, sizeof *ps);
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}
	for (i = 0; i < ps->ps_ncls; i++) {
		clrbit(ps->ps_bmap, i);
	}

	ps->ps_going_away = FALSE;
	ps->ps_bs = bs;

	if ((error = ps_enter(ps)) != 0) {
		kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
		kfree((vm_offset_t)ps, sizeof *ps);
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
	BS_UNLOCK(bs);

	PSL_LOCK();
	dp_pages_free += ps->ps_pgcount;
	PSL_UNLOCK();

	bs_more_space(ps->ps_clcount);

	DEBUG(DEBUG_BS_INTERNAL,
	      ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
	       device, offset, count, record_size,
	       ps->ps_record_shift, ps->ps_pgnum));

	return KERN_SUCCESS;
}

boolean_t
bs_add_device(
	char		*dev_name,
	MACH_PORT_FACE	master)
{
	security_token_t	null_security_token = {
		{ 0, 0 }
	};
	MACH_PORT_FACE		device;
	int			info[DEV_GET_SIZE_COUNT];
	mach_msg_type_number_t	info_count;
	MACH_PORT_FACE		bs = MACH_PORT_NULL;
	unsigned int		rec_size;
	recnum_t		count;
	int			clsize;
	MACH_PORT_FACE		reply_port;

	if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
				null_security_token, dev_name, &device))
		return FALSE;

	info_count = DEV_GET_SIZE_COUNT;
	if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
		rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
		count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
		clsize = bs_get_global_clsize(0);
		if (!default_pager_backing_store_create(
				default_pager_object,
				DEFAULT_PAGER_BACKING_STORE_MAXPRI,
				(clsize * vm_page_size),
				&bs)) {
			if (!default_pager_add_segment(bs, device,
						       0, count, rec_size)) {
				return TRUE;
			}
			ipc_port_release_receive(bs);
		}
	}

	ipc_port_release_send(device);
	return FALSE;
}
#endif	/* DEVICE_PAGING */

#if VS_ASYNC_REUSE

struct vs_async *
vs_alloc_async(void)
{
	struct vs_async	*vsa;
	MACH_PORT_FACE	reply_port;
	kern_return_t	kr;

	VS_ASYNC_LOCK();
	if (vs_async_free_list == NULL) {
		VS_ASYNC_UNLOCK();
		vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
		if (vsa != NULL) {
			/*
			 * Try allocating a reply port named after the
			 * address of the vs_async structure.
			 */
			struct vstruct_alias	*alias_struct;

			reply_port = ipc_port_alloc_kernel();
			alias_struct = (struct vstruct_alias *)
				kalloc(sizeof (struct vstruct_alias));
			if(alias_struct != NULL) {
				alias_struct->vs = (struct vstruct *)vsa;
				alias_struct->name = ISVS;
				reply_port->alias = (int) alias_struct;
				vsa->reply_port = reply_port;
				vs_alloc_async_count++;
			}
			else {
				vs_alloc_async_failed++;
				ipc_port_dealloc_kernel((MACH_PORT_FACE)
							(reply_port));
				kfree((vm_offset_t)vsa,
				      sizeof (struct vs_async));
				vsa = NULL;
			}
		}
	} else {
		vsa = vs_async_free_list;
		vs_async_free_list = vs_async_free_list->vsa_next;
		VS_ASYNC_UNLOCK();
	}

	return vsa;
}

void
vs_free_async(
	struct vs_async *vsa)
{
	VS_ASYNC_LOCK();
	vsa->vsa_next = vs_async_free_list;
	vs_async_free_list = vsa;
	VS_ASYNC_UNLOCK();
}

#else	/* VS_ASYNC_REUSE */

struct vs_async *
vs_alloc_async(void)
{
	struct vs_async		*vsa;
	MACH_PORT_FACE		reply_port;
	kern_return_t		kr;
	struct vstruct_alias	*alias_struct;

	vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
	if (vsa != NULL) {
		/*
		 * Try allocating a reply port named after the
		 * address of the vs_async structure.
		 */
		reply_port = ipc_port_alloc_kernel();
		alias_struct = (struct vstruct_alias *)
			kalloc(sizeof (struct vstruct_alias));
		if(alias_struct != NULL) {
			alias_struct->vs = reply_port;
			alias_struct->name = ISVS;
			reply_port->alias = (int) vsa;
			vsa->reply_port = reply_port;
			vs_alloc_async_count++;
		}
		else {
			vs_alloc_async_failed++;
			ipc_port_dealloc_kernel((MACH_PORT_FACE)
						(reply_port));
			kfree((vm_offset_t) vsa,
			      sizeof (struct vs_async));
			vsa = NULL;
		}
	}

	return vsa;
}

void
vs_free_async(
	struct vs_async *vsa)
{
	MACH_PORT_FACE	reply_port;
	kern_return_t	kr;

	reply_port = vsa->reply_port;
	kfree((vm_offset_t) reply_port->alias, sizeof (struct vstruct_alias));
	kfree((vm_offset_t) vsa, sizeof (struct vs_async));
	ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
#if 0
	VS_ASYNC_LOCK();
	vs_alloc_async_count--;
	VS_ASYNC_UNLOCK();
#endif
}

#endif	/* VS_ASYNC_REUSE */

zone_t	vstruct_zone;

vstruct_t
ps_vstruct_create(
	vm_size_t size)
{
	vstruct_t	vs;
	int		i;

	vs = (vstruct_t) zalloc(vstruct_zone);
	if (vs == VSTRUCT_NULL) {
		return VSTRUCT_NULL;
	}

	VS_LOCK_INIT(vs);

	/*
	 * The following fields will be provided later.
	 */
	vs->vs_mem_obj = NULL;
	vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
	vs->vs_references = 1;
	vs->vs_seqno = 0;

#ifdef MACH_KERNEL
	vs->vs_waiting_seqno = FALSE;
	vs->vs_waiting_read = FALSE;
	vs->vs_waiting_write = FALSE;
	vs->vs_waiting_async = FALSE;
#else
	mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
	mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
	mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
	mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
	mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
#endif

	vs->vs_readers = 0;
	vs->vs_writers = 0;

	vs->vs_errors = 0;

	vs->vs_clshift = local_log2(bs_get_global_clsize(0));
	vs->vs_size = ((atop(round_page(size)) - 1) >> vs->vs_clshift) + 1;
	vs->vs_async_pending = 0;

	/*
	 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
	 * depending on the size of the memory object.
	 */
	if (INDIRECT_CLMAP(vs->vs_size)) {
		vs->vs_imap = (struct vs_map **)
			kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
		vs->vs_indirect = TRUE;
	} else {
		vs->vs_dmap = (struct vs_map *)
			kalloc(CLMAP_SIZE(vs->vs_size));
		vs->vs_indirect = FALSE;
	}
	vs->vs_xfer_pending = FALSE;
	DEBUG(DEBUG_VS_INTERNAL,
	      ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));

	/*
	 * Check to see that we got the space.
	 */
	if (!vs->vs_dmap) {
		kfree((vm_offset_t)vs, sizeof *vs);
		return VSTRUCT_NULL;
	}

	/*
	 * Zero the indirect pointers, or clear the direct pointers.
	 */
	if (vs->vs_indirect)
		memset(vs->vs_imap, 0,
		       INDIRECT_CLMAP_SIZE(vs->vs_size));
	else
		for (i = 0; i < vs->vs_size; i++)
			VSM_CLR(vs->vs_dmap[i]);

	VS_MAP_LOCK_INIT(vs);

	bs_commit(vs->vs_size);

	return vs;
}

paging_segment_t ps_select_segment(int, int *);	/* forward */

paging_segment_t
ps_select_segment(
	int	shift,
	int	*psindex)
{
	paging_segment_t	ps;
	int			i;
	int			j;

	/*
	 * Optimize case where there's only one segment.
	 * paging_segment_max will index the one and only segment.
	 */

	PSL_LOCK();
	if (paging_segment_count == 1) {
		paging_segment_t lps;	/* used to avoid extra PS_UNLOCK */
		ipc_port_t trigger = IP_NULL;

		ps = paging_segments[paging_segment_max];
		*psindex = paging_segment_max;
		PS_LOCK(ps);
		if (ps->ps_going_away) {
			/* this segment is being turned off */
			lps = PAGING_SEGMENT_NULL;
		} else {
			ASSERT(ps->ps_clshift >= shift);
			if (ps->ps_clcount) {
				ps->ps_clcount--;
				dp_pages_free -= 1 << ps->ps_clshift;
				if(min_pages_trigger_port &&
				   (dp_pages_free < minimum_pages_remaining)) {
					trigger = min_pages_trigger_port;
					min_pages_trigger_port = NULL;
					bs_low = TRUE;
				}
				lps = ps;
			} else
				lps = PAGING_SEGMENT_NULL;
		}
		PS_UNLOCK(ps);
		PSL_UNLOCK();

		if (trigger != IP_NULL) {
			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}
		return lps;
	}

	if (paging_segment_count == 0) {
		PSL_UNLOCK();
		return PAGING_SEGMENT_NULL;
	}

	for (i = BS_MAXPRI;
	     i >= BS_MINPRI; i--) {
		int start_index;

		if ((ps_select_array[i] == BS_NOPRI) ||
		    (ps_select_array[i] == BS_FULLPRI))
			continue;
		start_index = ps_select_array[i];

		if(!(paging_segments[start_index])) {
			j = start_index+1;
			physical_transfer_cluster_count = 0;
		}
		else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
			 (((paging_segments[start_index])->ps_clshift)
			  + vm_page_shift))) {
			physical_transfer_cluster_count = 0;
			j = start_index + 1;
		} else {
			physical_transfer_cluster_count+=1;
			j = start_index;
			if(start_index == 0)
				start_index = paging_segment_max;
			else
				start_index = start_index - 1;
		}

		while (1) {
			if (j > paging_segment_max)
				j = 0;
			if ((ps = paging_segments[j]) &&
			    (ps->ps_bs->bs_priority == i)) {
				/*
				 * Force the ps cluster size to be
				 * >= that of the vstruct.
				 */
				PS_LOCK(ps);
				if (ps->ps_going_away) {
					/* this segment is being turned off */
				} else if ((ps->ps_clcount) &&
					   (ps->ps_clshift >= shift)) {
					ipc_port_t trigger = IP_NULL;

					ps->ps_clcount--;
					dp_pages_free -= 1 << ps->ps_clshift;
					if(min_pages_trigger_port &&
					   (dp_pages_free <
					    minimum_pages_remaining)) {
						trigger = min_pages_trigger_port;
						min_pages_trigger_port = NULL;
					}
					PS_UNLOCK(ps);
					/*
					 * found one, quit looking.
					 */
					ps_select_array[i] = j;
					PSL_UNLOCK();

					if (trigger != IP_NULL) {
						default_pager_space_alert(
							trigger,
							HI_WAT_ALERT);
						ipc_port_release_send(trigger);
					}
					*psindex = j;
					return ps;
				}
				PS_UNLOCK(ps);
			}
			if (j == start_index) {
				/*
				 * none at this priority -- mark it full
				 */
				ps_select_array[i] = BS_FULLPRI;
				break;
			}
			j++;
		}
	}
	PSL_UNLOCK();
	return PAGING_SEGMENT_NULL;
}

vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t);	/* forward */

vm_offset_t
ps_allocate_cluster(
	vstruct_t		vs,
	int			*psindex,
	paging_segment_t	use_ps)
{
	int			byte_num;
	int			bit_num = 0;
	paging_segment_t	ps;
	vm_offset_t		cluster;
	ipc_port_t		trigger = IP_NULL;

	/*
	 * Find best paging segment.
	 * ps_select_segment will decrement cluster count on ps.
	 * Must pass cluster shift to find the most appropriate segment.
	 */
	/* NOTE:  The addition of paging segment delete capability threatened
	 * to seriously complicate the treatment of paging segments in this
	 * module and the ones that call it (notably ps_clmap), because of the
	 * difficulty in assuring that the paging segment would continue to
	 * exist between being unlocked and locked.  This was
	 * avoided because all calls to this module are based in either
	 * dp_memory_object calls which rely on the vs lock, or by
	 * the transfer function which is part of the segment delete path.
	 * The transfer function which is part of paging segment delete is
	 * protected from multiple callers by the backing store lock.
	 * The paging segment delete function treats mappings to a paging
	 * segment on a vstruct by vstruct basis, locking the vstruct targeted
	 * while data is transferred to the remaining segments.  This is in
	 * line with the view that incomplete or in-transition mappings between
	 * data, a vstruct, and backing store are protected by the vs lock.
	 * This and the ordering of the paging segment "going_away" bit setting
	 * protects us.
	 */
	if (use_ps != PAGING_SEGMENT_NULL) {
		ps = use_ps;
		PSL_LOCK();
		PS_LOCK(ps);
		ps->ps_clcount--;
		dp_pages_free -= 1 << ps->ps_clshift;
		if(min_pages_trigger_port &&
		   (dp_pages_free < minimum_pages_remaining)) {
			trigger = min_pages_trigger_port;
			min_pages_trigger_port = NULL;
		}
		PSL_UNLOCK();
		PS_UNLOCK(ps);
		if (trigger != IP_NULL) {
			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}

	} else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
		   PAGING_SEGMENT_NULL) {
#if 0
		bs_no_paging_space(TRUE);
#endif
#if 0
		if (verbose)
#endif
			dprintf(("no space in available paging segments; "
				 "swapon suggested\n"));
		/* the count may have gotten off; reset it to zero */
		PSL_LOCK();
		dp_pages_free = 0;
		if(min_pages_trigger_port) {
			trigger = min_pages_trigger_port;
			min_pages_trigger_port = NULL;
			bs_low = TRUE;
		}
		PSL_UNLOCK();
		if (trigger != IP_NULL) {
			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}
		return (vm_offset_t) -1;
	}
	ASSERT(ps->ps_clcount != 0);

	/*
	 * Look for an available cluster.  At the end of the loop,
	 * byte_num is the byte offset and bit_num is the bit offset of the
	 * first zero bit in the paging segment bitmap.
	 */
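	/*
	 * Illustrative note: ps_bmap carries one bit per cluster, so a byte
	 * equal to BYTEMASK (all bits set) means every one of the eight
	 * clusters it covers is in use and the scan can skip it; ps_hint
	 * remembers the lowest byte worth probing on the next allocation.
	 */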
	PS_LOCK(ps);
	byte_num = ps->ps_hint;
	for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
		if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
			for (bit_num = 0; bit_num < NBBY; bit_num++) {
				if (isclr((ps->ps_bmap + byte_num), bit_num))
					break;
			}
			ASSERT(bit_num != NBBY);
			break;
		}
	}
	ps->ps_hint = byte_num;
	cluster = (byte_num*NBBY) + bit_num;

	/* Space was reserved, so this must be true */
	ASSERT(cluster < ps->ps_ncls);

	setbit(ps->ps_bmap, cluster);
	PS_UNLOCK(ps);

	return cluster;
}

void ps_deallocate_cluster(paging_segment_t, vm_offset_t);	/* forward */

void
ps_deallocate_cluster(
	paging_segment_t	ps,
	vm_offset_t		cluster)
{
	ipc_port_t trigger = IP_NULL;

	if (cluster >= (vm_offset_t) ps->ps_ncls)
		panic("ps_deallocate_cluster: Invalid cluster number");

	/*
	 * Lock the paging segment, clear the cluster's bit in the bitmap
	 * and increment the number of free clusters.
	 */
	PSL_LOCK();
	PS_LOCK(ps);
	clrbit(ps->ps_bmap, cluster);
	++ps->ps_clcount;
	dp_pages_free += 1 << ps->ps_clshift;
	if(max_pages_trigger_port
	   && (backing_store_release_trigger_disable == 0)
	   && (dp_pages_free > maximum_pages_free)) {
		trigger = max_pages_trigger_port;
		max_pages_trigger_port = NULL;
	}
	PSL_UNLOCK();

	/*
	 * Move the hint down to the freed cluster if it is
	 * less than the current hint.
	 */
	if ((cluster/NBBY) < ps->ps_hint) {
		ps->ps_hint = (cluster/NBBY);
	}

	PS_UNLOCK(ps);

	/*
	 * If we're freeing space on a full priority, reset the array.
	 */
	PSL_LOCK();
	if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
		ps_select_array[ps->ps_bs->bs_priority] = 0;
	PSL_UNLOCK();

	if (trigger != IP_NULL) {
		VSL_LOCK();
		if(backing_store_release_trigger_disable != 0) {
			assert_wait((event_t)
				    &backing_store_release_trigger_disable,
				    THREAD_UNINT);
			VSL_UNLOCK();
			thread_block(THREAD_CONTINUE_NULL);
		} else {
			VSL_UNLOCK();
		}
		default_pager_space_alert(trigger, LO_WAT_ALERT);
		ipc_port_release_send(trigger);
	}

	return;
}

void ps_dealloc_vsmap(struct vs_map *, vm_size_t);	/* forward */

void
ps_dealloc_vsmap(
	struct vs_map	*vsmap,
	vm_size_t	size)
{
	int i;
	for (i = 0; i < size; i++)
		if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
			ps_deallocate_cluster(VSM_PS(vsmap[i]),
					      VSM_CLOFF(vsmap[i]));
}

void
ps_vstruct_dealloc(
	vstruct_t vs)
{
	int	i;
	spl_t	s;

	VS_MAP_LOCK(vs);

	/*
	 * If this is an indirect structure, then we walk through the valid
	 * (non-zero) indirect pointers and deallocate the clusters
	 * associated with each used map entry (via ps_dealloc_vsmap).
	 * When all of the clusters in an indirect block have been
	 * freed, we deallocate the block.  When all of the indirect
	 * blocks have been deallocated we deallocate the memory
	 * holding the indirect pointers.
	 */
	if (vs->vs_indirect) {
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			if (vs->vs_imap[i] != NULL) {
				ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
				kfree((vm_offset_t)vs->vs_imap[i],
				      CLMAP_THRESHOLD);
			}
		}
		kfree((vm_offset_t)vs->vs_imap,
		      INDIRECT_CLMAP_SIZE(vs->vs_size));
	} else {
		/*
		 * Direct map.  Free used clusters, then memory.
		 */
		ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
		kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
	}
	VS_MAP_UNLOCK(vs);

	bs_commit(- vs->vs_size);

	zfree(vstruct_zone, (vm_offset_t)vs);
}

int ps_map_extend(vstruct_t, int);	/* forward */

int ps_map_extend(
	vstruct_t	vs,
	int		new_size)
{
	struct vs_map	**new_imap;
	struct vs_map	*new_dmap = NULL;
	int		newdsize;
	int		i;
	void		*old_map = NULL;
	int		old_map_size = 0;

	if (vs->vs_size >= new_size) {
		/*
		 * Someone has already done the work.
		 */
		return 0;
	}

	/*
	 * If the new size extends into the indirect range, then we have one
	 * of two cases: we are going from indirect to indirect, or we are
	 * going from direct to indirect.  If we are going from indirect to
	 * indirect, then it is possible that the new size will fit in the old
	 * indirect map.  If this is the case, then just reset the size of the
	 * vstruct map and we are done.  If the new size will not
	 * fit into the old indirect map, then we have to allocate a new
	 * indirect map and copy the old map pointers into this new map.
	 *
	 * If we are going from direct to indirect, then we have to allocate a
	 * new indirect map and copy the old direct pages into the first
	 * indirect page of the new map.
	 * NOTE: allocating memory here is dangerous, as we're in the
	 * pageout path.
	 */
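	/*
	 * In short: if the new size still fits the old indirect map we just
	 * grow vs_size; otherwise a fresh indirect map is allocated and the
	 * old pointers (or, for a direct map, the old direct entries into its
	 * first indirect block) are copied over; a direct-to-direct grow in
	 * the else branch simply allocates a larger direct map and copies.
	 */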
	if (INDIRECT_CLMAP(new_size)) {
		int new_map_size = INDIRECT_CLMAP_SIZE(new_size);

		/*
		 * Get a new indirect map and zero it.
		 */
		old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
		if (vs->vs_indirect &&
		    (new_map_size == old_map_size)) {
			bs_commit(new_size - vs->vs_size);
			vs->vs_size = new_size;
			return 0;
		}

		new_imap = (struct vs_map **)kalloc(new_map_size);
		if (new_imap == NULL) {
			return -1;
		}
		memset(new_imap, 0, new_map_size);

		if (vs->vs_indirect) {
			/* Copy old entries into new map */
			memcpy(new_imap, vs->vs_imap, old_map_size);
			/* Arrange to free the old map */
			old_map = (void *) vs->vs_imap;
			newdsize = 0;
		} else {	/* Old map was a direct map */
			/* Allocate an indirect page */
			if ((new_imap[0] = (struct vs_map *)
			     kalloc(CLMAP_THRESHOLD)) == NULL) {
				kfree((vm_offset_t)new_imap, new_map_size);
				return -1;
			}
			new_dmap = new_imap[0];
			newdsize = CLMAP_ENTRIES;
		}
	} else {
		new_imap = NULL;
		newdsize = new_size;
		/*
		 * If the new map is a direct map, then the old map must
		 * also have been a direct map.  All we have to do is
		 * to allocate a new direct map, copy the old entries
		 * into it and free the old map.
		 */
		if ((new_dmap = (struct vs_map *)
		     kalloc(CLMAP_SIZE(new_size))) == NULL) {
			return -1;
		}
	}
	if (newdsize) {

		/* Free the old map */
		old_map = (void *) vs->vs_dmap;
		old_map_size = CLMAP_SIZE(vs->vs_size);

		/* Copy info from the old map into the new map */
		memcpy(new_dmap, vs->vs_dmap, old_map_size);

		/* Initialize the rest of the new map */
		for (i = vs->vs_size; i < newdsize; i++)
			VSM_CLR(new_dmap[i]);
	}
	if (new_imap) {
		vs->vs_imap = new_imap;
		vs->vs_indirect = TRUE;
	} else
		vs->vs_dmap = new_dmap;
	bs_commit(new_size - vs->vs_size);
	vs->vs_size = new_size;
	if (old_map)
		kfree((vm_offset_t)old_map, old_map_size);
	return 0;
}

vm_offset_t
ps_clmap(
	vstruct_t	vs,
	vm_offset_t	offset,
	struct clmap	*clmap,
	int		flag,
	vm_size_t	size,
	int		error)
{
	vm_offset_t	cluster;	/* The cluster of offset.	*/
	vm_offset_t	newcl;		/* The new cluster allocated.	*/
	vm_offset_t	newoff;
	int		i;
	struct vs_map	*vsmap;

	VS_MAP_LOCK(vs);

	ASSERT(vs->vs_dmap);
	cluster = atop(offset) >> vs->vs_clshift;

	/*
	 * Initialize cluster error value
	 */
	clmap->cl_error = 0;

	/*
	 * If the object has grown, extend the page map.
	 */
	if (cluster >= vs->vs_size) {
		if (flag == CL_FIND) {
			/* Do not allocate if just doing a lookup */
			VS_MAP_UNLOCK(vs);
			return (vm_offset_t) -1;
		}
		if (ps_map_extend(vs, cluster + 1)) {
			VS_MAP_UNLOCK(vs);
			return (vm_offset_t) -1;
		}
	}

	/*
	 * Look for the desired cluster.  If the map is indirect, then we
	 * have a two level lookup.  First find the indirect block, then
	 * find the actual cluster.  If the indirect block has not yet
	 * been allocated, then do so.  If the cluster has not yet been
	 * allocated, then do so.
	 *
	 * If any of the allocations fail, then return an error.
	 * Don't allocate if just doing a lookup.
	 */
	if (vs->vs_indirect) {
		long	ind_block = cluster/CLMAP_ENTRIES;

		/* Is the indirect block allocated? */
		vsmap = vs->vs_imap[ind_block];
		if (vsmap == NULL) {
			if (flag == CL_FIND) {
				VS_MAP_UNLOCK(vs);
				return (vm_offset_t) -1;
			}

			/* Allocate the indirect block */
			vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
			if (vsmap == NULL) {
				VS_MAP_UNLOCK(vs);
				return (vm_offset_t) -1;
			}
			/* Initialize the cluster offsets */
			for (i = 0; i < CLMAP_ENTRIES; i++)
				VSM_CLR(vsmap[i]);
			vs->vs_imap[ind_block] = vsmap;
		}
	} else
		vsmap = vs->vs_dmap;

	ASSERT(vsmap);
	vsmap += cluster%CLMAP_ENTRIES;

	/*
	 * At this point, vsmap points to the struct vs_map desired.
	 *
	 * Look in the map for the cluster, if there was an error on a
	 * previous write, flag it and return.  If it is not yet
	 * allocated, then allocate it, if we're writing; if we're
	 * doing a lookup and the cluster's not allocated, return error.
	 */
	if (VSM_ISERR(*vsmap)) {
		clmap->cl_error = VSM_GETERR(*vsmap);
		VS_MAP_UNLOCK(vs);
		return (vm_offset_t) -1;
	} else if (VSM_ISCLR(*vsmap)) {
		int	psindex;

		if (flag == CL_FIND) {
			/*
			 * If there's an error and the entry is clear, then
			 * we've run out of swap space.  Record the error
			 * here and return.
			 */
			if (error) {
				VSM_SETERR(*vsmap, error);
			}
			VS_MAP_UNLOCK(vs);
			return (vm_offset_t) -1;
		} else {
			/*
			 * Attempt to allocate a cluster from the paging segment
			 */
			newcl = ps_allocate_cluster(vs, &psindex,
						    PAGING_SEGMENT_NULL);
			if (newcl == -1) {
				VS_MAP_UNLOCK(vs);
				return (vm_offset_t) -1;
			}
			VSM_CLR(*vsmap);
			VSM_SETCLOFF(*vsmap, newcl);
			VSM_SETPS(*vsmap, psindex);
		}
	} else
		newcl = VSM_CLOFF(*vsmap);

	/*
	 * Fill in pertinent fields of the clmap
	 */
	clmap->cl_ps = VSM_PS(*vsmap);
	clmap->cl_numpages = VSCLSIZE(vs);
	clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);

	/*
	 * Byte offset in paging segment is byte offset to cluster plus
	 * byte offset within cluster.  It looks ugly, but should be
	 * relatively quick.
	 */
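	/*
	 * Worked example (assuming 4K pages and clshift 2, i.e. 16K clusters):
	 * for cluster number 5, ptoa(5) << 2 = 5 * 4K * 4 = 80K is the byte
	 * offset of the cluster within the segment, and newoff keeps the low
	 * (vm_page_shift + clshift) = 14 bits of the object offset, i.e. the
	 * byte offset within that cluster.
	 */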
	ASSERT(trunc_page(offset) == offset);
	newcl = ptoa(newcl) << vs->vs_clshift;
	newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
	if (flag == CL_ALLOC) {
		/*
		 * set bits in the allocation bitmap according to which
		 * pages were requested. size is in bytes.
		 */
		i = atop(newoff);
		while ((size > 0) && (i < VSCLSIZE(vs))) {
			VSM_SETALLOC(*vsmap, i);
			i++;
			size -= vm_page_size;
		}
	}
	clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
	if (newoff) {
		/*
		 * Offset is not cluster aligned, so number of pages
		 * and bitmaps must be adjusted
		 */
		clmap->cl_numpages -= atop(newoff);
		CLMAP_SHIFT(clmap, vs);
		CLMAP_SHIFTALLOC(clmap, vs);
	}

	/*
	 *
	 * The setting of valid bits and handling of write errors
	 * must be done here, while we hold the lock on the map.
	 * It logically should be done in ps_vs_write_complete().
	 * The size and error information has been passed from
	 * ps_vs_write_complete().  If the size parameter is non-zero,
	 * then there is work to be done.  If error is also non-zero,
	 * then the error number is recorded in the cluster and the
	 * entire cluster is in error.
	 */
	if (size && flag == CL_FIND) {
		vm_offset_t off = (vm_offset_t) 0;

		if (!error) {
			for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
			     i++) {
				VSM_SETPG(*vsmap, i);
				size -= vm_page_size;
			}
			ASSERT(i <= VSCLSIZE(vs));
		} else {
			BS_STAT(clmap->cl_ps->ps_bs,
				clmap->cl_ps->ps_bs->bs_pages_out_fail +=
					atop(size));
			off = VSM_CLOFF(*vsmap);
			VSM_SETERR(*vsmap, error);
		}
		/*
		 * Deallocate cluster if error, and no valid pages
		 * already present.
		 */
		if (off != (vm_offset_t) 0)
			ps_deallocate_cluster(clmap->cl_ps, off);
		VS_MAP_UNLOCK(vs);
		return (vm_offset_t) 0;
	} else
		VS_MAP_UNLOCK(vs);

	DEBUG(DEBUG_VS_INTERNAL,
	      ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
	       newcl+newoff, (int) vs, (int) vsmap, flag));
	DEBUG(DEBUG_VS_INTERNAL,
	      ("  clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
	       (int) clmap->cl_ps, clmap->cl_numpages,
	       (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));

	return (newcl + newoff);
}

1965void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1966
1967void
1968ps_clunmap(
1969 vstruct_t vs,
1970 vm_offset_t offset,
1971 vm_size_t length)
1972{
1973 vm_offset_t cluster; /* The cluster number of offset */
1974 struct vs_map *vsmap;
1c79356b
A
1975
1976 VS_MAP_LOCK(vs);
1977
1978 /*
1979 * Loop through all clusters in this range, freeing paging segment
1980 * clusters and map entries as encountered.
1981 */
1982 while (length > 0) {
1983 vm_offset_t newoff;
1984 int i;
1985
de355530 1986 cluster = atop(offset) >> vs->vs_clshift;
1c79356b
A
1987 if (vs->vs_indirect) /* indirect map */
1988 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
1989 else
1990 vsmap = vs->vs_dmap;
1991 if (vsmap == NULL) {
1992 VS_MAP_UNLOCK(vs);
1993 return;
1994 }
1995 vsmap += cluster%CLMAP_ENTRIES;
1996 if (VSM_ISCLR(*vsmap)) {
1997 length -= vm_page_size;
1998 offset += vm_page_size;
1999 continue;
2000 }
2001 /*
2002 * We've got a valid mapping. Clear it and deallocate
2003 * paging segment cluster pages.
2004 * Optimize for entire cluster clearing.
2005 */
2006 if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
2007 /*
2008 * Not cluster aligned.
2009 */
2010 ASSERT(trunc_page(newoff) == newoff);
de355530 2011 i = atop(newoff);
1c79356b
A
2012 } else
2013 i = 0;
2014 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2015 VSM_CLRPG(*vsmap, i);
2016 VSM_CLRALLOC(*vsmap, i);
2017 length -= vm_page_size;
2018 offset += vm_page_size;
2019 i++;
2020 }
2021
2022 /*
2023 * If map entry is empty, clear and deallocate cluster.
2024 */
2025 if (!VSM_ALLOC(*vsmap)) {
2026 ps_deallocate_cluster(VSM_PS(*vsmap),
2027 VSM_CLOFF(*vsmap));
2028 VSM_CLR(*vsmap);
2029 }
2030 }
2031
2032 VS_MAP_UNLOCK(vs);
2033}
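
ps_clunmap() clears per-page bits and hands the whole cluster back to its paging segment once nothing in the allocation bitmap remains set. A minimal userspace model of that pattern follows; the 8-page cluster width and the free_cluster() stand-in for ps_deallocate_cluster() are assumptions for illustration.

#include <stdio.h>
#include <stdint.h>

#define PAGES_PER_CLUSTER 8                        /* assumed cluster width */

static void free_cluster(int cluster)              /* stand-in for ps_deallocate_cluster() */
{
	printf("cluster %d returned to the paging segment\n", cluster);
}

int main(void)
{
	uint8_t alloc_map = 0xF0;                  /* pages 4..7 currently allocated */
	int cluster = 3;

	for (int i = 4; i < PAGES_PER_CLUSTER; i++) {
		alloc_map &= (uint8_t)~(1u << i);  /* VSM_CLRALLOC() analogue */
		printf("cleared page %d, map now 0x%02x\n", i, alloc_map);
	}
	if (alloc_map == 0)                        /* !VSM_ALLOC(*vsmap) analogue */
		free_cluster(cluster);
	return 0;
}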
2034
2035void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2036
2037void
2038ps_vs_write_complete(
2039 vstruct_t vs,
2040 vm_offset_t offset,
2041 vm_size_t size,
2042 int error)
2043{
2044 struct clmap clmap;
2045
2046 /*
2047 * Get the struct vsmap for this cluster.
2048 * Use READ, even though it was written, because the
2049 * cluster MUST be present, unless there was an error
2050 * in the original ps_clmap (e.g. no space), in which
2051 * case, nothing happens.
2052 *
2053 * Must pass enough information to ps_clmap to allow it
2054 * to set the vs_map structure bitmap under lock.
2055 */
2056 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2057}
2058
2059void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2060
2061void
2062vs_cl_write_complete(
2063 vstruct_t vs,
2064 paging_segment_t ps,
2065 vm_offset_t offset,
2066 vm_offset_t addr,
2067 vm_size_t size,
2068 boolean_t async,
2069 int error)
2070{
1c79356b
A
2071 kern_return_t kr;
2072
2073 if (error) {
2074 /*
2075 * For internal objects, the error is recorded on a
2076 * per-cluster basis by ps_clmap() which is called
2077 * by ps_vs_write_complete() below.
2078 */
2079 dprintf(("write failed error = 0x%x\n", error));
2080 /* add upl_abort code here */
2081 } else
de355530 2082 GSTAT(global_stats.gs_pages_out += atop(size));
1c79356b
A
2083 /*
2084 * Notify the vstruct mapping code, so it can do its accounting.
2085 */
2086 ps_vs_write_complete(vs, offset, size, error);
2087
2088 if (async) {
2089 VS_LOCK(vs);
2090 ASSERT(vs->vs_async_pending > 0);
2091 vs->vs_async_pending -= size;
0b4e3aa0
A
2092 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2093 vs->vs_waiting_async = FALSE;
1c79356b
A
2094 VS_UNLOCK(vs);
2095 /* mutex_unlock(&vs->vs_waiting_async); */
0b4e3aa0 2096 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
2097 } else {
2098 VS_UNLOCK(vs);
2099 }
2100 }
2101}
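
The async path above decrements a pending count on every completion and wakes a waiter only when the count drains to zero. The following is a loose userspace analogue of that handshake using a pthread mutex and condition variable; the mapping to pthreads, the names, and the fixed count of three writes are all assumptions, not the kernel's VS_LOCK/thread_wakeup primitives.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static long pending = 3;                           /* three writes outstanding (assumed) */

static void write_completed(void)                  /* analogue of vs_cl_write_complete() */
{
	pthread_mutex_lock(&lock);
	if (--pending == 0)
		pthread_cond_signal(&done);        /* thread_wakeup() analogue */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		write_completed();

	pthread_mutex_lock(&lock);                 /* waiter side: sleep until the count drains */
	while (pending != 0)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
	printf("all asynchronous writes drained\n");
	return 0;
}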
2102
2103#ifdef DEVICE_PAGING
2104kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2105
2106kern_return_t
2107device_write_reply(
2108 MACH_PORT_FACE reply_port,
2109 kern_return_t device_code,
2110 io_buf_len_t bytes_written)
2111{
2112 struct vs_async *vsa;
1c79356b
A
2113
2114 vsa = (struct vs_async *)
2115 ((struct vstruct_alias *)(reply_port->alias))->vs;
2116
2117 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2118 device_code = KERN_FAILURE;
2119 }
2120
2121 vsa->vsa_error = device_code;
2122
2123
2124 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2125 if(vsa->vsa_flags & VSA_TRANSFER) {
2126 /* revisit when async disk segments redone */
2127 if(vsa->vsa_error) {
2128 /* need to consider error condition. re-write data or */
2129 /* throw it away here. */
2130 vm_offset_t ioaddr;
2131 if(vm_map_copyout(kernel_map, &ioaddr,
2132 (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
2133 panic("vs_cluster_write: unable to copy source list\n");
2134 vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
2135 }
2136 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2137 vsa->vsa_size, vsa->vsa_error);
2138 } else {
2139 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2140 vsa->vsa_addr, vsa->vsa_size, TRUE,
2141 vsa->vsa_error);
2142 }
2143 VS_FREE_ASYNC(vsa);
2144
2145 return KERN_SUCCESS;
2146}
2147
2148kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2149kern_return_t
2150device_write_reply_inband(
2151 MACH_PORT_FACE reply_port,
2152 kern_return_t return_code,
2153 io_buf_len_t bytes_written)
2154{
2155 panic("device_write_reply_inband: illegal");
2156 return KERN_SUCCESS;
2157}
2158
2159kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2160kern_return_t
2161device_read_reply(
2162 MACH_PORT_FACE reply_port,
2163 kern_return_t return_code,
2164 io_buf_ptr_t data,
2165 mach_msg_type_number_t dataCnt)
2166{
2167 struct vs_async *vsa;
2168 vsa = (struct vs_async *)
2169 ((struct vstruct_alias *)(reply_port->alias))->vs;
2170 vsa->vsa_addr = (vm_offset_t)data;
2171 vsa->vsa_size = (vm_size_t)dataCnt;
2172 vsa->vsa_error = return_code;
2173 thread_wakeup(&vsa->vsa_lock);
2174 return KERN_SUCCESS;
2175}
2176
2177kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2178kern_return_t
2179device_read_reply_inband(
2180 MACH_PORT_FACE reply_port,
2181 kern_return_t return_code,
2182 io_buf_ptr_inband_t data,
2183 mach_msg_type_number_t dataCnt)
2184{
2185 panic("device_read_reply_inband: illegal");
2186 return KERN_SUCCESS;
2187}
2188
2189kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2190kern_return_t
2191device_read_reply_overwrite(
2192 MACH_PORT_FACE reply_port,
2193 kern_return_t return_code,
2194 io_buf_len_t bytes_read)
2195{
2196 panic("device_read_reply_overwrite: illegal\n");
2197 return KERN_SUCCESS;
2198}
2199
2200kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2201kern_return_t
2202device_open_reply(
2203 MACH_PORT_FACE reply_port,
2204 kern_return_t return_code,
2205 MACH_PORT_FACE device_port)
2206{
2207 panic("device_open_reply: illegal\n");
2208 return KERN_SUCCESS;
2209}
2210
2211kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2212
2213kern_return_t
2214ps_read_device(
2215 paging_segment_t ps,
2216 vm_offset_t offset,
2217 vm_offset_t *bufferp,
2218 unsigned int size,
2219 unsigned int *residualp,
2220 int flags)
2221{
2222 kern_return_t kr;
2223 recnum_t dev_offset;
2224 unsigned int bytes_wanted;
2225 unsigned int bytes_read;
2226 unsigned int total_read;
2227 vm_offset_t dev_buffer;
2228 vm_offset_t buf_ptr;
2229 unsigned int records_read;
1c79356b
A
2230 struct vs_async *vsa;
2231 mutex_t vs_waiting_read_reply;
2232
2233 device_t device;
2234 vm_map_copy_t device_data = NULL;
2235 default_pager_thread_t *dpt = NULL;
2236
2237 device = dev_port_lookup(ps->ps_device);
de355530 2238 clustered_reads[atop(size)]++;
1c79356b
A
2239
2240 dev_offset = (ps->ps_offset +
2241 (offset >> (vm_page_shift - ps->ps_record_shift)));
2242 bytes_wanted = size;
2243 total_read = 0;
2244 *bufferp = (vm_offset_t)NULL;
2245
2246 do {
2247 vsa = VS_ALLOC_ASYNC();
2248 if (vsa) {
2249 vsa->vsa_vs = NULL;
2250 vsa->vsa_addr = 0;
2251 vsa->vsa_offset = 0;
2252 vsa->vsa_size = 0;
2253 vsa->vsa_ps = NULL;
2254 }
2255 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2256 ip_lock(vsa->reply_port);
2257 vsa->reply_port->ip_sorights++;
2258 ip_reference(vsa->reply_port);
2259 ip_unlock(vsa->reply_port);
2260 kr = ds_device_read_common(device,
2261 vsa->reply_port,
2262 (mach_msg_type_name_t)
2263 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2264 (dev_mode_t) 0,
2265 dev_offset,
2266 bytes_wanted,
2267 (IO_READ | IO_CALL),
2268 (io_buf_ptr_t *) &dev_buffer,
2269 (mach_msg_type_number_t *) &bytes_read);
2270 if(kr == MIG_NO_REPLY) {
2271 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
9bccf70c 2272 thread_block(THREAD_CONTINUE_NULL);
1c79356b
A
2273
2274 dev_buffer = vsa->vsa_addr;
2275 bytes_read = (unsigned int)vsa->vsa_size;
2276 kr = vsa->vsa_error;
2277 }
2278 VS_FREE_ASYNC(vsa);
2279 if (kr != KERN_SUCCESS || bytes_read == 0) {
2280 break;
2281 }
2282 total_read += bytes_read;
2283
2284 /*
2285 * If we got the entire range, use the returned dev_buffer.
2286 */
2287 if (bytes_read == size) {
2288 *bufferp = (vm_offset_t)dev_buffer;
2289 break;
2290 }
2291
2292#if 1
2293 dprintf(("read only %d bytes out of %d\n",
2294 bytes_read, bytes_wanted));
2295#endif
2296 if(dpt == NULL) {
2297 dpt = get_read_buffer();
2298 buf_ptr = dpt->dpt_buffer;
2299 *bufferp = (vm_offset_t)buf_ptr;
2300 }
2301 /*
2302 * Otherwise, copy the data into the provided buffer (*bufferp)
2303 * and append the rest of the range as it comes in.
2304 */
2305 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2306 buf_ptr += bytes_read;
2307 bytes_wanted -= bytes_read;
2308 records_read = (bytes_read >>
2309 (vm_page_shift - ps->ps_record_shift));
2310 dev_offset += records_read;
2311 DEBUG(DEBUG_VS_INTERNAL,
2312 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2313 dev_buffer, bytes_read));
2314 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2315 != KERN_SUCCESS)
2316 Panic("dealloc buf");
2317 } while (bytes_wanted);
2318
2319 *residualp = size - total_read;
2320 if((dev_buffer != *bufferp) && (total_read != 0)) {
2321 vm_offset_t temp_buffer;
2322 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2323 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2324 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2325 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2326 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2327 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2328 (vm_map_copy_t *)&device_data, FALSE))
2329 panic("ps_read_device: cannot copyin locally provided buffer\n");
2330 }
2331 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2332 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2333 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2334 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2335 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2336 (vm_map_copy_t *)&device_data, FALSE))
2337 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2338 }
2339 else {
2340 device_data = NULL;
2341 }
2342 *bufferp = (vm_offset_t)device_data;
2343
2344 if(dpt != NULL) {
2345 /* Free the receive buffer */
2346 dpt->checked_out = 0;
2347 thread_wakeup(&dpt_array);
2348 }
2349 return KERN_SUCCESS;
2350}
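
ps_read_device() keeps issuing reads and appending short results into a transfer buffer until the request is satisfied or the device returns nothing. The sketch below models only that accumulation loop; read_some() is a made-up stand-in for ds_device_read_common(), and the stingy 5-byte transfers are an assumption to force the partial-read path.

#include <stdio.h>
#include <string.h>

static size_t read_some(char *dst, size_t want)    /* may return short counts, like the device */
{
	static const char source[] = "0123456789abcdef";
	static size_t pos;
	size_t give = want > 5 ? 5 : want;         /* pretend the device is stingy */

	if (pos >= sizeof(source) - 1)
		return 0;
	if (give > sizeof(source) - 1 - pos)
		give = sizeof(source) - 1 - pos;
	memcpy(dst, source + pos, give);
	pos += give;
	return give;
}

int main(void)
{
	char buffer[16];
	size_t wanted = sizeof(buffer), total = 0;

	while (wanted > 0) {                       /* do { ... } while (bytes_wanted) analogue */
		size_t got = read_some(buffer + total, wanted);
		if (got == 0)
			break;                     /* error or end of device: report residual */
		total  += got;
		wanted -= got;
	}
	printf("read %zu bytes, residual %zu\n", total, wanted);
	return 0;
}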
2351
2352kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2353
2354kern_return_t
2355ps_write_device(
2356 paging_segment_t ps,
2357 vm_offset_t offset,
2358 vm_offset_t addr,
2359 unsigned int size,
2360 struct vs_async *vsa)
2361{
2362 recnum_t dev_offset;
2363 io_buf_len_t bytes_to_write, bytes_written;
2364 recnum_t records_written;
2365 kern_return_t kr;
2366 MACH_PORT_FACE reply_port;
1c79356b
A
2367
2368
2369
de355530 2370 clustered_writes[atop(size)]++;
1c79356b
A
2371
2372 dev_offset = (ps->ps_offset +
2373 (offset >> (vm_page_shift - ps->ps_record_shift)));
2374 bytes_to_write = size;
2375
2376 if (vsa) {
2377 /*
2378 * Asynchronous write.
2379 */
2380 reply_port = vsa->reply_port;
2381 ip_lock(reply_port);
2382 reply_port->ip_sorights++;
2383 ip_reference(reply_port);
2384 ip_unlock(reply_port);
2385 {
2386 device_t device;
2387 device = dev_port_lookup(ps->ps_device);
2388
2389 vsa->vsa_addr = addr;
2390 kr=ds_device_write_common(device,
2391 reply_port,
2392 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2393 (dev_mode_t) 0,
2394 dev_offset,
2395 (io_buf_ptr_t) addr,
2396 size,
2397 (IO_WRITE | IO_CALL),
2398 &bytes_written);
2399 }
2400 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2401 if (verbose)
2402 dprintf(("%s0x%x, addr=0x%x,"
2403 "size=0x%x,offset=0x%x\n",
2404 "device_write_request returned ",
2405 kr, addr, size, offset));
2406 BS_STAT(ps->ps_bs,
de355530 2407 ps->ps_bs->bs_pages_out_fail += atop(size));
1c79356b
A
2408 /* do the completion notification to free resources */
2409 device_write_reply(reply_port, kr, 0);
2410 return PAGER_ERROR;
2411 }
2412 } else do {
2413 /*
2414 * Synchronous write.
2415 */
2416 {
2417 device_t device;
2418 device = dev_port_lookup(ps->ps_device);
2419 kr=ds_device_write_common(device,
2420 IP_NULL, 0,
2421 (dev_mode_t) 0,
2422 dev_offset,
2423 (io_buf_ptr_t) addr,
2424 size,
2425 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2426 &bytes_written);
2427 }
2428 if (kr != KERN_SUCCESS) {
2429 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2430 "device_write returned ",
2431 kr, addr, size, offset));
2432 BS_STAT(ps->ps_bs,
de355530 2433 ps->ps_bs->bs_pages_out_fail += atop(size));
1c79356b
A
2434 return PAGER_ERROR;
2435 }
2436 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2437 Panic("fragmented write");
2438 records_written = (bytes_written >>
2439 (vm_page_shift - ps->ps_record_shift));
2440 dev_offset += records_written;
2441#if 1
2442 if (bytes_written != bytes_to_write) {
2443 dprintf(("wrote only %d bytes out of %d\n",
2444 bytes_written, bytes_to_write));
2445 }
2446#endif
2447 bytes_to_write -= bytes_written;
2448 addr += bytes_written;
2449 } while (bytes_to_write > 0);
2450
2451 return PAGER_SUCCESS;
2452}
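
Both device paths convert a byte offset within the paging segment into a device record number with offset >> (vm_page_shift - ps_record_shift). A worked example of that conversion follows; the 512-byte record size, the segment start, and the sample offset are assumptions.

#include <stdio.h>

int main(void)
{
	unsigned int vm_page_shift = 12;           /* 4 KB pages (assumed) */
	unsigned int record_shift  = 3;            /* log2(4096 / 512): 8 records per page */
	unsigned int ps_offset     = 1024;         /* segment start, in records (assumed) */
	unsigned int byte_offset   = 0x6000;       /* offset within the segment, in bytes */

	/* 0x6000 >> 9 == 48 records, i.e. 24576 bytes / 512 bytes per record */
	unsigned int dev_offset = ps_offset + (byte_offset >> (vm_page_shift - record_shift));

	printf("device record = %u\n", dev_offset);    /* 1024 + 48 = 1072 */
	return 0;
}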
2453
2454
2455#else /* !DEVICE_PAGING */
2456
2457kern_return_t
2458ps_read_device(
2459 paging_segment_t ps,
2460 vm_offset_t offset,
2461 vm_offset_t *bufferp,
2462 unsigned int size,
2463 unsigned int *residualp,
2464 int flags)
2465{
2466 panic("ps_read_device not supported");
2467}
2468
2469 kern_return_t ps_write_device(
2470 paging_segment_t ps,
2471 vm_offset_t offset,
2472 vm_offset_t addr,
2473 unsigned int size,
2474 struct vs_async *vsa)
2475{
2476 panic("ps_write_device not supported");
2477}
2478
2479#endif /* DEVICE_PAGING */
2480void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
2481
2482void
2483pvs_object_data_provided(
2484 vstruct_t vs,
2485 upl_t upl,
2486 vm_offset_t offset,
2487 vm_size_t size)
2488{
1c79356b
A
2489
2490 DEBUG(DEBUG_VS_INTERNAL,
2491 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2492 upl, offset, size));
2493
2494 ASSERT(size > 0);
de355530 2495 GSTAT(global_stats.gs_pages_in += atop(size));
1c79356b
A
2496
2497
2498#if USE_PRECIOUS
2499 ps_clunmap(vs, offset, size);
2500#endif /* USE_PRECIOUS */
2501
2502}
2503
2504kern_return_t
2505pvs_cluster_read(
2506 vstruct_t vs,
0b4e3aa0 2507 vm_offset_t vs_offset,
1c79356b
A
2508 vm_size_t cnt)
2509{
1c79356b
A
2510 upl_t upl;
2511 kern_return_t error = KERN_SUCCESS;
0b4e3aa0 2512 int size;
1c79356b
A
2513 unsigned int residual;
2514 unsigned int request_flags;
0b4e3aa0
A
2515 int seg_index;
2516 int pages_in_cl;
2517 int cl_size;
2518 int cl_mask;
2519 int cl_index;
2520 int xfer_size;
2521 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2522 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2523 struct clmap clmap;
2524
2525 pages_in_cl = 1 << vs->vs_clshift;
2526 cl_size = pages_in_cl * vm_page_size;
2527 cl_mask = cl_size - 1;
1c79356b
A
2528
2529 /*
0b4e3aa0
A
2530 * This loop will be executed multiple times until the entire
2531 * request has been satisfied... if the request spans cluster
2532 * boundaries, the clusters will be checked for logical continuity,
2533 * if contiguous the I/O request will span multiple clusters, otherwise
2534 * it will be broken up into the minimal set of I/O's
1c79356b 2535 *
0b4e3aa0
A
2536 * If there are holes in a request (either unallocated pages in a paging
2537 * segment or an unallocated paging segment), we stop
1c79356b
A
2538 * reading at the hole, inform the VM of any data read, inform
2539 * the VM of an unavailable range, then loop again, hoping to
0b4e3aa0 2540 * find valid pages later in the requested range. This continues until
1c79356b
A
2541 * the entire range has been examined, and read, if present.
2542 */
2543
2544#if USE_PRECIOUS
9bccf70c 2545 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
1c79356b 2546#else
9bccf70c 2547 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
1c79356b
A
2548#endif
2549 while (cnt && (error == KERN_SUCCESS)) {
0b4e3aa0
A
2550 int ps_info_valid;
2551 int page_list_count;
1c79356b 2552
de355530 2553 if (cnt > VM_SUPER_CLUSTER)
0b4e3aa0 2554 size = VM_SUPER_CLUSTER;
de355530 2555 else
0b4e3aa0
A
2556 size = cnt;
2557 cnt -= size;
1c79356b 2558
0b4e3aa0
A
2559 ps_info_valid = 0;
2560 seg_index = 0;
1c79356b 2561
0b4e3aa0
A
2562 while (size > 0 && error == KERN_SUCCESS) {
2563 int abort_size;
2564 int failed_size;
2565 int beg_pseg;
2566 int beg_indx;
2567 vm_offset_t cur_offset;
1c79356b 2568
0b4e3aa0
A
2569
2570 if ( !ps_info_valid) {
2571 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2572 psp[seg_index] = CLMAP_PS(clmap);
2573 ps_info_valid = 1;
1c79356b 2574 }
0b4e3aa0
A
2575 /*
2576 * skip over unallocated physical segments
2577 */
2578 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2579 abort_size = cl_size - (vs_offset & cl_mask);
2580 abort_size = MIN(abort_size, size);
2581
2582 page_list_count = 0;
2583 memory_object_super_upl_request(
2584 vs->vs_control,
2585 (memory_object_offset_t)vs_offset,
2586 abort_size, abort_size,
2587 &upl, NULL, &page_list_count,
2588 request_flags);
1c79356b 2589
0b4e3aa0
A
2590 if (clmap.cl_error) {
2591 upl_abort(upl, UPL_ABORT_ERROR);
2592 } else {
2593 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2594 }
2595 upl_deallocate(upl);
1c79356b 2596
0b4e3aa0
A
2597 size -= abort_size;
2598 vs_offset += abort_size;
1c79356b 2599
0b4e3aa0
A
2600 seg_index++;
2601 ps_info_valid = 0;
2602 continue;
1c79356b 2603 }
0b4e3aa0
A
2604 cl_index = (vs_offset & cl_mask) / vm_page_size;
2605
2606 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2607 /*
2608 * skip over unallocated pages
2609 */
2610 if (CLMAP_ISSET(clmap, cl_index))
2611 break;
2612 abort_size += vm_page_size;
2613 }
2614 if (abort_size) {
2615 /*
2616 * Let VM system know about holes in clusters.
2617 */
de355530 2618 GSTAT(global_stats.gs_pages_unavail += atop(abort_size));
0b4e3aa0
A
2619
2620 page_list_count = 0;
2621 memory_object_super_upl_request(
2622 vs->vs_control,
2623 (memory_object_offset_t)vs_offset,
2624 abort_size, abort_size,
2625 &upl, NULL, &page_list_count,
1c79356b 2626 request_flags);
1c79356b 2627
0b4e3aa0
A
2628 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2629 upl_deallocate(upl);
1c79356b 2630
0b4e3aa0
A
2631 size -= abort_size;
2632 vs_offset += abort_size;
2633
2634 if (cl_index == pages_in_cl) {
2635 /*
2636 * if we're at the end of this physical cluster
2637 * then bump to the next one and continue looking
2638 */
2639 seg_index++;
2640 ps_info_valid = 0;
2641 continue;
2642 }
2643 if (size == 0)
2644 break;
2645 }
1c79356b 2646 /*
0b4e3aa0
A
2647 * remember the starting point of the first allocated page
2648 * for the I/O we're about to issue
1c79356b 2649 */
0b4e3aa0
A
2650 beg_pseg = seg_index;
2651 beg_indx = cl_index;
2652 cur_offset = vs_offset;
2653
2654 /*
2655 * calculate the size of the I/O that we can do...
2656 * this may span multiple physical segments if
2657 * they are contiguous
2658 */
2659 for (xfer_size = 0; xfer_size < size; ) {
2660
de355530 2661 while (cl_index < pages_in_cl && xfer_size < size) {
0b4e3aa0 2662 /*
de355530 2663 * accumulate allocated pages within a physical segment
1c79356b 2664 */
0b4e3aa0
A
2665 if (CLMAP_ISSET(clmap, cl_index)) {
2666 xfer_size += vm_page_size;
2667 cur_offset += vm_page_size;
2668 cl_index++;
2669
2670 BS_STAT(psp[seg_index]->ps_bs,
2671 psp[seg_index]->ps_bs->bs_pages_in++);
2672 } else
2673 break;
2674 }
de355530 2675 if (cl_index < pages_in_cl || xfer_size >= size) {
0b4e3aa0 2676 /*
de355530
A
2677 * we've hit an unallocated page or the
2678 * end of this request... go fire the I/O
1c79356b 2679 */
0b4e3aa0
A
2680 break;
2681 }
2682 /*
de355530
A
2683 * we've hit the end of the current physical segment
2684 * and there's more to do, so try moving to the next one
0b4e3aa0
A
2685 */
2686 seg_index++;
2687
de355530
A
2688 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2689 psp[seg_index] = CLMAP_PS(clmap);
0b4e3aa0
A
2690 ps_info_valid = 1;
2691
2692 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2693 /*
de355530
A
2694 * if the physical segment we're about to step into
2695 * is not contiguous to the one we're currently
2696 * in, or it's in a different paging file, or
0b4e3aa0
A
2697 * it hasn't been allocated....
2698 * we stop here and generate the I/O
2699 */
2700 break;
1c79356b 2701 }
0b4e3aa0 2702 /*
de355530 2703 * start with first page of the next physical segment
0b4e3aa0
A
2704 */
2705 cl_index = 0;
1c79356b 2706 }
0b4e3aa0
A
2707 if (xfer_size) {
2708 /*
2709 * we have a contiguous range of allocated pages
2710 * to read from
2711 */
2712 page_list_count = 0;
2713 memory_object_super_upl_request(vs->vs_control,
de355530
A
2714 (memory_object_offset_t)vs_offset,
2715 xfer_size, xfer_size,
2716 &upl, NULL, &page_list_count,
2717 request_flags | UPL_SET_INTERNAL);
0b4e3aa0 2718
de355530
A
2719 error = ps_read_file(psp[beg_pseg], upl, (vm_offset_t) 0,
2720 ps_offset[beg_pseg] + (beg_indx * vm_page_size), xfer_size, &residual, 0);
0b4e3aa0
A
2721 } else
2722 continue;
1c79356b 2723
0b4e3aa0
A
2724 failed_size = 0;
2725
2726 /*
de355530
A
2727 * Adjust counts and send response to VM. Optimize for the
2728 * common case, i.e. no error and/or partial data.
2729 * If there was an error, then we need to error the entire
2730 * range, even if some data was successfully read.
2731 * If there was a partial read we may supply some
0b4e3aa0
A
2732 * data and may error some as well. In all cases the
2733 * VM must receive some notification for every page in the
2734 * range.
2735 */
2736 if ((error == KERN_SUCCESS) && (residual == 0)) {
2737 /*
de355530
A
2738 * Got everything we asked for, supply the data to
2739 * the VM. Note that as a side effect of supplying
2740 * the data, the buffer holding the supplied data is
2741 * deallocated from the pager's address space.
0b4e3aa0 2742 */
de355530 2743 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
0b4e3aa0
A
2744 } else {
2745 failed_size = xfer_size;
2746
2747 if (error == KERN_SUCCESS) {
2748 if (residual == xfer_size) {
de355530
A
2749 /*
2750 * If a read operation returns no error
2751 * and no data moved, we turn it into
2752 * an error, assuming we're reading at
2753 * or beyond EOF.
2754 * Fall through and error the entire
2755 * range.
2756 */
0b4e3aa0
A
2757 error = KERN_FAILURE;
2758 } else {
de355530
A
2759 /*
2760 * Otherwise, we have a partial read. If
2761 * the part read is an integral number
2762 * of pages, supply it. Otherwise round
2763 * it up to a page boundary, zero fill
2764 * the unread part, and supply it.
2765 * Fall through and error the remainder
2766 * of the range, if any.
2767 */
0b4e3aa0
A
2768 int fill, lsize;
2769
de355530
A
2770 fill = residual & ~vm_page_size;
2771 lsize = (xfer_size - residual) + fill;
2772 pvs_object_data_provided(vs, upl, vs_offset, lsize);
0b4e3aa0
A
2773
2774 if (lsize < xfer_size) {
de355530 2775 failed_size = xfer_size - lsize;
0b4e3aa0
A
2776 error = KERN_FAILURE;
2777 }
2778 }
2779 }
2780 }
1c79356b
A
2781 /*
2782 * If there was an error in any part of the range, tell
de355530
A
2783 * the VM. Note that error is explicitly checked again since
2784 * it can be modified above.
1c79356b
A
2785 */
2786 if (error != KERN_SUCCESS) {
0b4e3aa0 2787 BS_STAT(psp[beg_pseg]->ps_bs,
de355530 2788 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop(failed_size));
1c79356b 2789 }
0b4e3aa0
A
2790 size -= xfer_size;
2791 vs_offset += xfer_size;
1c79356b 2792 }
1c79356b
A
2793
2794 } /* END while (cnt && (error == 0)) */
2795 return error;
2796}
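
The read path above walks the cluster maps, reporting holes to the VM and batching runs of contiguous allocated pages into single I/Os. The standalone sketch below shows only that coalescing strategy over a flat page map; the 16-page map and its contents are made up for illustration.

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	bool allocated[16] = { 1,1,1,0, 0,1,1,1, 1,1,0,0, 1,1,1,1 };
	int npages = 16, i = 0;

	while (i < npages) {
		int start = i;

		if (!allocated[i]) {               /* hole: report the range as unavailable */
			while (i < npages && !allocated[i])
				i++;
			printf("pages %2d-%2d: hole, report range unavailable\n", start, i - 1);
			continue;
		}
		while (i < npages && allocated[i]) /* run: issue one read for the whole run */
			i++;
		printf("pages %2d-%2d: issue one read for %d pages\n", start, i - 1, i - start);
	}
	return 0;
}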
2797
2798int vs_do_async_write = 1;
2799
2800kern_return_t
2801vs_cluster_write(
2802 vstruct_t vs,
2803 upl_t internal_upl,
2804 vm_offset_t offset,
2805 vm_size_t cnt,
2806 boolean_t dp_internal,
2807 int flags)
2808{
1c79356b
A
2809 vm_offset_t size;
2810 vm_offset_t transfer_size;
1c79356b
A
2811 int error = 0;
2812 struct clmap clmap;
0b4e3aa0
A
2813
2814 vm_offset_t actual_offset; /* Offset within paging segment */
1c79356b 2815 paging_segment_t ps;
0b4e3aa0
A
2816 vm_offset_t subx_size;
2817 vm_offset_t mobj_base_addr;
2818 vm_offset_t mobj_target_addr;
2819 int mobj_size;
2820
1c79356b
A
2821 struct vs_async *vsa;
2822 vm_map_copy_t copy;
1c79356b
A
2823
2824 upl_t upl;
0b4e3aa0 2825 upl_page_info_t *pl;
1c79356b
A
2826 int page_index;
2827 int list_size;
2828 int cl_size;
1c79356b 2829
1c79356b 2830 if (!dp_internal) {
0b4e3aa0 2831 int page_list_count;
1c79356b
A
2832 int request_flags;
2833 int super_size;
0b4e3aa0
A
2834 int first_dirty;
2835 int num_dirty;
2836 int num_of_pages;
2837 int seg_index;
2838 int pages_in_cl;
2839 int must_abort;
1c79356b 2840 vm_offset_t upl_offset;
0b4e3aa0
A
2841 vm_offset_t seg_offset;
2842 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2843 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2844
1c79356b 2845
0b4e3aa0
A
2846 pages_in_cl = 1 << vs->vs_clshift;
2847 cl_size = pages_in_cl * vm_page_size;
1c79356b
A
2848
2849 if (bs_low) {
2850 super_size = cl_size;
0b4e3aa0 2851
1c79356b
A
2852 request_flags = UPL_NOBLOCK |
2853 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2854 UPL_NO_SYNC | UPL_SET_INTERNAL;
2855 } else {
2856 super_size = VM_SUPER_CLUSTER;
0b4e3aa0 2857
1c79356b
A
2858 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2859 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2860 UPL_NO_SYNC | UPL_SET_INTERNAL;
2861 }
2862
0b4e3aa0
A
2863 page_list_count = 0;
2864 memory_object_super_upl_request(vs->vs_control,
2865 (memory_object_offset_t)offset,
2866 cnt, super_size,
2867 &upl, NULL, &page_list_count,
de355530 2868 request_flags | UPL_PAGEOUT);
1c79356b 2869
0b4e3aa0 2870 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 2871
de355530 2872 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
1c79356b 2873
de355530
A
2874 ps_offset[seg_index] = ps_clmap(vs, upl->offset + (seg_index * cl_size),
2875 &clmap, CL_ALLOC,
2876 transfer_size < cl_size ?
2877 transfer_size : cl_size, 0);
1c79356b 2878
0b4e3aa0
A
2879 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2880 upl_abort(upl, 0);
2881 upl_deallocate(upl);
2882
2883 return KERN_FAILURE;
1c79356b 2884
0b4e3aa0
A
2885 }
2886 psp[seg_index] = CLMAP_PS(clmap);
1c79356b 2887
0b4e3aa0
A
2888 if (transfer_size > cl_size) {
2889 transfer_size -= cl_size;
2890 seg_index++;
2891 } else
2892 transfer_size = 0;
2893 }
de355530 2894 for (page_index = 0, num_of_pages = upl->size / vm_page_size; page_index < num_of_pages; ) {
0b4e3aa0
A
2895 /*
2896 * skip over non-dirty pages
2897 */
2898 for ( ; page_index < num_of_pages; page_index++) {
de355530 2899 if (UPL_DIRTY_PAGE(pl, page_index) || UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
2900 /*
2901 * this is a page we need to write
de355530
A
2902 * go see if we can buddy it up with others
2903 * that are contiguous to it
0b4e3aa0
A
2904 */
2905 break;
2906 /*
de355530
A
2907 * if the page is not dirty, but present, we need to commit it...
2908 * this is an unusual case since we only asked for dirty pages
0b4e3aa0
A
2909 */
2910 if (UPL_PAGE_PRESENT(pl, page_index)) {
2911 boolean_t empty = FALSE;
2912 upl_commit_range(upl,
2913 page_index * vm_page_size,
2914 vm_page_size,
2915 UPL_COMMIT_NOTIFY_EMPTY,
2916 pl,
d52fe63f 2917 page_list_count,
0b4e3aa0
A
2918 &empty);
2919 if (empty)
2920 upl_deallocate(upl);
1c79356b 2921 }
1c79356b 2922 }
0b4e3aa0
A
2923 if (page_index == num_of_pages)
2924 /*
2925 * no more pages to look at, we're out of here
2926 */
2927 break;
1c79356b 2928
0b4e3aa0 2929 /*
de355530
A
2930 * gather up contiguous dirty pages... we have at least 1
2931 * otherwise we would have bailed above
0b4e3aa0
A
2932 * make sure that each physical segment that we step
2933 * into is contiguous to the one we're currently in
2934 * if it's not, we have to stop and write what we have
2935 */
de355530
A
2936 for (first_dirty = page_index; page_index < num_of_pages; ) {
2937 if ( !UPL_DIRTY_PAGE(pl, page_index) && !UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
2938 break;
2939 page_index++;
2940 /*
2941 * if we just looked at the last page in the UPL
2942 * we don't need to check for physical segment
2943 * continuity
2944 */
2945 if (page_index < num_of_pages) {
2946 int cur_seg;
2947 int nxt_seg;
2948
de355530 2949 cur_seg = (page_index - 1) / pages_in_cl;
0b4e3aa0
A
2950 nxt_seg = page_index / pages_in_cl;
2951
2952 if (cur_seg != nxt_seg) {
2953 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
de355530
A
2954 /*
2955 * if the segment we're about to step into
2956 * is not contiguous to the one we're currently
2957 * in, or it's in a different paging file....
2958 * we stop here and generate the I/O
2959 */
0b4e3aa0 2960 break;
1c79356b 2961 }
1c79356b 2962 }
0b4e3aa0
A
2963 }
2964 num_dirty = page_index - first_dirty;
2965 must_abort = 1;
1c79356b 2966
0b4e3aa0
A
2967 if (num_dirty) {
2968 upl_offset = first_dirty * vm_page_size;
2969 seg_index = first_dirty / pages_in_cl;
2970 seg_offset = upl_offset - (seg_index * cl_size);
2971 transfer_size = num_dirty * vm_page_size;
2972
de355530
A
2973 error = ps_write_file(psp[seg_index], upl, upl_offset,
2974 ps_offset[seg_index] + seg_offset, transfer_size, flags);
0b4e3aa0 2975
de355530
A
2976 if (error == 0) {
2977 while (transfer_size) {
2978 int seg_size;
1c79356b 2979
de355530
A
2980 if ((seg_size = cl_size - (upl_offset % cl_size)) > transfer_size)
2981 seg_size = transfer_size;
0b4e3aa0 2982
de355530 2983 ps_vs_write_complete(vs, upl->offset + upl_offset, seg_size, error);
0b4e3aa0 2984
de355530
A
2985 transfer_size -= seg_size;
2986 upl_offset += seg_size;
2987 }
0b4e3aa0 2988 }
9bccf70c 2989 must_abort = 0;
0b4e3aa0
A
2990 }
2991 if (must_abort) {
2992 boolean_t empty = FALSE;
2993 upl_abort_range(upl,
2994 first_dirty * vm_page_size,
2995 num_dirty * vm_page_size,
2996 UPL_ABORT_NOTIFY_EMPTY,
2997 &empty);
2998 if (empty)
2999 upl_deallocate(upl);
1c79356b 3000 }
1c79356b 3001 }
0b4e3aa0 3002
1c79356b
A
3003 } else {
3004 assert(cnt <= (vm_page_size << vs->vs_clshift));
3005 list_size = cnt;
3006
3007 page_index = 0;
3008 /* The caller provides a mapped_data which is derived */
3009 /* from a temporary object. The targeted pages are */
3010 /* guaranteed to be set at offset 0 in the mapped_data */
3011 /* The actual offset however must still be derived */
3012 /* from the offset in the vs in question */
3013 mobj_base_addr = offset;
3014 mobj_target_addr = mobj_base_addr;
3015
3016 for (transfer_size = list_size; transfer_size != 0;) {
3017 actual_offset = ps_clmap(vs, mobj_target_addr,
3018 &clmap, CL_ALLOC,
3019 transfer_size < cl_size ?
3020 transfer_size : cl_size, 0);
3021 if(actual_offset == (vm_offset_t) -1) {
3022 error = 1;
3023 break;
3024 }
3025 cnt = MIN(transfer_size,
3026 CLMAP_NPGS(clmap) * vm_page_size);
3027 ps = CLMAP_PS(clmap);
3028 /* Assume that the caller has given us contiguous */
3029 /* pages */
3030 if(cnt) {
3031 error = ps_write_file(ps, internal_upl,
3032 0, actual_offset,
3033 cnt, flags);
3034 if (error)
3035 break;
de355530
A
3036 ps_vs_write_complete(vs, mobj_target_addr,
3037 cnt, error);
3038 }
1c79356b
A
3039 if (error)
3040 break;
3041 actual_offset += cnt;
3042 mobj_target_addr += cnt;
3043 transfer_size -= cnt;
3044 cnt = 0;
3045
3046 if (error)
3047 break;
3048 }
3049 }
3050 if(error)
3051 return KERN_FAILURE;
3052 else
3053 return KERN_SUCCESS;
3054}
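
Once vs_cluster_write() has found a run of dirty pages, it converts the first index and run length into a UPL offset, a cluster (segment) index, an offset within that cluster, and a transfer size. A worked example of that bookkeeping follows; the page size, cluster width, and sample indices are assumptions.

#include <stdio.h>

int main(void)
{
	unsigned int vm_page_size = 4096;
	unsigned int pages_in_cl  = 4;                                 /* 1 << vs_clshift (assumed) */
	unsigned int cl_size      = pages_in_cl * vm_page_size;

	unsigned int first_dirty  = 6;                                 /* found by the scan */
	unsigned int num_dirty    = 3;

	unsigned int upl_offset   = first_dirty * vm_page_size;        /* 0x6000 */
	unsigned int seg_index    = first_dirty / pages_in_cl;         /* cluster 1 */
	unsigned int seg_offset   = upl_offset - seg_index * cl_size;  /* 0x2000 into that cluster */
	unsigned int transfer     = num_dirty * vm_page_size;          /* 0x3000 bytes */

	printf("upl_offset=0x%x seg_index=%u seg_offset=0x%x transfer=0x%x\n",
	       upl_offset, seg_index, seg_offset, transfer);
	return 0;
}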
3055
3056vm_size_t
3057ps_vstruct_allocated_size(
3058 vstruct_t vs)
3059{
3060 int num_pages;
3061 struct vs_map *vsmap;
3062 int i, j, k;
3063
3064 num_pages = 0;
3065 if (vs->vs_indirect) {
3066 /* loop on indirect maps */
3067 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3068 vsmap = vs->vs_imap[i];
3069 if (vsmap == NULL)
3070 continue;
3071 /* loop on clusters in this indirect map */
3072 for (j = 0; j < CLMAP_ENTRIES; j++) {
3073 if (VSM_ISCLR(vsmap[j]) ||
3074 VSM_ISERR(vsmap[j]))
3075 continue;
3076 /* loop on pages in this cluster */
3077 for (k = 0; k < VSCLSIZE(vs); k++) {
3078 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3079 num_pages++;
3080 }
3081 }
3082 }
3083 } else {
3084 vsmap = vs->vs_dmap;
3085 if (vsmap == NULL)
3086 return 0;
3087 /* loop on clusters in the direct map */
3088 for (j = 0; j < CLMAP_ENTRIES; j++) {
3089 if (VSM_ISCLR(vsmap[j]) ||
3090 VSM_ISERR(vsmap[j]))
3091 continue;
3092 /* loop on pages in this cluster */
3093 for (k = 0; k < VSCLSIZE(vs); k++) {
3094 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3095 num_pages++;
3096 }
3097 }
3098 }
3099
de355530 3100 return ptoa(num_pages);
1c79356b
A
3101}
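
ps_vstruct_allocated_size() is a straight bit count over the per-cluster page bitmaps, converted to bytes at the end. A minimal sketch of that counting follows, with three made-up bitmap values and an assumed four-page cluster.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t bitmaps[] = { 0xF, 0x3, 0x0 };    /* per-cluster VSM_BMAP() values (made up) */
	unsigned int pages_per_cluster = 4;
	unsigned int vm_page_size = 4096;
	unsigned int num_pages = 0;

	for (unsigned int j = 0; j < sizeof(bitmaps) / sizeof(bitmaps[0]); j++)
		for (unsigned int k = 0; k < pages_per_cluster; k++)
			if (bitmaps[j] & (1u << k))
				num_pages++;

	printf("%u pages allocated = %u bytes\n", num_pages, num_pages * vm_page_size);
	return 0;
}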
3102
3103size_t
3104ps_vstruct_allocated_pages(
3105 vstruct_t vs,
3106 default_pager_page_t *pages,
3107 size_t pages_size)
3108{
3109 int num_pages;
3110 struct vs_map *vsmap;
3111 vm_offset_t offset;
3112 int i, j, k;
3113
3114 num_pages = 0;
3115 offset = 0;
3116 if (vs->vs_indirect) {
3117 /* loop on indirect maps */
3118 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3119 vsmap = vs->vs_imap[i];
3120 if (vsmap == NULL) {
3121 offset += (vm_page_size * CLMAP_ENTRIES *
3122 VSCLSIZE(vs));
3123 continue;
3124 }
3125 /* loop on clusters in this indirect map */
3126 for (j = 0; j < CLMAP_ENTRIES; j++) {
3127 if (VSM_ISCLR(vsmap[j]) ||
3128 VSM_ISERR(vsmap[j])) {
3129 offset += vm_page_size * VSCLSIZE(vs);
3130 continue;
3131 }
3132 /* loop on pages in this cluster */
3133 for (k = 0; k < VSCLSIZE(vs); k++) {
3134 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3135 num_pages++;
3136 if (num_pages < pages_size)
3137 pages++->dpp_offset =
3138 offset;
3139 }
3140 offset += vm_page_size;
3141 }
3142 }
3143 }
3144 } else {
3145 vsmap = vs->vs_dmap;
3146 if (vsmap == NULL)
3147 return 0;
3148 /* loop on clusters in the direct map */
3149 for (j = 0; j < CLMAP_ENTRIES; j++) {
3150 if (VSM_ISCLR(vsmap[j]) ||
3151 VSM_ISERR(vsmap[j])) {
3152 offset += vm_page_size * VSCLSIZE(vs);
3153 continue;
3154 }
3155 /* loop on pages in this cluster */
3156 for (k = 0; k < VSCLSIZE(vs); k++) {
3157 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3158 num_pages++;
3159 if (num_pages < pages_size)
3160 pages++->dpp_offset = offset;
3161 }
3162 offset += vm_page_size;
3163 }
3164 }
3165 }
3166
3167 return num_pages;
3168}
3169
3170
3171kern_return_t
3172ps_vstruct_transfer_from_segment(
3173 vstruct_t vs,
3174 paging_segment_t segment,
1c79356b 3175 upl_t upl)
1c79356b
A
3176{
3177 struct vs_map *vsmap;
3178 struct vs_map old_vsmap;
3179 struct vs_map new_vsmap;
3180 int i, j, k;
3181
3182 VS_LOCK(vs); /* block all work on this vstruct */
3183 /* can't allow the normal multiple write */
3184 /* semantic because writes may conflict */
3185 vs->vs_xfer_pending = TRUE;
3186 vs_wait_for_sync_writers(vs);
3187 vs_start_write(vs);
3188 vs_wait_for_readers(vs);
3189 /* we will unlock the vs to allow other writes while transferring */
3190 /* and will be guaranteed of the persistence of the vs struct */
3191 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3192 /* vs_async_pending */
3193 /* OK we now have guaranteed no other parties are accessing this */
3194 /* vs. Now that we are also supporting simple lock versions of */
3195 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3196 /* our purpose in holding it before was the multiple write case */
3197 /* we now use the boolean xfer_pending to do that. We can use */
3198 /* a boolean instead of a count because we have guaranteed single */
3199 /* file access to this code in its caller */
3200 VS_UNLOCK(vs);
3201vs_changed:
3202 if (vs->vs_indirect) {
3203 int vsmap_size;
3204 int clmap_off;
3205 /* loop on indirect maps */
3206 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3207 vsmap = vs->vs_imap[i];
3208 if (vsmap == NULL)
3209 continue;
3210 /* loop on clusters in this indirect map */
3211 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3212 VSCLSIZE(vs) * i);
3213 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3214 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3215 else
3216 vsmap_size = CLMAP_ENTRIES;
3217 for (j = 0; j < vsmap_size; j++) {
3218 if (VSM_ISCLR(vsmap[j]) ||
3219 VSM_ISERR(vsmap[j]) ||
3220 (VSM_PS(vsmap[j]) != segment))
3221 continue;
3222 if(vs_cluster_transfer(vs,
3223 (vm_page_size * (j << vs->vs_clshift))
3224 + clmap_off,
3225 vm_page_size << vs->vs_clshift,
1c79356b 3226 upl)
1c79356b
A
3227 != KERN_SUCCESS) {
3228 VS_LOCK(vs);
3229 vs->vs_xfer_pending = FALSE;
3230 VS_UNLOCK(vs);
3231 vs_finish_write(vs);
3232 return KERN_FAILURE;
3233 }
3234 /* allow other readers/writers during transfer*/
3235 VS_LOCK(vs);
3236 vs->vs_xfer_pending = FALSE;
3237 VS_UNLOCK(vs);
3238 vs_finish_write(vs);
3239 VS_LOCK(vs);
3240 vs->vs_xfer_pending = TRUE;
1c79356b
A
3241 vs_wait_for_sync_writers(vs);
3242 vs_start_write(vs);
3243 vs_wait_for_readers(vs);
0b4e3aa0 3244 VS_UNLOCK(vs);
1c79356b
A
3245 if (!(vs->vs_indirect)) {
3246 goto vs_changed;
3247 }
3248 }
3249 }
3250 } else {
3251 vsmap = vs->vs_dmap;
3252 if (vsmap == NULL) {
3253 VS_LOCK(vs);
3254 vs->vs_xfer_pending = FALSE;
3255 VS_UNLOCK(vs);
3256 vs_finish_write(vs);
3257 return KERN_SUCCESS;
3258 }
3259 /* loop on clusters in the direct map */
3260 for (j = 0; j < vs->vs_size; j++) {
3261 if (VSM_ISCLR(vsmap[j]) ||
3262 VSM_ISERR(vsmap[j]) ||
3263 (VSM_PS(vsmap[j]) != segment))
3264 continue;
3265 if(vs_cluster_transfer(vs,
3266 vm_page_size * (j << vs->vs_clshift),
3267 vm_page_size << vs->vs_clshift,
1c79356b 3268 upl) != KERN_SUCCESS) {
1c79356b
A
3269 VS_LOCK(vs);
3270 vs->vs_xfer_pending = FALSE;
3271 VS_UNLOCK(vs);
3272 vs_finish_write(vs);
3273 return KERN_FAILURE;
3274 }
3275 /* allow other readers/writers during transfer*/
3276 VS_LOCK(vs);
3277 vs->vs_xfer_pending = FALSE;
3278 VS_UNLOCK(vs);
3279 vs_finish_write(vs);
3280 VS_LOCK(vs);
3281 vs->vs_xfer_pending = TRUE;
3282 VS_UNLOCK(vs);
3283 vs_wait_for_sync_writers(vs);
3284 vs_start_write(vs);
3285 vs_wait_for_readers(vs);
3286 if (vs->vs_indirect) {
3287 goto vs_changed;
3288 }
3289 }
3290 }
3291
3292 VS_LOCK(vs);
3293 vs->vs_xfer_pending = FALSE;
3294 VS_UNLOCK(vs);
3295 vs_finish_write(vs);
3296 return KERN_SUCCESS;
3297}
3298
3299
3300
3301vs_map_t
3302vs_get_map_entry(
3303 vstruct_t vs,
3304 vm_offset_t offset)
3305{
3306 struct vs_map *vsmap;
3307 vm_offset_t cluster;
3308
de355530 3309 cluster = atop(offset) >> vs->vs_clshift;
1c79356b
A
3310 if (vs->vs_indirect) {
3311 long ind_block = cluster/CLMAP_ENTRIES;
3312
3313 /* Is the indirect block allocated? */
3314 vsmap = vs->vs_imap[ind_block];
3315 if(vsmap == (vs_map_t) NULL)
3316 return vsmap;
3317 } else
3318 vsmap = vs->vs_dmap;
3319 vsmap += cluster%CLMAP_ENTRIES;
3320 return vsmap;
3321}
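
vs_get_map_entry() resolves a cluster number either through a flat direct map or through a two-level indirect map whose blocks may not be allocated yet. The sketch below models just that lookup; the structure layout, CLMAP_ENTRIES value, and sample data are assumptions, not the kernel's vs_map.

#include <stdio.h>
#include <stddef.h>

#define CLMAP_ENTRIES 4                            /* entries per map block (assumed) */

typedef struct { int ps_index; unsigned int bmap; } vs_map_sketch_t;

static vs_map_sketch_t *
lookup(vs_map_sketch_t **imap, vs_map_sketch_t *dmap, int indirect, unsigned int cluster)
{
	if (indirect) {
		vs_map_sketch_t *block = imap[cluster / CLMAP_ENTRIES];
		if (block == NULL)
			return NULL;               /* indirect block never allocated */
		return &block[cluster % CLMAP_ENTRIES];
	}
	return &dmap[cluster % CLMAP_ENTRIES];
}

int main(void)
{
	vs_map_sketch_t block0[CLMAP_ENTRIES] = { {1, 0xF}, {1, 0x3}, {2, 0x1}, {2, 0x0} };
	vs_map_sketch_t *imap[2] = { block0, NULL };

	vs_map_sketch_t *e = lookup(imap, NULL, 1, 2);
	if (e)
		printf("cluster 2: paging segment %d, bitmap 0x%x\n", e->ps_index, e->bmap);
	if (lookup(imap, NULL, 1, 5) == NULL)
		printf("cluster 5: indirect block not allocated\n");
	return 0;
}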
3322
3323kern_return_t
3324vs_cluster_transfer(
3325 vstruct_t vs,
3326 vm_offset_t offset,
3327 vm_size_t cnt,
1c79356b 3328 upl_t upl)
1c79356b
A
3329{
3330 vm_offset_t actual_offset;
3331 paging_segment_t ps;
3332 struct clmap clmap;
3333 kern_return_t error = KERN_SUCCESS;
3334 int size, size_wanted, i;
3335 unsigned int residual;
3336 int unavail_size;
3337 default_pager_thread_t *dpt;
3338 boolean_t dealloc;
3339 struct vs_map *vsmap_ptr;
3340 struct vs_map read_vsmap;
3341 struct vs_map original_read_vsmap;
3342 struct vs_map write_vsmap;
3343 upl_t sync_upl;
1c79356b
A
3344 vm_offset_t ioaddr;
3345
1c79356b
A
3346 /* vs_cluster_transfer reads in the pages of a cluster and
3347 * then writes these pages back to new backing store. The
3348 * segment the pages are being read from is assumed to have
3349 * been taken off-line and is no longer considered for new
3350 * space requests.
3351 */
3352
3353 /*
3354 * This loop will be executed once per cluster referenced.
3355 * Typically this means once, since it's unlikely that the
3356 * VM system will ask for anything spanning cluster boundaries.
3357 *
3358 * If there are holes in a cluster (in a paging segment), we stop
3359 * reading at the hole, then loop again, hoping to
3360 * find valid pages later in the cluster. This continues until
3361 * the entire range has been examined, and read, if present. The
3362 * pages are written as they are read. If a failure occurs after
3363 * some pages are written, the unmap call at the bottom of the loop
3364 * recovers the backing store and the old backing store remains
3365 * in effect.
3366 */
3367
1c79356b
A
3368 VSM_CLR(write_vsmap);
3369 VSM_CLR(original_read_vsmap);
3370 /* grab the actual object's pages to sync with I/O */
3371 while (cnt && (error == KERN_SUCCESS)) {
3372 vsmap_ptr = vs_get_map_entry(vs, offset);
3373 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3374
3375 if (actual_offset == (vm_offset_t) -1) {
3376
3377 /*
3378 * Nothing left to write in this cluster. At least
3379 * set write cluster information for any previous
3380 * write; clear for the next cluster, if there is one.
3381 */
3382 unsigned int local_size, clmask, clsize;
3383
3384 clsize = vm_page_size << vs->vs_clshift;
3385 clmask = clsize - 1;
3386 local_size = clsize - (offset & clmask);
3387 ASSERT(local_size);
3388 local_size = MIN(local_size, cnt);
3389
3390 /* This cluster has no data in it beyond what may */
3391 /* have been found on a previous iteration through */
3392 /* the loop "write_vsmap" */
3393 *vsmap_ptr = write_vsmap;
3394 VSM_CLR(write_vsmap);
3395 VSM_CLR(original_read_vsmap);
3396
3397 cnt -= local_size;
3398 offset += local_size;
3399 continue;
3400 }
3401
3402 /*
3403 * Count up contiguous available or unavailable
3404 * pages.
3405 */
3406 ps = CLMAP_PS(clmap);
3407 ASSERT(ps);
3408 size = 0;
3409 unavail_size = 0;
3410 for (i = 0;
3411 (size < cnt) && (unavail_size < cnt) &&
3412 (i < CLMAP_NPGS(clmap)); i++) {
3413 if (CLMAP_ISSET(clmap, i)) {
3414 if (unavail_size != 0)
3415 break;
3416 size += vm_page_size;
3417 BS_STAT(ps->ps_bs,
3418 ps->ps_bs->bs_pages_in++);
3419 } else {
3420 if (size != 0)
3421 break;
3422 unavail_size += vm_page_size;
3423 }
3424 }
3425
3426 if (size == 0) {
3427 ASSERT(unavail_size);
3428 cnt -= unavail_size;
3429 offset += unavail_size;
3430 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3431 == 0) {
3432 /* There is no more to transfer in this
3433 cluster
3434 */
3435 *vsmap_ptr = write_vsmap;
3436 VSM_CLR(write_vsmap);
3437 VSM_CLR(original_read_vsmap);
3438 }
3439 continue;
3440 }
3441
3442 if(VSM_ISCLR(original_read_vsmap))
3443 original_read_vsmap = *vsmap_ptr;
3444
3445 if(ps->ps_segtype == PS_PARTITION) {
3446/*
9bccf70c 3447 NEED TO ISSUE WITH SYNC & NO COMMIT
1c79356b
A
3448 error = ps_read_device(ps, actual_offset, &buffer,
3449 size, &residual, flags);
3450*/
3451 } else {
9bccf70c 3452 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
0b4e3aa0 3453 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
1c79356b 3454 size, &residual,
9bccf70c 3455 (UPL_IOSYNC | UPL_NOCOMMIT));
1c79356b
A
3456 }
3457
3458 read_vsmap = *vsmap_ptr;
3459
3460
3461 /*
3462 * Adjust counts and put data in new BS. Optimize for the
3463 * common case, i.e. no error and/or partial data.
3464 * If there was an error, then we need to error the entire
3465 * range, even if some data was successfully read.
3466 *
3467 */
3468 if ((error == KERN_SUCCESS) && (residual == 0)) {
0b4e3aa0
A
3469 int page_list_count = 0;
3470
1c79356b
A
3471 /*
3472 * Got everything we asked for, supply the data to
3473 * the new BS. Note that as a side effect of supplying
3474 * the data, the buffer holding the supplied data is
3475 * deallocated from the pager's address space unless
3476 * the write is unsuccessful.
3477 */
3478
3479 /* note buffer will be cleaned up in all cases by */
3480 /* internal_cluster_write or if an error on write */
3481 /* the vm_map_copy_page_discard call */
3482 *vsmap_ptr = write_vsmap;
3483
1c79356b
A
3484 if(vs_cluster_write(vs, upl, offset,
3485 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
1c79356b
A
3486 error = KERN_FAILURE;
3487 if(!(VSM_ISCLR(*vsmap_ptr))) {
3488 /* unmap the new backing store object */
3489 ps_clunmap(vs, offset, size);
3490 }
3491 /* original vsmap */
3492 *vsmap_ptr = original_read_vsmap;
3493 VSM_CLR(write_vsmap);
3494 } else {
3495 if((offset + size) &
3496 ((vm_page_size << vs->vs_clshift)
3497 - 1)) {
3498 /* There is more to transfer in this
3499 cluster
3500 */
3501 write_vsmap = *vsmap_ptr;
3502 *vsmap_ptr = read_vsmap;
3503 } else {
3504 /* discard the old backing object */
3505 write_vsmap = *vsmap_ptr;
3506 *vsmap_ptr = read_vsmap;
3507 ps_clunmap(vs, offset, size);
3508 *vsmap_ptr = write_vsmap;
3509 VSM_CLR(write_vsmap);
3510 VSM_CLR(original_read_vsmap);
3511 }
3512 }
3513 } else {
3514 size_wanted = size;
3515 if (error == KERN_SUCCESS) {
3516 if (residual == size) {
3517 /*
3518 * If a read operation returns no error
3519 * and no data moved, we turn it into
3520 * an error, assuming we're reading at
3521 * or beyond EOF.
3522 * Fall through and error the entire
3523 * range.
3524 */
3525 error = KERN_FAILURE;
3526 *vsmap_ptr = write_vsmap;
3527 if(!(VSM_ISCLR(*vsmap_ptr))) {
3528 /* unmap the new backing store object */
3529 ps_clunmap(vs, offset, size);
3530 }
3531 *vsmap_ptr = original_read_vsmap;
3532 VSM_CLR(write_vsmap);
3533 continue;
3534 } else {
3535 /*
3536 * Otherwise, we have a partial read.
3537 * This is also considered an error
3538 * for the purposes of cluster transfer
3539 */
3540 error = KERN_FAILURE;
3541 *vsmap_ptr = write_vsmap;
3542 if(!(VSM_ISCLR(*vsmap_ptr))) {
3543 /* unmap the new backing store object */
3544 ps_clunmap(vs, offset, size);
3545 }
3546 *vsmap_ptr = original_read_vsmap;
3547 VSM_CLR(write_vsmap);
3548 continue;
3549 }
3550 }
3551
3552 }
3553 cnt -= size;
3554 offset += size;
3555
3556 } /* END while (cnt && (error == 0)) */
3557 if(!VSM_ISCLR(write_vsmap))
3558 *vsmap_ptr = write_vsmap;
3559
1c79356b
A
3560 return error;
3561}
3562
3563kern_return_t
3564default_pager_add_file(MACH_PORT_FACE backing_store,
3565 int *vp,
3566 int record_size,
3567 long size)
3568{
3569 backing_store_t bs;
3570 paging_segment_t ps;
3571 int i;
3572 int error;
1c79356b
A
3573
3574 if ((bs = backing_store_lookup(backing_store))
3575 == BACKING_STORE_NULL)
3576 return KERN_INVALID_ARGUMENT;
3577
3578 PSL_LOCK();
3579 for (i = 0; i <= paging_segment_max; i++) {
3580 ps = paging_segments[i];
3581 if (ps == PAGING_SEGMENT_NULL)
3582 continue;
3583 if (ps->ps_segtype != PS_FILE)
3584 continue;
3585
3586 /*
3587 * Check for overlap on same device.
3588 */
3589 if (ps->ps_vnode == (struct vnode *)vp) {
3590 PSL_UNLOCK();
3591 BS_UNLOCK(bs);
3592 return KERN_INVALID_ARGUMENT;
3593 }
3594 }
3595 PSL_UNLOCK();
3596
3597 /*
3598 * Set up the paging segment
3599 */
3600 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3601 if (ps == PAGING_SEGMENT_NULL) {
3602 BS_UNLOCK(bs);
3603 return KERN_RESOURCE_SHORTAGE;
3604 }
3605
3606 ps->ps_segtype = PS_FILE;
3607 ps->ps_vnode = (struct vnode *)vp;
3608 ps->ps_offset = 0;
3609 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3610 ps->ps_recnum = size;
3611 ps->ps_pgnum = size >> ps->ps_record_shift;
3612
3613 ps->ps_pgcount = ps->ps_pgnum;
3614 ps->ps_clshift = local_log2(bs->bs_clsize);
3615 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3616 ps->ps_hint = 0;
3617
3618 PS_LOCK_INIT(ps);
3619 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3620 if (!ps->ps_bmap) {
3621 kfree((vm_offset_t)ps, sizeof *ps);
3622 BS_UNLOCK(bs);
3623 return KERN_RESOURCE_SHORTAGE;
3624 }
3625 for (i = 0; i < ps->ps_ncls; i++) {
3626 clrbit(ps->ps_bmap, i);
3627 }
3628
3629 ps->ps_going_away = FALSE;
3630 ps->ps_bs = bs;
3631
3632 if ((error = ps_enter(ps)) != 0) {
3633 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3634 kfree((vm_offset_t)ps, sizeof *ps);
3635 BS_UNLOCK(bs);
3636 return KERN_RESOURCE_SHORTAGE;
3637 }
3638
3639 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3640 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3641 PSL_LOCK();
3642 dp_pages_free += ps->ps_pgcount;
3643 PSL_UNLOCK();
3644
3645 BS_UNLOCK(bs);
3646
3647 bs_more_space(ps->ps_clcount);
3648
3649 DEBUG(DEBUG_BS_INTERNAL,
3650 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3651 device, offset, size, record_size,
3652 ps->ps_record_shift, ps->ps_pgnum));
3653
3654 return KERN_SUCCESS;
3655}
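
default_pager_add_file() sizes the new paging segment from the swapfile's record count: records become pages via ps_record_shift, and pages become clusters via ps_clshift. A worked example of that arithmetic follows; local_log2_sketch() is a hypothetical stand-in for local_log2(), and the sample sizes are assumptions.

#include <stdio.h>

static unsigned int local_log2_sketch(unsigned int n)   /* stand-in for local_log2() */
{
	unsigned int s = 0;
	while (n > 1) {
		n >>= 1;
		s++;
	}
	return s;
}

int main(void)
{
	unsigned int  vm_page_size = 4096;
	unsigned int  record_size  = 512;          /* bytes per swapfile record (assumed) */
	unsigned long size         = 131072;       /* swapfile length in records: 64 MB */
	unsigned int  bs_clsize    = 4;            /* pages per cluster for this backing store */

	unsigned int  record_shift = local_log2_sketch(vm_page_size / record_size);  /* 3 */
	unsigned long pgnum        = size >> record_shift;                           /* 16384 pages */
	unsigned int  clshift      = local_log2_sketch(bs_clsize);                   /* 2 */
	unsigned long clcount      = pgnum >> clshift;                               /* 4096 clusters */

	printf("records=%lu pages=%lu clusters=%lu\n", size, pgnum, clcount);
	return 0;
}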
3656
3657
3658
1c79356b
A
3659kern_return_t
3660ps_read_file(
3661 paging_segment_t ps,
3662 upl_t upl,
0b4e3aa0 3663 vm_offset_t upl_offset,
1c79356b
A
3664 vm_offset_t offset,
3665 unsigned int size,
3666 unsigned int *residualp,
3667 int flags)
3668{
3669 vm_object_offset_t f_offset;
3670 int error = 0;
3671 int result;
1c79356b
A
3672
3673
de355530 3674 clustered_reads[atop(size)]++;
1c79356b
A
3675
3676 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3677
3678 /* for transfer case we need to pass uploffset and flags */
3679 error = vnode_pagein(ps->ps_vnode,
0b4e3aa0 3680 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
1c79356b
A
3681
3682 /* The vnode_pagein semantic is somewhat at odds with the existing */
3683 /* device_read semantic. Partial reads are not experienced at this */
3684 /* level. It is up to the bit map code and cluster read code to */
3685 /* check that requested data locations are actually backed, and the */
3686 /* pagein code to either read all of the requested data or return an */
3687 /* error. */
3688
3689 if (error)
3690 result = KERN_FAILURE;
3691 else {
3692 *residualp = 0;
3693 result = KERN_SUCCESS;
3694 }
3695 return result;
1c79356b
A
3696}
3697
3698kern_return_t
3699ps_write_file(
3700 paging_segment_t ps,
3701 upl_t upl,
3702 vm_offset_t upl_offset,
3703 vm_offset_t offset,
3704 unsigned int size,
3705 int flags)
3706{
3707 vm_object_offset_t f_offset;
3708 kern_return_t result;
1c79356b
A
3709
3710 int error = 0;
3711
de355530 3712 clustered_writes[atop(size)]++;
1c79356b
A
3713 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3714
3715 if (vnode_pageout(ps->ps_vnode,
3716 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3717 result = KERN_FAILURE;
3718 else
3719 result = KERN_SUCCESS;
3720
3721 return result;
3722}
3723
3724kern_return_t
3725default_pager_triggers(MACH_PORT_FACE default_pager,
3726 int hi_wat,
3727 int lo_wat,
3728 int flags,
3729 MACH_PORT_FACE trigger_port)
3730{
0b4e3aa0
A
3731 MACH_PORT_FACE release;
3732 kern_return_t kr;
1c79356b 3733
0b4e3aa0
A
3734 PSL_LOCK();
3735 if (flags == HI_WAT_ALERT) {
3736 release = min_pages_trigger_port;
1c79356b
A
3737 min_pages_trigger_port = trigger_port;
3738 minimum_pages_remaining = hi_wat/vm_page_size;
3739 bs_low = FALSE;
0b4e3aa0
A
3740 kr = KERN_SUCCESS;
3741 } else if (flags == LO_WAT_ALERT) {
3742 release = max_pages_trigger_port;
1c79356b
A
3743 max_pages_trigger_port = trigger_port;
3744 maximum_pages_free = lo_wat/vm_page_size;
0b4e3aa0
A
3745 kr = KERN_SUCCESS;
3746 } else {
3747 release = trigger_port;
3748 kr = KERN_INVALID_ARGUMENT;
1c79356b 3749 }
0b4e3aa0
A
3750 PSL_UNLOCK();
3751
3752 if (IP_VALID(release))
3753 ipc_port_release_send(release);
3754
3755 return kr;
1c79356b 3756}
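
default_pager_triggers() stores the byte-denominated watermarks handed in by user space as page counts before arming the notification ports. The small example below shows only that conversion; the 64 MB and 16 MB sample watermarks are assumptions.

#include <stdio.h>

int main(void)
{
	unsigned int  vm_page_size = 4096;
	unsigned long hi_wat = 64UL * 1024 * 1024;     /* sample HI_WAT_ALERT threshold, in bytes */
	unsigned long lo_wat = 16UL * 1024 * 1024;     /* sample LO_WAT_ALERT threshold, in bytes */

	unsigned long minimum_pages_remaining = hi_wat / vm_page_size;   /* 16384 pages */
	unsigned long maximum_pages_free      = lo_wat / vm_page_size;   /*  4096 pages */

	printf("hi_wat = %lu pages, lo_wat = %lu pages\n",
	       minimum_pages_remaining, maximum_pages_free);
	return 0;
}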