/* apple/xnu: osfmk/default_pager/dp_backing_store.c @ 30d1b6e293dbe65b1390db1147364b7c688f79d9 */
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /*
26 * @OSF_COPYRIGHT@
27 */
28 /*
29 * Mach Operating System
30 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
31 * All Rights Reserved.
32 *
33 * Permission to use, copy, modify and distribute this software and its
34 * documentation is hereby granted, provided that both the copyright
35 * notice and this permission notice appear in all copies of the
36 * software, derivative works or modified versions, and any portions
37 * thereof, and that both notices appear in supporting documentation.
38 *
39 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
40 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
41 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
42 *
43 * Carnegie Mellon requests users of this software to return to
44 *
45 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
46 * School of Computer Science
47 * Carnegie Mellon University
48 * Pittsburgh PA 15213-3890
49 *
50 * any improvements or extensions that they make and grant Carnegie Mellon
51 * the rights to redistribute these changes.
52 */
53
54 /*
55 * Default Pager.
56 * Paging File Management.
57 */
58
59 #include <mach/memory_object_control.h>
60 #include <mach/memory_object_server.h>
61 #include "default_pager_internal.h"
62 #include <default_pager/default_pager_alerts.h>
63 #include <ipc/ipc_port.h>
64 #include <ipc/ipc_space.h>
65 #include <kern/queue.h>
66 #include <kern/counters.h>
67 #include <kern/sched_prim.h>
68 #include <vm/vm_kern.h>
69 #include <vm/vm_pageout.h>
70 /* CDY CDY */
71 #include <vm/vm_map.h>
72
73 /*
74 * ALLOC_STRIDE... the maximum number of bytes allocated from
75 * a swap file before moving on to the next swap file... if
76 * all swap files reside on a single disk, this value should
77 * be very large (this is the default assumption)... if the
78 * swap files are spread across multiple disks, then this value
79 * should be small (128 * 1024)...
80 *
81 * This should be determined dynamically in the future
82 */
83
84 #define ALLOC_STRIDE (1024 * 1024 * 1024)
85 int physical_transfer_cluster_count = 0;
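/*
 * Illustrative arithmetic (assuming 4K pages, i.e. vm_page_shift == 12, and
 * the default cluster shift of 2): ps_select_segment compares
 * physical_transfer_cluster_count against
 * ALLOC_STRIDE >> (ps_clshift + vm_page_shift) == 0x40000000 >> 14 == 65536
 * clusters, so roughly 1GB worth of 16K clusters goes to one paging segment
 * before allocation rotates to the next one.
 */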
86
87 #define VM_SUPER_CLUSTER 0x40000
88 #define VM_SUPER_PAGES 64
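/*
 * Note: 0x40000 bytes is 64 pages of 4K, so VM_SUPER_PAGES is simply
 * VM_SUPER_CLUSTER expressed in (assumed 4K) pages; ps_delete below uses
 * VM_SUPER_CLUSTER as the size of its transfer object and UPL.
 */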
89
90 /*
91 * A shift of 0 means 1 page/cluster, 1 means 2 pages/cluster,
92 * 2 means 4 pages/cluster, and so on.
93 */
94 #define VSTRUCT_DEF_CLSHIFT 2
95 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
96 int default_pager_clsize = 0;
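/*
 * Example (assuming 4K pages): with VSTRUCT_DEF_CLSHIFT == 2, a cluster is
 * 1 << 2 == 4 pages, i.e. 16K.  bs_get_global_clsize() fixes
 * default_pager_clsize to 1 << vstruct_def_clshift the first time a cluster
 * size is chosen.
 */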
97
98 /* statistics */
99 unsigned int clustered_writes[VM_SUPER_PAGES+1];
100 unsigned int clustered_reads[VM_SUPER_PAGES+1];
101
102 /*
103 * Globals used for asynchronous paging operations:
104 * vs_async_list: head of list of to-be-completed I/O ops
105 * async_num_queued: number of pages completed, but not yet
106 * processed by async thread.
107 * async_requests_out: number of pages of requests not completed.
108 */
109
110 #if 0
111 struct vs_async *vs_async_list;
112 int async_num_queued;
113 int async_requests_out;
114 #endif
115
116
117 #define VS_ASYNC_REUSE 1
118 struct vs_async *vs_async_free_list;
119
120 mutex_t default_pager_async_lock; /* Protects globals above */
121
122
123 int vs_alloc_async_failed = 0; /* statistics */
124 int vs_alloc_async_count = 0; /* statistics */
125 struct vs_async *vs_alloc_async(void); /* forward */
126 void vs_free_async(struct vs_async *vsa); /* forward */
127
128
129 #define VS_ALLOC_ASYNC() vs_alloc_async()
130 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
131
132 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
133 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
134 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \
135 ETAP_IO_DEV_PAGEH)
136 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
137 /*
138 * Paging Space Hysteresis triggers and the target notification port
139 *
140 */
141
142 unsigned int minimum_pages_remaining = 0;
143 unsigned int maximum_pages_free = 0;
144 ipc_port_t min_pages_trigger_port = NULL;
145 ipc_port_t max_pages_trigger_port = NULL;
146
147 boolean_t bs_low = FALSE;
148 int backing_store_release_trigger_disable = 0;
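/*
 * Rough behaviour of the triggers (see ps_allocate_cluster, ps_select_segment
 * and ps_deallocate_cluster below): when dp_pages_free drops below
 * minimum_pages_remaining, a HI_WAT_ALERT is sent on min_pages_trigger_port
 * and bs_low is set; when frees push dp_pages_free above maximum_pages_free,
 * a LO_WAT_ALERT is sent on max_pages_trigger_port.  Each port is consumed
 * (set to NULL) when its alert fires.
 */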
149
150
151
152 /*
153 * Object sizes are rounded up to the next power of 2,
154 * unless they are bigger than a given maximum size.
155 */
156 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
157
158 /*
159 * List of all backing stores and paging segments.
160 */
161 struct backing_store_list_head backing_store_list;
162 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
163 mutex_t paging_segments_lock;
164 int paging_segment_max = 0;
165 int paging_segment_count = 0;
166 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
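/*
 * ps_select_array holds, per backing-store priority, the index of the paging
 * segment to try next.  The -1 initializer corresponds to BS_NOPRI ("no
 * segment at this priority"); ps_select_segment parks a priority at
 * BS_FULLPRI once every segment at that priority is full, and ps_enter /
 * ps_deallocate_cluster reset such entries back to 0.
 */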
167
168
169 /*
170 * Total pages free in the system.
171 * This differs from clusters committed/available, which is a measure of the
172 * over-commitment of paging segments to backing store, an idea which is
173 * likely to be deprecated.
174 */
175 unsigned int dp_pages_free = 0;
176 unsigned int cluster_transfer_minimum = 100;
177
178 kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */
179 kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
180
181
182 default_pager_thread_t *
183 get_read_buffer()
184 {
185 int i;
186
187 DPT_LOCK(dpt_lock);
188 while(TRUE) {
189 for (i=0; i<default_pager_internal_count; i++) {
190 if(dpt_array[i]->checked_out == FALSE) {
191 dpt_array[i]->checked_out = TRUE;
192 DPT_UNLOCK(dpt_lock);
193 return dpt_array[i];
194 }
195 }
196 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
197 }
198 }
199
200 void
201 bs_initialize(void)
202 {
203 int i;
204
205 /*
206 * List of all backing store.
207 */
208 BSL_LOCK_INIT();
209 queue_init(&backing_store_list.bsl_queue);
210 PSL_LOCK_INIT();
211
212 VS_ASYNC_LOCK_INIT();
213 #if VS_ASYNC_REUSE
214 vs_async_free_list = NULL;
215 #endif /* VS_ASYNC_REUSE */
216
217 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
218 clustered_writes[i] = 0;
219 clustered_reads[i] = 0;
220 }
221
222 }
223
224 /*
225 * When things do not quite work out...
226 */
227 void bs_no_paging_space(boolean_t); /* forward */
228
229 void
230 bs_no_paging_space(
231 boolean_t out_of_memory)
232 {
233
234 if (out_of_memory)
235 dprintf(("*** OUT OF MEMORY ***\n"));
236 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
237 }
238
239 void bs_more_space(int); /* forward */
240 void bs_commit(int); /* forward */
241
242 boolean_t user_warned = FALSE;
243 unsigned int clusters_committed = 0;
244 unsigned int clusters_available = 0;
245 unsigned int clusters_committed_peak = 0;
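/*
 * Accounting sketch: bs_commit() adds a vstruct's cluster count when it is
 * created or grown (ps_vstruct_create, ps_map_extend) and subtracts it when
 * the vstruct is torn down (ps_vstruct_dealloc), while bs_more_space() adds
 * the cluster count of newly added paging segments.  The warnings below fire
 * when clusters_committed exceeds clusters_available.
 */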
246
247 void
248 bs_more_space(
249 int nclusters)
250 {
251 BSL_LOCK();
252 /*
253 * Account for new paging space.
254 */
255 clusters_available += nclusters;
256
257 if (clusters_available >= clusters_committed) {
258 if (verbose && user_warned) {
259 printf("%s%s - %d excess clusters now.\n",
260 my_name,
261 "paging space is OK now",
262 clusters_available - clusters_committed);
263 user_warned = FALSE;
264 clusters_committed_peak = 0;
265 }
266 } else {
267 if (verbose && user_warned) {
268 printf("%s%s - still short of %d clusters.\n",
269 my_name,
270 "WARNING: paging space over-committed",
271 clusters_committed - clusters_available);
272 clusters_committed_peak -= nclusters;
273 }
274 }
275 BSL_UNLOCK();
276
277 return;
278 }
279
280 void
281 bs_commit(
282 int nclusters)
283 {
284 BSL_LOCK();
285 clusters_committed += nclusters;
286 if (clusters_committed > clusters_available) {
287 if (verbose && !user_warned) {
288 user_warned = TRUE;
289 printf("%s%s - short of %d clusters.\n",
290 my_name,
291 "WARNING: paging space over-committed",
292 clusters_committed - clusters_available);
293 }
294 if (clusters_committed > clusters_committed_peak) {
295 clusters_committed_peak = clusters_committed;
296 }
297 } else {
298 if (verbose && user_warned) {
299 printf("%s%s - was short of up to %d clusters.\n",
300 my_name,
301 "paging space is OK now",
302 clusters_committed_peak - clusters_available);
303 user_warned = FALSE;
304 clusters_committed_peak = 0;
305 }
306 }
307 BSL_UNLOCK();
308
309 return;
310 }
311
312 int default_pager_info_verbose = 1;
313
314 void
315 bs_global_info(
316 vm_size_t *totalp,
317 vm_size_t *freep)
318 {
319 vm_size_t pages_total, pages_free;
320 paging_segment_t ps;
321 int i;
322
323 PSL_LOCK();
324 pages_total = pages_free = 0;
325 for (i = 0; i <= paging_segment_max; i++) {
326 ps = paging_segments[i];
327 if (ps == PAGING_SEGMENT_NULL)
328 continue;
329
330 /*
331 * no need to lock: by the time this data
332 * gets back to any remote requestor it
333 * will be obsolete anyway
334 */
335 pages_total += ps->ps_pgnum;
336 pages_free += ps->ps_clcount << ps->ps_clshift;
337 DEBUG(DEBUG_BS_INTERNAL,
338 ("segment #%d: %d total, %d free\n",
339 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
340 }
341 *totalp = pages_total;
342 *freep = pages_free;
343 if (verbose && user_warned && default_pager_info_verbose) {
344 if (clusters_available < clusters_committed) {
345 printf("%s %d clusters committed, %d available.\n",
346 my_name,
347 clusters_committed,
348 clusters_available);
349 }
350 }
351 PSL_UNLOCK();
352 }
353
354 backing_store_t backing_store_alloc(void); /* forward */
355
356 backing_store_t
357 backing_store_alloc(void)
358 {
359 backing_store_t bs;
360
361 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
362 if (bs == BACKING_STORE_NULL)
363 panic("backing_store_alloc: no memory");
364
365 BS_LOCK_INIT(bs);
366 bs->bs_port = MACH_PORT_NULL;
367 bs->bs_priority = 0;
368 bs->bs_clsize = 0;
369 bs->bs_pages_total = 0;
370 bs->bs_pages_in = 0;
371 bs->bs_pages_in_fail = 0;
372 bs->bs_pages_out = 0;
373 bs->bs_pages_out_fail = 0;
374
375 return bs;
376 }
377
378 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
379
380 /* In both the component space and external versions of this pager, */
381 /* backing_store_lookup will be called from tasks in the application space. */
382 backing_store_t
383 backing_store_lookup(
384 MACH_PORT_FACE port)
385 {
386 backing_store_t bs;
387
388 /*
389 The port is currently backed with a vs structure in the alias field.
390 We could create an ISBS alias and a port_is_bs call, but frankly
391 I see no reason for the test; the bs->bs_port == port check below
392 will work properly on junk entries.
393
394 if ((port == MACH_PORT_NULL) || port_is_vs(port))
395 */
396 if ((port == MACH_PORT_NULL))
397 return BACKING_STORE_NULL;
398
399 BSL_LOCK();
400 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
401 bs_links) {
402 BS_LOCK(bs);
403 if (bs->bs_port == port) {
404 BSL_UNLOCK();
405 /* Success, return it locked. */
406 return bs;
407 }
408 BS_UNLOCK(bs);
409 }
410 BSL_UNLOCK();
411 return BACKING_STORE_NULL;
412 }
413
414 void backing_store_add(backing_store_t); /* forward */
415
416 void
417 backing_store_add(
418 backing_store_t bs)
419 {
420 MACH_PORT_FACE port = bs->bs_port;
421 MACH_PORT_FACE pset = default_pager_default_set;
422 kern_return_t kr = KERN_SUCCESS;
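	/*
	 * Note: the port-set insertion that presumably once set "kr" is gone,
	 * so "port" and "pset" are computed but unused, kr stays KERN_SUCCESS
	 * and the panic below cannot fire.
	 */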
423
424 if (kr != KERN_SUCCESS)
425 panic("backing_store_add: add to set");
426
427 }
428
429 /*
430 * Set up default page shift, but only if not already
431 * set and argument is within range.
432 */
433 boolean_t
434 bs_set_default_clsize(unsigned int npages)
435 {
436 switch(npages){
437 case 1:
438 case 2:
439 case 4:
440 case 8:
441 if (default_pager_clsize == 0) /* if not yet set */
442 vstruct_def_clshift = local_log2(npages);
443 return(TRUE);
444 }
445 return(FALSE);
446 }
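/*
 * For example, bs_set_default_clsize(4) selects 4-page clusters and returns
 * TRUE, but actually changes vstruct_def_clshift only if default_pager_clsize
 * has not yet been fixed by bs_get_global_clsize(); any npages outside
 * 1, 2, 4 or 8 is rejected with FALSE.
 */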
447
448 int bs_get_global_clsize(int clsize); /* forward */
449
450 int
451 bs_get_global_clsize(
452 int clsize)
453 {
454 int i;
455 memory_object_default_t dmm;
456 kern_return_t kr;
457
458 /*
459 * Only allow setting of cluster size once. If called
460 * with no cluster size (default), we use the compiled-in default
461 * for the duration. The same cluster size is used for all
462 * paging segments.
463 */
464 if (default_pager_clsize == 0) {
465 /*
466 * Keep the cluster size as a bit shift because the arithmetic
467 * is quicker and it is easier to keep at a power of 2.
468 */
469 if (clsize != NO_CLSIZE) {
470 for (i = 0; (1 << i) < clsize; i++);
471 if (i > MAX_CLUSTER_SHIFT)
472 i = MAX_CLUSTER_SHIFT;
473 vstruct_def_clshift = i;
474 }
475 default_pager_clsize = (1 << vstruct_def_clshift);
476
477 /*
478 * Let the user know the new (and definitive) cluster size.
479 */
480 if (verbose)
481 printf("%scluster size = %d page%s\n",
482 my_name, default_pager_clsize,
483 (default_pager_clsize == 1) ? "" : "s");
484
485 /*
486 * Let the kernel know too, in case it hasn't used the
487 * default value provided in main() yet.
488 */
489 dmm = default_pager_object;
490 clsize = default_pager_clsize * vm_page_size; /* in bytes */
491 kr = host_default_memory_manager(host_priv_self(),
492 &dmm,
493 clsize);
494 memory_object_default_deallocate(dmm);
495
496 if (kr != KERN_SUCCESS) {
497 panic("bs_get_global_cl_size:host_default_memory_manager");
498 }
499 if (dmm != default_pager_object) {
500 panic("bs_get_global_cl_size:there is another default pager");
501 }
502 }
503 ASSERT(default_pager_clsize > 0 &&
504 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
505
506 return default_pager_clsize;
507 }
508
509 kern_return_t
510 default_pager_backing_store_create(
511 memory_object_default_t pager,
512 int priority,
513 int clsize, /* in bytes */
514 MACH_PORT_FACE *backing_store)
515 {
516 backing_store_t bs;
517 MACH_PORT_FACE port;
518 kern_return_t kr;
519 struct vstruct_alias *alias_struct;
520
521 if (pager != default_pager_object)
522 return KERN_INVALID_ARGUMENT;
523
524 bs = backing_store_alloc();
525 port = ipc_port_alloc_kernel();
526 ipc_port_make_send(port);
527 assert (port != IP_NULL);
528
529 DEBUG(DEBUG_BS_EXTERNAL,
530 ("priority=%d clsize=%d bs_port=0x%x\n",
531 priority, clsize, (int) backing_store));
532
533 alias_struct = (struct vstruct_alias *)
534 kalloc(sizeof (struct vstruct_alias));
535 if(alias_struct != NULL) {
536 alias_struct->vs = (struct vstruct *)bs;
537 alias_struct->name = ISVS;
538 port->alias = (int) alias_struct;
539 }
540 else {
541 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
542 kfree((vm_offset_t)bs, sizeof (struct backing_store));
543 return KERN_RESOURCE_SHORTAGE;
544 }
545
546 bs->bs_port = port;
547 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
548 priority = BS_MAXPRI;
549 else if (priority == BS_NOPRI)
550 priority = BS_MAXPRI;
551 else
552 priority = BS_MINPRI;
553 bs->bs_priority = priority;
554
555 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
556
557 BSL_LOCK();
558 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
559 bs_links);
560 BSL_UNLOCK();
561
562 backing_store_add(bs);
563
564 *backing_store = port;
565 return KERN_SUCCESS;
566 }
567
568 kern_return_t
569 default_pager_backing_store_info(
570 MACH_PORT_FACE backing_store,
571 backing_store_flavor_t flavour,
572 backing_store_info_t info,
573 mach_msg_type_number_t *size)
574 {
575 backing_store_t bs;
576 backing_store_basic_info_t basic;
577 int i;
578 paging_segment_t ps;
579
580 if (flavour != BACKING_STORE_BASIC_INFO ||
581 *size < BACKING_STORE_BASIC_INFO_COUNT)
582 return KERN_INVALID_ARGUMENT;
583
584 basic = (backing_store_basic_info_t)info;
585 *size = BACKING_STORE_BASIC_INFO_COUNT;
586
587 VSTATS_LOCK(&global_stats.gs_lock);
588 basic->pageout_calls = global_stats.gs_pageout_calls;
589 basic->pagein_calls = global_stats.gs_pagein_calls;
590 basic->pages_in = global_stats.gs_pages_in;
591 basic->pages_out = global_stats.gs_pages_out;
592 basic->pages_unavail = global_stats.gs_pages_unavail;
593 basic->pages_init = global_stats.gs_pages_init;
594 basic->pages_init_writes= global_stats.gs_pages_init_writes;
595 VSTATS_UNLOCK(&global_stats.gs_lock);
596
597 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
598 return KERN_INVALID_ARGUMENT;
599
600 basic->bs_pages_total = bs->bs_pages_total;
601 PSL_LOCK();
602 bs->bs_pages_free = 0;
603 for (i = 0; i <= paging_segment_max; i++) {
604 ps = paging_segments[i];
605 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
606 PS_LOCK(ps);
607 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
608 PS_UNLOCK(ps);
609 }
610 }
611 PSL_UNLOCK();
612 basic->bs_pages_free = bs->bs_pages_free;
613 basic->bs_pages_in = bs->bs_pages_in;
614 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
615 basic->bs_pages_out = bs->bs_pages_out;
616 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
617
618 basic->bs_priority = bs->bs_priority;
619 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
620
621 BS_UNLOCK(bs);
622
623 return KERN_SUCCESS;
624 }
625
626 int ps_delete(paging_segment_t); /* forward */
627
628 int
629 ps_delete(
630 paging_segment_t ps)
631 {
632 vstruct_t vs;
633 kern_return_t error = KERN_SUCCESS;
634 int vs_count;
635
636 VSL_LOCK(); /* get the lock on the list of vs's */
637
638 /* The lock relationship and sequence are fairly complicated; */
639 /* this code looks at a live list, locking and unlocking the list */
640 /* as it traverses it. It depends on the locking behavior of */
641 /* default_pager_no_senders. no_senders always locks the vstruct */
642 /* targeted for removal before locking the vstruct list. However */
643 /* it will remove that member of the list without locking its */
644 /* neighbors. We can be sure when we hold a lock on a vstruct */
645 /* it cannot be removed from the list but we must hold the list */
646 /* lock to be sure that its pointers to its neighbors are valid. */
647 /* Also, we can hold off destruction of a vstruct when the list */
648 /* lock and the vs locks are not being held by bumping the */
649 /* vs_async_pending count. */
650
651
652 while(backing_store_release_trigger_disable != 0) {
653 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
654 }
655
656 /* we will choose instead to hold a send right */
657 vs_count = vstruct_list.vsl_count;
658 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
659 if(vs == (vstruct_t)&vstruct_list) {
660 VSL_UNLOCK();
661 return KERN_SUCCESS;
662 }
663 VS_LOCK(vs);
664 vs_async_wait(vs); /* wait for any pending async writes */
665 if ((vs_count != 0) && (vs != NULL))
666 vs->vs_async_pending += 1; /* hold parties calling */
667 /* vs_async_wait */
668 VS_UNLOCK(vs);
669 VSL_UNLOCK();
670 while((vs_count != 0) && (vs != NULL)) {
671 /* We take the count of AMO's before beginning the */
672 /* transfer of the target segment. */
673 /* We are guaranteed that the target segment cannot get */
674 /* more users. We also know that queue entries are */
675 /* made at the back of the list. If some of the entries */
676 /* we would check disappear while we are traversing the */
677 /* list then we will either check new entries which */
678 /* do not have any backing store in the target segment */
679 /* or re-check old entries. This might not be optimal */
680 /* but it will always be correct. The alternative is to */
681 /* take a snapshot of the list. */
682 vstruct_t next_vs;
683
684 if(dp_pages_free < cluster_transfer_minimum)
685 error = KERN_FAILURE;
686 else {
687 vm_object_t transfer_object;
688 int count;
689 upl_t upl;
690
691 transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
692 count = 0;
693 error = vm_object_upl_request(transfer_object,
694 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
695 &upl, NULL, &count,
696 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
697 | UPL_SET_INTERNAL);
698 if(error == KERN_SUCCESS) {
699 error = ps_vstruct_transfer_from_segment(
700 vs, ps, upl);
701 upl_commit(upl, NULL);
702 upl_deallocate(upl);
703 } else {
704 error = KERN_FAILURE;
705 }
706 vm_object_deallocate(transfer_object);
707 }
708 if(error) {
709 VS_LOCK(vs);
710 vs->vs_async_pending -= 1; /* release vs_async_wait */
711 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
712 vs->vs_waiting_async = FALSE;
713 VS_UNLOCK(vs);
714 thread_wakeup(&vs->vs_async_pending);
715 } else {
716 VS_UNLOCK(vs);
717 }
718 return KERN_FAILURE;
719 }
720
721 VSL_LOCK();
722
723 while(backing_store_release_trigger_disable != 0) {
724 VSL_SLEEP(&backing_store_release_trigger_disable,
725 THREAD_UNINT);
726 }
727
728 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
729 if((next_vs != (vstruct_t)&vstruct_list) &&
730 (vs != next_vs) && (vs_count != 1)) {
731 VS_LOCK(next_vs);
732 vs_async_wait(next_vs); /* wait for any */
733 /* pending async writes */
734 next_vs->vs_async_pending += 1; /* hold parties */
735 /* calling vs_async_wait */
736 VS_UNLOCK(next_vs);
737 }
738 VSL_UNLOCK();
739 VS_LOCK(vs);
740 vs->vs_async_pending -= 1;
741 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
742 vs->vs_waiting_async = FALSE;
743 VS_UNLOCK(vs);
744 thread_wakeup(&vs->vs_async_pending);
745 } else {
746 VS_UNLOCK(vs);
747 }
748 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
749 vs = NULL;
750 else
751 vs = next_vs;
752 vs_count--;
753 }
754 return KERN_SUCCESS;
755 }
756
757
758 kern_return_t
759 default_pager_backing_store_delete(
760 MACH_PORT_FACE backing_store)
761 {
762 backing_store_t bs;
763 int i;
764 paging_segment_t ps;
765 int error;
766 int interim_pages_removed = 0;
767 kern_return_t kr;
768
769 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
770 return KERN_INVALID_ARGUMENT;
771
772 #if 0
773 /* not implemented */
774 BS_UNLOCK(bs);
775 return KERN_FAILURE;
776 #endif
777
778 restart:
779 PSL_LOCK();
780 error = KERN_SUCCESS;
781 for (i = 0; i <= paging_segment_max; i++) {
782 ps = paging_segments[i];
783 if (ps != PAGING_SEGMENT_NULL &&
784 ps->ps_bs == bs &&
785 ! ps->ps_going_away) {
786 PS_LOCK(ps);
787 /* disable access to this segment */
788 ps->ps_going_away = TRUE;
789 PS_UNLOCK(ps);
790 /*
791 * The "ps" segment is "off-line" now;
792 * we can try to delete it...
793 */
794 if(dp_pages_free < (cluster_transfer_minimum
795 + ps->ps_pgcount)) {
796 error = KERN_FAILURE;
797 PSL_UNLOCK();
798 }
799 else {
800 /* remove all pages associated with the */
801 /* segment from the list of free pages */
802 /* when transfer is through, all target */
803 /* segment pages will appear to be free */
804
805 dp_pages_free -= ps->ps_pgcount;
806 interim_pages_removed += ps->ps_pgcount;
807 PSL_UNLOCK();
808 error = ps_delete(ps);
809 }
810 if (error != KERN_SUCCESS) {
811 /*
812 * We couldn't delete the segment,
813 * probably because there's not enough
814 * virtual memory left.
815 * Re-enable all the segments.
816 */
817 PSL_LOCK();
818 break;
819 }
820 goto restart;
821 }
822 }
823
824 if (error != KERN_SUCCESS) {
825 for (i = 0; i <= paging_segment_max; i++) {
826 ps = paging_segments[i];
827 if (ps != PAGING_SEGMENT_NULL &&
828 ps->ps_bs == bs &&
829 ps->ps_going_away) {
830 PS_LOCK(ps);
831 /* re-enable access to this segment */
832 ps->ps_going_away = FALSE;
833 PS_UNLOCK(ps);
834 }
835 }
836 dp_pages_free += interim_pages_removed;
837 PSL_UNLOCK();
838 BS_UNLOCK(bs);
839 return error;
840 }
841
842 for (i = 0; i <= paging_segment_max; i++) {
843 ps = paging_segments[i];
844 if (ps != PAGING_SEGMENT_NULL &&
845 ps->ps_bs == bs) {
846 if(ps->ps_going_away) {
847 paging_segments[i] = PAGING_SEGMENT_NULL;
848 paging_segment_count--;
849 PS_LOCK(ps);
850 kfree((vm_offset_t)ps->ps_bmap,
851 RMAPSIZE(ps->ps_ncls));
852 kfree((vm_offset_t)ps, sizeof *ps);
853 }
854 }
855 }
856
857 /* Scan the entire ps array separately to make certain we find the */
858 /* proper paging_segment_max */
859 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
860 if(paging_segments[i] != PAGING_SEGMENT_NULL)
861 paging_segment_max = i;
862 }
863
864 PSL_UNLOCK();
865
866 /*
867 * All the segments have been deleted.
868 * We can remove the backing store.
869 */
870
871 /*
872 * Disable lookups of this backing store.
873 */
874 if((void *)bs->bs_port->alias != NULL)
875 kfree((vm_offset_t) bs->bs_port->alias,
876 sizeof (struct vstruct_alias));
877 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
878 bs->bs_port = MACH_PORT_NULL;
879 BS_UNLOCK(bs);
880
881 /*
882 * Remove backing store from backing_store list.
883 */
884 BSL_LOCK();
885 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
886 bs_links);
887 BSL_UNLOCK();
888
889 /*
890 * Free the backing store structure.
891 */
892 kfree((vm_offset_t)bs, sizeof *bs);
893
894 return KERN_SUCCESS;
895 }
896
897 int ps_enter(paging_segment_t); /* forward */
898
899 int
900 ps_enter(
901 paging_segment_t ps)
902 {
903 int i;
904
905 PSL_LOCK();
906
907 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
908 if (paging_segments[i] == PAGING_SEGMENT_NULL)
909 break;
910 }
911
912 if (i < MAX_NUM_PAGING_SEGMENTS) {
913 paging_segments[i] = ps;
914 if (i > paging_segment_max)
915 paging_segment_max = i;
916 paging_segment_count++;
917 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
918 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
919 ps_select_array[ps->ps_bs->bs_priority] = 0;
920 i = 0;
921 } else {
922 PSL_UNLOCK();
923 return KERN_RESOURCE_SHORTAGE;
924 }
925
926 PSL_UNLOCK();
927 return i;
928 }
929
930 #ifdef DEVICE_PAGING
931 kern_return_t
932 default_pager_add_segment(
933 MACH_PORT_FACE backing_store,
934 MACH_PORT_FACE device,
935 recnum_t offset,
936 recnum_t count,
937 int record_size)
938 {
939 backing_store_t bs;
940 paging_segment_t ps;
941 int i;
942 int error;
943
944 if ((bs = backing_store_lookup(backing_store))
945 == BACKING_STORE_NULL)
946 return KERN_INVALID_ARGUMENT;
947
948 PSL_LOCK();
949 for (i = 0; i <= paging_segment_max; i++) {
950 ps = paging_segments[i];
951 if (ps == PAGING_SEGMENT_NULL)
952 continue;
953
954 /*
955 * Check for overlap on same device.
956 */
957 if (!(ps->ps_device != device
958 || offset >= ps->ps_offset + ps->ps_recnum
959 || offset + count <= ps->ps_offset)) {
960 PSL_UNLOCK();
961 BS_UNLOCK(bs);
962 return KERN_INVALID_ARGUMENT;
963 }
964 }
965 PSL_UNLOCK();
966
967 /*
968 * Set up the paging segment
969 */
970 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
971 if (ps == PAGING_SEGMENT_NULL) {
972 BS_UNLOCK(bs);
973 return KERN_RESOURCE_SHORTAGE;
974 }
975
976 ps->ps_segtype = PS_PARTITION;
977 ps->ps_device = device;
978 ps->ps_offset = offset;
979 ps->ps_record_shift = local_log2(vm_page_size / record_size);
980 ps->ps_recnum = count;
981 ps->ps_pgnum = count >> ps->ps_record_shift;
982
983 ps->ps_pgcount = ps->ps_pgnum;
984 ps->ps_clshift = local_log2(bs->bs_clsize);
985 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
986 ps->ps_hint = 0;
987
988 PS_LOCK_INIT(ps);
989 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
990 if (!ps->ps_bmap) {
991 kfree((vm_offset_t)ps, sizeof *ps);
992 BS_UNLOCK(bs);
993 return KERN_RESOURCE_SHORTAGE;
994 }
995 for (i = 0; i < ps->ps_ncls; i++) {
996 clrbit(ps->ps_bmap, i);
997 }
998
999 ps->ps_going_away = FALSE;
1000 ps->ps_bs = bs;
1001
1002 if ((error = ps_enter(ps)) != 0) {
1003 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1004 kfree((vm_offset_t)ps, sizeof *ps);
1005 BS_UNLOCK(bs);
1006 return KERN_RESOURCE_SHORTAGE;
1007 }
1008
1009 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1010 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1011 BS_UNLOCK(bs);
1012
1013 PSL_LOCK();
1014 dp_pages_free += ps->ps_pgcount;
1015 PSL_UNLOCK();
1016
1017 bs_more_space(ps->ps_clcount);
1018
1019 DEBUG(DEBUG_BS_INTERNAL,
1020 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1021 device, offset, count, record_size,
1022 ps->ps_record_shift, ps->ps_pgnum));
1023
1024 return KERN_SUCCESS;
1025 }
1026
1027 boolean_t
1028 bs_add_device(
1029 char *dev_name,
1030 MACH_PORT_FACE master)
1031 {
1032 security_token_t null_security_token = {
1033 { 0, 0 }
1034 };
1035 MACH_PORT_FACE device;
1036 int info[DEV_GET_SIZE_COUNT];
1037 mach_msg_type_number_t info_count;
1038 MACH_PORT_FACE bs = MACH_PORT_NULL;
1039 unsigned int rec_size;
1040 recnum_t count;
1041 int clsize;
1042 MACH_PORT_FACE reply_port;
1043
1044 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1045 null_security_token, dev_name, &device))
1046 return FALSE;
1047
1048 info_count = DEV_GET_SIZE_COUNT;
1049 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1050 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1051 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1052 clsize = bs_get_global_clsize(0);
1053 if (!default_pager_backing_store_create(
1054 default_pager_object,
1055 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1056 (clsize * vm_page_size),
1057 &bs)) {
1058 if (!default_pager_add_segment(bs, device,
1059 0, count, rec_size)) {
1060 return TRUE;
1061 }
1062 ipc_port_release_receive(bs);
1063 }
1064 }
1065
1066 ipc_port_release_send(device);
1067 return FALSE;
1068 }
1069 #endif /* DEVICE_PAGING */
1070
1071 #if VS_ASYNC_REUSE
1072
1073 struct vs_async *
1074 vs_alloc_async(void)
1075 {
1076 struct vs_async *vsa;
1077 MACH_PORT_FACE reply_port;
1078 kern_return_t kr;
1079
1080 VS_ASYNC_LOCK();
1081 if (vs_async_free_list == NULL) {
1082 VS_ASYNC_UNLOCK();
1083 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1084 if (vsa != NULL) {
1085 /*
1086 * Try allocating a reply port named after the
1087 * address of the vs_async structure.
1088 */
1089 struct vstruct_alias *alias_struct;
1090
1091 reply_port = ipc_port_alloc_kernel();
1092 alias_struct = (struct vstruct_alias *)
1093 kalloc(sizeof (struct vstruct_alias));
1094 if(alias_struct != NULL) {
1095 alias_struct->vs = (struct vstruct *)vsa;
1096 alias_struct->name = ISVS;
1097 reply_port->alias = (int) alias_struct;
1098 vsa->reply_port = reply_port;
1099 vs_alloc_async_count++;
1100 }
1101 else {
1102 vs_alloc_async_failed++;
1103 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1104 (reply_port));
1105 kfree((vm_offset_t)vsa,
1106 sizeof (struct vs_async));
1107 vsa = NULL;
1108 }
1109 }
1110 } else {
1111 vsa = vs_async_free_list;
1112 vs_async_free_list = vs_async_free_list->vsa_next;
1113 VS_ASYNC_UNLOCK();
1114 }
1115
1116 return vsa;
1117 }
1118
1119 void
1120 vs_free_async(
1121 struct vs_async *vsa)
1122 {
1123 VS_ASYNC_LOCK();
1124 vsa->vsa_next = vs_async_free_list;
1125 vs_async_free_list = vsa;
1126 VS_ASYNC_UNLOCK();
1127 }
1128
1129 #else /* VS_ASYNC_REUSE */
1130
1131 struct vs_async *
1132 vs_alloc_async(void)
1133 {
1134 struct vs_async *vsa;
1135 MACH_PORT_FACE reply_port;
1136 kern_return_t kr;
	struct vstruct_alias *alias_struct;
1137
1138 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1139 if (vsa != NULL) {
1140 /*
1141 * Try allocating a reply port named after the
1142 * address of the vs_async structure.
1143 */
1144 reply_port = ipc_port_alloc_kernel();
1145 alias_struct = (struct vstruct_alias *)
1146 kalloc(sizeof (struct vstruct_alias));
1147 if(alias_struct != NULL) {
1148 alias_struct->vs = (struct vstruct *)vsa;
1149 alias_struct->name = ISVS;
1150 reply_port->alias = (int) alias_struct;
1151 vsa->reply_port = reply_port;
1152 vs_alloc_async_count++;
1153 }
1154 else {
1155 vs_alloc_async_failed++;
1156 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1157 (reply_port));
1158 kfree((vm_offset_t) vsa,
1159 sizeof (struct vs_async));
1160 vsa = NULL;
1161 }
1162 }
1163
1164 return vsa;
1165 }
1166
1167 void
1168 vs_free_async(
1169 struct vs_async *vsa)
1170 {
1171 MACH_PORT_FACE reply_port;
1172 kern_return_t kr;
1173
1174 reply_port = vsa->reply_port;
1175 kfree((vm_offset_t) reply_port->alias, sizeof (struct vstruct_alias));
1176 kfree((vm_offset_t) vsa, sizeof (struct vs_async));
1177 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1178 #if 0
1179 VS_ASYNC_LOCK();
1180 vs_alloc_async_count--;
1181 VS_ASYNC_UNLOCK();
1182 #endif
1183 }
1184
1185 #endif /* VS_ASYNC_REUSE */
1186
1187 zone_t vstruct_zone;
1188
1189 vstruct_t
1190 ps_vstruct_create(
1191 vm_size_t size)
1192 {
1193 vstruct_t vs;
1194 int i;
1195
1196 vs = (vstruct_t) zalloc(vstruct_zone);
1197 if (vs == VSTRUCT_NULL) {
1198 return VSTRUCT_NULL;
1199 }
1200
1201 VS_LOCK_INIT(vs);
1202
1203 /*
1204 * The following fields will be provided later.
1205 */
1206 vs->vs_mem_obj = NULL;
1207 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1208 vs->vs_references = 1;
1209 vs->vs_seqno = 0;
1210
1211 #ifdef MACH_KERNEL
1212 vs->vs_waiting_seqno = FALSE;
1213 vs->vs_waiting_read = FALSE;
1214 vs->vs_waiting_write = FALSE;
1215 vs->vs_waiting_async = FALSE;
1216 #else
1217 mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
1218 mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
1219 mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
1220 mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
1221 mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
1222 #endif
1223
1224 vs->vs_readers = 0;
1225 vs->vs_writers = 0;
1226
1227 vs->vs_errors = 0;
1228
1229 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1230 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1231 vs->vs_async_pending = 0;
1232
1233 /*
1234 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1235 * bytes, depending on the size of the memory object.
1236 */
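	/*
	 * (INDIRECT_CLMAP(), defined alongside CLMAP_SIZE in
	 * default_pager_internal.h, decides whether the object is large
	 * enough that a two-level map of vs_map blocks is needed instead of
	 * a single direct array.)
	 */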
1237 if (INDIRECT_CLMAP(vs->vs_size)) {
1238 vs->vs_imap = (struct vs_map **)
1239 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1240 vs->vs_indirect = TRUE;
1241 } else {
1242 vs->vs_dmap = (struct vs_map *)
1243 kalloc(CLMAP_SIZE(vs->vs_size));
1244 vs->vs_indirect = FALSE;
1245 }
1246 vs->vs_xfer_pending = FALSE;
1247 DEBUG(DEBUG_VS_INTERNAL,
1248 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1249
1250 /*
1251 * Check to see that we got the space.
1252 */
1253 if (!vs->vs_dmap) {
1254 kfree((vm_offset_t)vs, sizeof *vs);
1255 return VSTRUCT_NULL;
1256 }
1257
1258 /*
1259 * Zero the indirect pointers, or clear the direct pointers.
1260 */
1261 if (vs->vs_indirect)
1262 memset(vs->vs_imap, 0,
1263 INDIRECT_CLMAP_SIZE(vs->vs_size));
1264 else
1265 for (i = 0; i < vs->vs_size; i++)
1266 VSM_CLR(vs->vs_dmap[i]);
1267
1268 VS_MAP_LOCK_INIT(vs);
1269
1270 bs_commit(vs->vs_size);
1271
1272 return vs;
1273 }
1274
1275 paging_segment_t ps_select_segment(int, int *); /* forward */
1276
1277 paging_segment_t
1278 ps_select_segment(
1279 int shift,
1280 int *psindex)
1281 {
1282 paging_segment_t ps;
1283 int i;
1284 int j;
1285
1286 /*
1287 * Optimize case where there's only one segment.
1288 * paging_segment_max will index the one and only segment.
1289 */
1290
1291 PSL_LOCK();
1292 if (paging_segment_count == 1) {
1293 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1294 ipc_port_t trigger = IP_NULL;
1295
1296 ps = paging_segments[paging_segment_max];
1297 *psindex = paging_segment_max;
1298 PS_LOCK(ps);
1299 if (ps->ps_going_away) {
1300 /* this segment is being turned off */
1301 lps = PAGING_SEGMENT_NULL;
1302 } else {
1303 ASSERT(ps->ps_clshift >= shift);
1304 if (ps->ps_clcount) {
1305 ps->ps_clcount--;
1306 dp_pages_free -= 1 << ps->ps_clshift;
1307 if(min_pages_trigger_port &&
1308 (dp_pages_free < minimum_pages_remaining)) {
1309 trigger = min_pages_trigger_port;
1310 min_pages_trigger_port = NULL;
1311 bs_low = TRUE;
1312 }
1313 lps = ps;
1314 } else
1315 lps = PAGING_SEGMENT_NULL;
1316 }
1317 PS_UNLOCK(ps);
1318 PSL_UNLOCK();
1319
1320 if (trigger != IP_NULL) {
1321 default_pager_space_alert(trigger, HI_WAT_ALERT);
1322 ipc_port_release_send(trigger);
1323 }
1324 return lps;
1325 }
1326
1327 if (paging_segment_count == 0) {
1328 PSL_UNLOCK();
1329 return PAGING_SEGMENT_NULL;
1330 }
1331
1332 for (i = BS_MAXPRI;
1333 i >= BS_MINPRI; i--) {
1334 int start_index;
1335
1336 if ((ps_select_array[i] == BS_NOPRI) ||
1337 (ps_select_array[i] == BS_FULLPRI))
1338 continue;
1339 start_index = ps_select_array[i];
1340
1341 if(!(paging_segments[start_index])) {
1342 j = start_index+1;
1343 physical_transfer_cluster_count = 0;
1344 }
1345 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1346 (((paging_segments[start_index])->ps_clshift)
1347 + vm_page_shift))) {
1348 physical_transfer_cluster_count = 0;
1349 j = start_index + 1;
1350 } else {
1351 physical_transfer_cluster_count+=1;
1352 j = start_index;
1353 if(start_index == 0)
1354 start_index = paging_segment_max;
1355 else
1356 start_index = start_index - 1;
1357 }
1358
1359 while (1) {
1360 if (j > paging_segment_max)
1361 j = 0;
1362 if ((ps = paging_segments[j]) &&
1363 (ps->ps_bs->bs_priority == i)) {
1364 /*
1365 * Force the ps cluster size to be
1366 * >= that of the vstruct.
1367 */
1368 PS_LOCK(ps);
1369 if (ps->ps_going_away) {
1370 /* this segment is being turned off */
1371 } else if ((ps->ps_clcount) &&
1372 (ps->ps_clshift >= shift)) {
1373 ipc_port_t trigger = IP_NULL;
1374
1375 ps->ps_clcount--;
1376 dp_pages_free -= 1 << ps->ps_clshift;
1377 if(min_pages_trigger_port &&
1378 (dp_pages_free <
1379 minimum_pages_remaining)) {
1380 trigger = min_pages_trigger_port;
1381 min_pages_trigger_port = NULL;
1382 }
1383 PS_UNLOCK(ps);
1384 /*
1385 * found one, quit looking.
1386 */
1387 ps_select_array[i] = j;
1388 PSL_UNLOCK();
1389
1390 if (trigger != IP_NULL) {
1391 default_pager_space_alert(
1392 trigger,
1393 HI_WAT_ALERT);
1394 ipc_port_release_send(trigger);
1395 }
1396 *psindex = j;
1397 return ps;
1398 }
1399 PS_UNLOCK(ps);
1400 }
1401 if (j == start_index) {
1402 /*
1403 * none at this priority -- mark it full
1404 */
1405 ps_select_array[i] = BS_FULLPRI;
1406 break;
1407 }
1408 j++;
1409 }
1410 }
1411 PSL_UNLOCK();
1412 return PAGING_SEGMENT_NULL;
1413 }
1414
1415 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1416
1417 vm_offset_t
1418 ps_allocate_cluster(
1419 vstruct_t vs,
1420 int *psindex,
1421 paging_segment_t use_ps)
1422 {
1423 int byte_num;
1424 int bit_num = 0;
1425 paging_segment_t ps;
1426 vm_offset_t cluster;
1427 ipc_port_t trigger = IP_NULL;
1428
1429 /*
1430 * Find best paging segment.
1431 * ps_select_segment will decrement cluster count on ps.
1432 * Must pass cluster shift to find the most appropriate segment.
1433 */
1434 /* NOTE: The addition of paging segment delete capability threatened
1435 * to seriously complicate the treatment of paging segments in this
1436 * module and the ones that call it (notably ps_clmap), because of the
1437 * difficulty in assuring that the paging segment would continue to
1438 * exist between being unlocked and locked. This was
1439 * avoided because all calls to this module are based in either
1440 * dp_memory_object calls which rely on the vs lock, or by
1441 * the transfer function which is part of the segment delete path.
1442 * The transfer function which is part of paging segment delete is
1443 * protected from multiple callers by the backing store lock.
1444 * The paging segment delete function treats mappings to a paging
1445 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1446 * while data is transferred to the remaining segments. This is in
1447 * line with the view that incomplete or in-transition mappings between
1448 * data, a vstruct, and backing store are protected by the vs lock.
1449 * This and the ordering of the paging segment "going_away" bit setting
1450 * protects us.
1451 */
1452 if (use_ps != PAGING_SEGMENT_NULL) {
1453 ps = use_ps;
1454 PSL_LOCK();
1455 PS_LOCK(ps);
1456 ps->ps_clcount--;
1457 dp_pages_free -= 1 << ps->ps_clshift;
1458 if(min_pages_trigger_port &&
1459 (dp_pages_free < minimum_pages_remaining)) {
1460 trigger = min_pages_trigger_port;
1461 min_pages_trigger_port = NULL;
1462 }
1463 PSL_UNLOCK();
1464 PS_UNLOCK(ps);
1465 if (trigger != IP_NULL) {
1466 default_pager_space_alert(trigger, HI_WAT_ALERT);
1467 ipc_port_release_send(trigger);
1468 }
1469
1470 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1471 PAGING_SEGMENT_NULL) {
1472 #if 0
1473 bs_no_paging_space(TRUE);
1474 #endif
1475 #if 0
1476 if (verbose)
1477 #endif
1478 dprintf(("no space in available paging segments; "
1479 "swapon suggested\n"));
1480 /* the free-page count may have drifted; reset it to zero */
1481 PSL_LOCK();
1482 dp_pages_free = 0;
1483 if(min_pages_trigger_port) {
1484 trigger = min_pages_trigger_port;
1485 min_pages_trigger_port = NULL;
1486 bs_low = TRUE;
1487 }
1488 PSL_UNLOCK();
1489 if (trigger != IP_NULL) {
1490 default_pager_space_alert(trigger, HI_WAT_ALERT);
1491 ipc_port_release_send(trigger);
1492 }
1493 return (vm_offset_t) -1;
1494 }
1495 ASSERT(ps->ps_clcount != 0);
1496
1497 /*
1498 * Look for an available cluster. At the end of the loop,
1499 * byte_num is the byte offset and bit_num is the bit offset of the
1500 * first zero bit in the paging segment bitmap.
1501 */
1502 PS_LOCK(ps);
1503 byte_num = ps->ps_hint;
1504 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1505 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1506 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1507 if (isclr((ps->ps_bmap + byte_num), bit_num))
1508 break;
1509 }
1510 ASSERT(bit_num != NBBY);
1511 break;
1512 }
1513 }
1514 ps->ps_hint = byte_num;
1515 cluster = (byte_num*NBBY) + bit_num;
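	/*
	 * e.g. with NBBY == 8, a first clear bit found at byte 3, bit 5
	 * yields cluster 3*8 + 5 == 29 (values purely illustrative).
	 */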
1516
1517 /* Space was reserved, so this must be true */
1518 ASSERT(cluster < ps->ps_ncls);
1519
1520 setbit(ps->ps_bmap, cluster);
1521 PS_UNLOCK(ps);
1522
1523 return cluster;
1524 }
1525
1526 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1527
1528 void
1529 ps_deallocate_cluster(
1530 paging_segment_t ps,
1531 vm_offset_t cluster)
1532 {
1533 ipc_port_t trigger = IP_NULL;
1534
1535 if (cluster >= (vm_offset_t) ps->ps_ncls)
1536 panic("ps_deallocate_cluster: Invalid cluster number");
1537
1538 /*
1539 * Lock the paging segment, clear the cluster's bit in the bitmap and
1540 * increment the number of free clusters.
1541 */
1542 PSL_LOCK();
1543 PS_LOCK(ps);
1544 clrbit(ps->ps_bmap, cluster);
1545 ++ps->ps_clcount;
1546 dp_pages_free += 1 << ps->ps_clshift;
1547 if(max_pages_trigger_port
1548 && (backing_store_release_trigger_disable == 0)
1549 && (dp_pages_free > maximum_pages_free)) {
1550 trigger = max_pages_trigger_port;
1551 max_pages_trigger_port = NULL;
1552 }
1553 PSL_UNLOCK();
1554
1555 /*
1556 * Move the hint down to the freed cluster if it is
1557 * less than the current hint.
1558 */
1559 if ((cluster/NBBY) < ps->ps_hint) {
1560 ps->ps_hint = (cluster/NBBY);
1561 }
1562
1563 PS_UNLOCK(ps);
1564
1565 /*
1566 * If we're freeing space on a full priority, reset the array.
1567 */
1568 PSL_LOCK();
1569 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1570 ps_select_array[ps->ps_bs->bs_priority] = 0;
1571 PSL_UNLOCK();
1572
1573 if (trigger != IP_NULL) {
1574 VSL_LOCK();
1575 if(backing_store_release_trigger_disable != 0) {
1576 assert_wait((event_t)
1577 &backing_store_release_trigger_disable,
1578 THREAD_UNINT);
1579 VSL_UNLOCK();
1580 thread_block(THREAD_CONTINUE_NULL);
1581 } else {
1582 VSL_UNLOCK();
1583 }
1584 default_pager_space_alert(trigger, LO_WAT_ALERT);
1585 ipc_port_release_send(trigger);
1586 }
1587
1588 return;
1589 }
1590
1591 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1592
1593 void
1594 ps_dealloc_vsmap(
1595 struct vs_map *vsmap,
1596 vm_size_t size)
1597 {
1598 int i;
1599 for (i = 0; i < size; i++)
1600 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1601 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1602 VSM_CLOFF(vsmap[i]));
1603 }
1604
1605 void
1606 ps_vstruct_dealloc(
1607 vstruct_t vs)
1608 {
1609 int i;
1610 spl_t s;
1611
1612 VS_MAP_LOCK(vs);
1613
1614 /*
1615 * If this is an indirect structure, then we walk through the valid
1616 * (non-zero) indirect pointers and deallocate the clusters
1617 * associated with each used map entry (via ps_dealloc_vsmap).
1618 * When all of the clusters in an indirect block have been
1619 * freed, we deallocate the block. When all of the indirect
1620 * blocks have been deallocated we deallocate the memory
1621 * holding the indirect pointers.
1622 */
1623 if (vs->vs_indirect) {
1624 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1625 if (vs->vs_imap[i] != NULL) {
1626 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1627 kfree((vm_offset_t)vs->vs_imap[i],
1628 CLMAP_THRESHOLD);
1629 }
1630 }
1631 kfree((vm_offset_t)vs->vs_imap,
1632 INDIRECT_CLMAP_SIZE(vs->vs_size));
1633 } else {
1634 /*
1635 * Direct map. Free used clusters, then memory.
1636 */
1637 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1638 kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1639 }
1640 VS_MAP_UNLOCK(vs);
1641
1642 bs_commit(- vs->vs_size);
1643
1644 zfree(vstruct_zone, (vm_offset_t)vs);
1645 }
1646
1647 int ps_map_extend(vstruct_t, int); /* forward */
1648
1649 int ps_map_extend(
1650 vstruct_t vs,
1651 int new_size)
1652 {
1653 struct vs_map **new_imap;
1654 struct vs_map *new_dmap = NULL;
1655 int newdsize;
1656 int i;
1657 void *old_map = NULL;
1658 int old_map_size = 0;
1659
1660 if (vs->vs_size >= new_size) {
1661 /*
1662 * Someone has already done the work.
1663 */
1664 return 0;
1665 }
1666
1667 /*
1668 * If the new size extends into the indirect range, then we have one
1669 * of two cases: we are going from indirect to indirect, or we are
1670 * going from direct to indirect. If we are going from indirect to
1671 * indirect, then it is possible that the new size will fit in the old
1672 * indirect map. If this is the case, then just reset the size of the
1673 * vstruct map and we are done. If the new size will not
1674 * fit into the old indirect map, then we have to allocate a new
1675 * indirect map and copy the old map pointers into this new map.
1676 *
1677 * If we are going from direct to indirect, then we have to allocate a
1678 * new indirect map and copy the old direct pages into the first
1679 * indirect page of the new map.
1680 * NOTE: allocating memory here is dangerous, as we're in the
1681 * pageout path.
1682 */
1683 if (INDIRECT_CLMAP(new_size)) {
1684 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1685
1686 /*
1687 * Get a new indirect map and zero it.
1688 */
1689 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1690 if (vs->vs_indirect &&
1691 (new_map_size == old_map_size)) {
1692 bs_commit(new_size - vs->vs_size);
1693 vs->vs_size = new_size;
1694 return 0;
1695 }
1696
1697 new_imap = (struct vs_map **)kalloc(new_map_size);
1698 if (new_imap == NULL) {
1699 return -1;
1700 }
1701 memset(new_imap, 0, new_map_size);
1702
1703 if (vs->vs_indirect) {
1704 /* Copy old entries into new map */
1705 memcpy(new_imap, vs->vs_imap, old_map_size);
1706 /* Arrange to free the old map */
1707 old_map = (void *) vs->vs_imap;
1708 newdsize = 0;
1709 } else { /* Old map was a direct map */
1710 /* Allocate an indirect page */
1711 if ((new_imap[0] = (struct vs_map *)
1712 kalloc(CLMAP_THRESHOLD)) == NULL) {
1713 kfree((vm_offset_t)new_imap, new_map_size);
1714 return -1;
1715 }
1716 new_dmap = new_imap[0];
1717 newdsize = CLMAP_ENTRIES;
1718 }
1719 } else {
1720 new_imap = NULL;
1721 newdsize = new_size;
1722 /*
1723 * If the new map is a direct map, then the old map must
1724 * also have been a direct map. All we have to do is
1725 * to allocate a new direct map, copy the old entries
1726 * into it and free the old map.
1727 */
1728 if ((new_dmap = (struct vs_map *)
1729 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1730 return -1;
1731 }
1732 }
1733 if (newdsize) {
1734
1735 /* Free the old map */
1736 old_map = (void *) vs->vs_dmap;
1737 old_map_size = CLMAP_SIZE(vs->vs_size);
1738
1739 /* Copy info from the old map into the new map */
1740 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1741
1742 /* Initialize the rest of the new map */
1743 for (i = vs->vs_size; i < newdsize; i++)
1744 VSM_CLR(new_dmap[i]);
1745 }
1746 if (new_imap) {
1747 vs->vs_imap = new_imap;
1748 vs->vs_indirect = TRUE;
1749 } else
1750 vs->vs_dmap = new_dmap;
1751 bs_commit(new_size - vs->vs_size);
1752 vs->vs_size = new_size;
1753 if (old_map)
1754 kfree((vm_offset_t)old_map, old_map_size);
1755 return 0;
1756 }
1757
1758 vm_offset_t
1759 ps_clmap(
1760 vstruct_t vs,
1761 vm_offset_t offset,
1762 struct clmap *clmap,
1763 int flag,
1764 vm_size_t size,
1765 int error)
1766 {
1767 vm_offset_t cluster; /* The cluster of offset. */
1768 vm_offset_t newcl; /* The new cluster allocated. */
1769 vm_offset_t newoff;
1770 int i;
1771 struct vs_map *vsmap;
1772
1773 VS_MAP_LOCK(vs);
1774
1775 ASSERT(vs->vs_dmap);
1776 cluster = atop_32(offset) >> vs->vs_clshift;
1777
1778 /*
1779 * Initialize cluster error value
1780 */
1781 clmap->cl_error = 0;
1782
1783 /*
1784 * If the object has grown, extend the page map.
1785 */
1786 if (cluster >= vs->vs_size) {
1787 if (flag == CL_FIND) {
1788 /* Do not allocate if just doing a lookup */
1789 VS_MAP_UNLOCK(vs);
1790 return (vm_offset_t) -1;
1791 }
1792 if (ps_map_extend(vs, cluster + 1)) {
1793 VS_MAP_UNLOCK(vs);
1794 return (vm_offset_t) -1;
1795 }
1796 }
1797
1798 /*
1799 * Look for the desired cluster. If the map is indirect, then we
1800 * have a two level lookup. First find the indirect block, then
1801 * find the actual cluster. If the indirect block has not yet
1802 * been allocated, then do so. If the cluster has not yet been
1803 * allocated, then do so.
1804 *
1805 * If any of the allocations fail, then return an error.
1806 * Don't allocate if just doing a lookup.
1807 */
1808 if (vs->vs_indirect) {
1809 long ind_block = cluster/CLMAP_ENTRIES;
1810
1811 /* Is the indirect block allocated? */
1812 vsmap = vs->vs_imap[ind_block];
1813 if (vsmap == NULL) {
1814 if (flag == CL_FIND) {
1815 VS_MAP_UNLOCK(vs);
1816 return (vm_offset_t) -1;
1817 }
1818
1819 /* Allocate the indirect block */
1820 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1821 if (vsmap == NULL) {
1822 VS_MAP_UNLOCK(vs);
1823 return (vm_offset_t) -1;
1824 }
1825 /* Initialize the cluster offsets */
1826 for (i = 0; i < CLMAP_ENTRIES; i++)
1827 VSM_CLR(vsmap[i]);
1828 vs->vs_imap[ind_block] = vsmap;
1829 }
1830 } else
1831 vsmap = vs->vs_dmap;
1832
1833 ASSERT(vsmap);
1834 vsmap += cluster%CLMAP_ENTRIES;
1835
1836 /*
1837 * At this point, vsmap points to the struct vs_map desired.
1838 *
1839 * Look in the map for the cluster; if there was an error on a
1840 * previous write, flag it and return. If it is not yet
1841 * allocated, then allocate it, if we're writing; if we're
1842 * doing a lookup and the cluster's not allocated, return error.
1843 */
1844 if (VSM_ISERR(*vsmap)) {
1845 clmap->cl_error = VSM_GETERR(*vsmap);
1846 VS_MAP_UNLOCK(vs);
1847 return (vm_offset_t) -1;
1848 } else if (VSM_ISCLR(*vsmap)) {
1849 int psindex;
1850
1851 if (flag == CL_FIND) {
1852 /*
1853 * If there's an error and the entry is clear, then
1854 * we've run out of swap space. Record the error
1855 * here and return.
1856 */
1857 if (error) {
1858 VSM_SETERR(*vsmap, error);
1859 }
1860 VS_MAP_UNLOCK(vs);
1861 return (vm_offset_t) -1;
1862 } else {
1863 /*
1864 * Attempt to allocate a cluster from the paging segment
1865 */
1866 newcl = ps_allocate_cluster(vs, &psindex,
1867 PAGING_SEGMENT_NULL);
1868 if (newcl == -1) {
1869 VS_MAP_UNLOCK(vs);
1870 return (vm_offset_t) -1;
1871 }
1872 VSM_CLR(*vsmap);
1873 VSM_SETCLOFF(*vsmap, newcl);
1874 VSM_SETPS(*vsmap, psindex);
1875 }
1876 } else
1877 newcl = VSM_CLOFF(*vsmap);
1878
1879 /*
1880 * Fill in pertinent fields of the clmap
1881 */
1882 clmap->cl_ps = VSM_PS(*vsmap);
1883 clmap->cl_numpages = VSCLSIZE(vs);
1884 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1885
1886 /*
1887 * Byte offset in paging segment is byte offset to cluster plus
1888 * byte offset within cluster. It looks ugly, but should be
1889 * relatively quick.
1890 */
1891 ASSERT(trunc_page(offset) == offset);
1892 newcl = ptoa_32(newcl) << vs->vs_clshift;
1893 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
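	/*
	 * Illustration (assuming 4K pages and a cluster shift of 2): the mask
	 * is (1 << 14) - 1 == 0x3fff, so newcl becomes the byte offset of the
	 * 16K-aligned cluster within the paging segment and newoff the
	 * page-aligned byte offset within that cluster.
	 */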
1894 if (flag == CL_ALLOC) {
1895 /*
1896 * set bits in the allocation bitmap according to which
1897 * pages were requested. size is in bytes.
1898 */
1899 i = atop_32(newoff);
1900 while ((size > 0) && (i < VSCLSIZE(vs))) {
1901 VSM_SETALLOC(*vsmap, i);
1902 i++;
1903 size -= vm_page_size;
1904 }
1905 }
1906 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1907 if (newoff) {
1908 /*
1909 * Offset is not cluster aligned, so number of pages
1910 * and bitmaps must be adjusted
1911 */
1912 clmap->cl_numpages -= atop_32(newoff);
1913 CLMAP_SHIFT(clmap, vs);
1914 CLMAP_SHIFTALLOC(clmap, vs);
1915 }
1916
1917 /*
1918 *
1919 * The setting of valid bits and handling of write errors
1920 * must be done here, while we hold the lock on the map.
1921 * It logically should be done in ps_vs_write_complete().
1922 * The size and error information has been passed from
1923 * ps_vs_write_complete(). If the size parameter is non-zero,
1924 * then there is work to be done. If error is also non-zero,
1925 * then the error number is recorded in the cluster and the
1926 * entire cluster is in error.
1927 */
1928 if (size && flag == CL_FIND) {
1929 vm_offset_t off = (vm_offset_t) 0;
1930
1931 if (!error) {
1932 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1933 i++) {
1934 VSM_SETPG(*vsmap, i);
1935 size -= vm_page_size;
1936 }
1937 ASSERT(i <= VSCLSIZE(vs));
1938 } else {
1939 BS_STAT(clmap->cl_ps->ps_bs,
1940 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1941 atop_32(size));
1942 off = VSM_CLOFF(*vsmap);
1943 VSM_SETERR(*vsmap, error);
1944 }
1945 /*
1946 * Deallocate cluster if error, and no valid pages
1947 * already present.
1948 */
1949 if (off != (vm_offset_t) 0)
1950 ps_deallocate_cluster(clmap->cl_ps, off);
1951 VS_MAP_UNLOCK(vs);
1952 return (vm_offset_t) 0;
1953 } else
1954 VS_MAP_UNLOCK(vs);
1955
1956 DEBUG(DEBUG_VS_INTERNAL,
1957 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1958 newcl+newoff, (int) vs, (int) vsmap, flag));
1959 DEBUG(DEBUG_VS_INTERNAL,
1960 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1961 (int) clmap->cl_ps, clmap->cl_numpages,
1962 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1963
1964 return (newcl + newoff);
1965 }
1966
1967 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1968
1969 void
1970 ps_clunmap(
1971 vstruct_t vs,
1972 vm_offset_t offset,
1973 vm_size_t length)
1974 {
1975 vm_offset_t cluster; /* The cluster number of offset */
1976 struct vs_map *vsmap;
1977
1978 VS_MAP_LOCK(vs);
1979
1980 /*
1981 * Loop through all clusters in this range, freeing paging segment
1982 * clusters and map entries as encountered.
1983 */
1984 while (length > 0) {
1985 vm_offset_t newoff;
1986 int i;
1987
1988 cluster = atop_32(offset) >> vs->vs_clshift;
1989 if (vs->vs_indirect) /* indirect map */
1990 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
1991 else
1992 vsmap = vs->vs_dmap;
1993 if (vsmap == NULL) {
1994 VS_MAP_UNLOCK(vs);
1995 return;
1996 }
1997 vsmap += cluster%CLMAP_ENTRIES;
1998 if (VSM_ISCLR(*vsmap)) {
1999 length -= vm_page_size;
2000 offset += vm_page_size;
2001 continue;
2002 }
2003 /*
2004 * We've got a valid mapping. Clear it and deallocate
2005 * paging segment cluster pages.
2006 * Optimize for entire cluster clearing.
2007 */
2008 if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
2009 /*
2010 * Not cluster aligned.
2011 */
2012 ASSERT(trunc_page(newoff) == newoff);
2013 i = atop_32(newoff);
2014 } else
2015 i = 0;
2016 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2017 VSM_CLRPG(*vsmap, i);
2018 VSM_CLRALLOC(*vsmap, i);
2019 length -= vm_page_size;
2020 offset += vm_page_size;
2021 i++;
2022 }
2023
2024 /*
2025 * If map entry is empty, clear and deallocate cluster.
2026 */
2027 if (!VSM_ALLOC(*vsmap)) {
2028 ps_deallocate_cluster(VSM_PS(*vsmap),
2029 VSM_CLOFF(*vsmap));
2030 VSM_CLR(*vsmap);
2031 }
2032 }
2033
2034 VS_MAP_UNLOCK(vs);
2035 }
2036
2037 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2038
2039 void
2040 ps_vs_write_complete(
2041 vstruct_t vs,
2042 vm_offset_t offset,
2043 vm_size_t size,
2044 int error)
2045 {
2046 struct clmap clmap;
2047
2048 /*
2049 * Get the struct vsmap for this cluster.
2050 * Use READ, even though it was written, because the
2051 * cluster MUST be present, unless there was an error
2052 * in the original ps_clmap (e.g. no space), in which
2053 * case, nothing happens.
2054 *
2055 * Must pass enough information to ps_clmap to allow it
2056 * to set the vs_map structure bitmap under lock.
2057 */
2058 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2059 }
2060
2061 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2062
2063 void
2064 vs_cl_write_complete(
2065 vstruct_t vs,
2066 paging_segment_t ps,
2067 vm_offset_t offset,
2068 vm_offset_t addr,
2069 vm_size_t size,
2070 boolean_t async,
2071 int error)
2072 {
2073 kern_return_t kr;
2074
2075 if (error) {
2076 /*
2077 * For internal objects, the error is recorded on a
2078 * per-cluster basis by ps_clmap() which is called
2079 * by ps_vs_write_complete() below.
2080 */
2081 dprintf(("write failed error = 0x%x\n", error));
2082 /* add upl_abort code here */
2083 } else
2084 GSTAT(global_stats.gs_pages_out += atop_32(size));
2085 /*
2086 * Notify the vstruct mapping code, so it can do its accounting.
2087 */
2088 ps_vs_write_complete(vs, offset, size, error);
2089
2090 if (async) {
2091 VS_LOCK(vs);
2092 ASSERT(vs->vs_async_pending > 0);
2093 vs->vs_async_pending -= size;
2094 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2095 vs->vs_waiting_async = FALSE;
2096 VS_UNLOCK(vs);
2097 /* mutex_unlock(&vs->vs_waiting_async); */
2098 thread_wakeup(&vs->vs_async_pending);
2099 } else {
2100 VS_UNLOCK(vs);
2101 }
2102 }
2103 }
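/*
 * Minimal sketch (not from the original source) of the waiter side
 * of the handshake completed above, assuming the assert_wait/
 * thread_block pairing used elsewhere in this file; the real waiter
 * in the cluster write path may differ in detail:
 *
 *	VS_LOCK(vs);
 *	while (vs->vs_async_pending > 0) {
 *		vs->vs_waiting_async = TRUE;
 *		assert_wait(&vs->vs_async_pending, THREAD_UNINT);
 *		VS_UNLOCK(vs);
 *		thread_block(THREAD_CONTINUE_NULL);
 *		VS_LOCK(vs);
 *	}
 *	VS_UNLOCK(vs);
 */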
2104
2105 #ifdef DEVICE_PAGING
2106 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2107
2108 kern_return_t
2109 device_write_reply(
2110 MACH_PORT_FACE reply_port,
2111 kern_return_t device_code,
2112 io_buf_len_t bytes_written)
2113 {
2114 struct vs_async *vsa;
2115
2116 vsa = (struct vs_async *)
2117 ((struct vstruct_alias *)(reply_port->alias))->vs;
2118
2119 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2120 device_code = KERN_FAILURE;
2121 }
2122
2123 vsa->vsa_error = device_code;
2124
2125
2126 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2127 if(vsa->vsa_flags & VSA_TRANSFER) {
2128 /* revisit when async disk segments redone */
2129 if(vsa->vsa_error) {
2130 /* need to consider error condition. re-write data or */
2131 /* throw it away here. */
2132 vm_offset_t ioaddr;
2133 if(vm_map_copyout(kernel_map, &ioaddr,
2134 (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
2135 panic("vs_cluster_write: unable to copy source list\n");
2136 vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
2137 }
2138 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2139 vsa->vsa_size, vsa->vsa_error);
2140 } else {
2141 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2142 vsa->vsa_addr, vsa->vsa_size, TRUE,
2143 vsa->vsa_error);
2144 }
2145 VS_FREE_ASYNC(vsa);
2146
2147 return KERN_SUCCESS;
2148 }
2149
2150 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2151 kern_return_t
2152 device_write_reply_inband(
2153 MACH_PORT_FACE reply_port,
2154 kern_return_t return_code,
2155 io_buf_len_t bytes_written)
2156 {
2157 panic("device_write_reply_inband: illegal");
2158 return KERN_SUCCESS;
2159 }
2160
2161 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2162 kern_return_t
2163 device_read_reply(
2164 MACH_PORT_FACE reply_port,
2165 kern_return_t return_code,
2166 io_buf_ptr_t data,
2167 mach_msg_type_number_t dataCnt)
2168 {
2169 struct vs_async *vsa;
2170 vsa = (struct vs_async *)
2171 ((struct vstruct_alias *)(reply_port->alias))->vs;
2172 vsa->vsa_addr = (vm_offset_t)data;
2173 vsa->vsa_size = (vm_size_t)dataCnt;
2174 vsa->vsa_error = return_code;
2175 thread_wakeup(&vsa->vsa_lock);
2176 return KERN_SUCCESS;
2177 }
2178
2179 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2180 kern_return_t
2181 device_read_reply_inband(
2182 MACH_PORT_FACE reply_port,
2183 kern_return_t return_code,
2184 io_buf_ptr_inband_t data,
2185 mach_msg_type_number_t dataCnt)
2186 {
2187 panic("device_read_reply_inband: illegal");
2188 return KERN_SUCCESS;
2189 }
2190
2191 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2192 kern_return_t
2193 device_read_reply_overwrite(
2194 MACH_PORT_FACE reply_port,
2195 kern_return_t return_code,
2196 io_buf_len_t bytes_read)
2197 {
2198 panic("device_read_reply_overwrite: illegal\n");
2199 return KERN_SUCCESS;
2200 }
2201
2202 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2203 kern_return_t
2204 device_open_reply(
2205 MACH_PORT_FACE reply_port,
2206 kern_return_t return_code,
2207 MACH_PORT_FACE device_port)
2208 {
2209 panic("device_open_reply: illegal\n");
2210 return KERN_SUCCESS;
2211 }
2212
2213 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2214
2215 kern_return_t
2216 ps_read_device(
2217 paging_segment_t ps,
2218 vm_offset_t offset,
2219 vm_offset_t *bufferp,
2220 unsigned int size,
2221 unsigned int *residualp,
2222 int flags)
2223 {
2224 kern_return_t kr;
2225 recnum_t dev_offset;
2226 unsigned int bytes_wanted;
2227 unsigned int bytes_read;
2228 unsigned int total_read;
2229 vm_offset_t dev_buffer;
2230 vm_offset_t buf_ptr;
2231 unsigned int records_read;
2232 struct vs_async *vsa;
2233 mutex_t vs_waiting_read_reply;
2234
2235 device_t device;
2236 vm_map_copy_t device_data = NULL;
2237 default_pager_thread_t *dpt = NULL;
2238
2239 device = dev_port_lookup(ps->ps_device);
2240 clustered_reads[atop_32(size)]++;
2241
2242 dev_offset = (ps->ps_offset +
2243 (offset >> (vm_page_shift - ps->ps_record_shift)));
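/*
 * Worked example of the record arithmetic above (illustrative,
 * assuming 512-byte device records and vm_page_shift == 12):
 * ps_record_shift == local_log2(4096 / 512) == 3, so a byte offset
 * of 0x8000 shifts right by (12 - 3) == 9 to give record 0x40,
 * i.e. 64 records of 512 bytes == 0x8000 bytes past ps_offset.
 */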
2244 bytes_wanted = size;
2245 total_read = 0;
2246 *bufferp = (vm_offset_t)NULL;
2247
2248 do {
2249 vsa = VS_ALLOC_ASYNC();
2250 if (vsa) {
2251 vsa->vsa_vs = NULL;
2252 vsa->vsa_addr = 0;
2253 vsa->vsa_offset = 0;
2254 vsa->vsa_size = 0;
2255 vsa->vsa_ps = NULL;
2256 }
2257 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2258 ip_lock(vsa->reply_port);
2259 vsa->reply_port->ip_sorights++;
2260 ip_reference(vsa->reply_port);
2261 ip_unlock(vsa->reply_port);
2262 kr = ds_device_read_common(device,
2263 vsa->reply_port,
2264 (mach_msg_type_name_t)
2265 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2266 (dev_mode_t) 0,
2267 dev_offset,
2268 bytes_wanted,
2269 (IO_READ | IO_CALL),
2270 (io_buf_ptr_t *) &dev_buffer,
2271 (mach_msg_type_number_t *) &bytes_read);
2272 if(kr == MIG_NO_REPLY) {
2273 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2274 thread_block(THREAD_CONTINUE_NULL);
2275
2276 dev_buffer = vsa->vsa_addr;
2277 bytes_read = (unsigned int)vsa->vsa_size;
2278 kr = vsa->vsa_error;
2279 }
2280 VS_FREE_ASYNC(vsa);
2281 if (kr != KERN_SUCCESS || bytes_read == 0) {
2282 break;
2283 }
2284 total_read += bytes_read;
2285
2286 /*
2287 * If we got the entire range, use the returned dev_buffer.
2288 */
2289 if (bytes_read == size) {
2290 *bufferp = (vm_offset_t)dev_buffer;
2291 break;
2292 }
2293
2294 #if 1
2295 dprintf(("read only %d bytes out of %d\n",
2296 bytes_read, bytes_wanted));
2297 #endif
2298 if(dpt == NULL) {
2299 dpt = get_read_buffer();
2300 buf_ptr = dpt->dpt_buffer;
2301 *bufferp = (vm_offset_t)buf_ptr;
2302 }
2303 /*
2304 * Otherwise, copy the data into the provided buffer (*bufferp)
2305 * and append the rest of the range as it comes in.
2306 */
2307 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2308 buf_ptr += bytes_read;
2309 bytes_wanted -= bytes_read;
2310 records_read = (bytes_read >>
2311 (vm_page_shift - ps->ps_record_shift));
2312 dev_offset += records_read;
2313 DEBUG(DEBUG_VS_INTERNAL,
2314 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2315 dev_buffer, bytes_read));
2316 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2317 != KERN_SUCCESS)
2318 Panic("dealloc buf");
2319 } while (bytes_wanted);
2320
2321 *residualp = size - total_read;
2322 if((dev_buffer != *bufferp) && (total_read != 0)) {
2323 vm_offset_t temp_buffer;
2324 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2325 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2326 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2327 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2328 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2329 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2330 (vm_map_copy_t *)&device_data, FALSE))
2331 panic("ps_read_device: cannot copyin locally provided buffer\n");
2332 }
2333 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2334 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2335 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2336 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2337 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2338 (vm_map_copy_t *)&device_data, FALSE))
2339 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2340 }
2341 else {
2342 device_data = NULL;
2343 }
2344 *bufferp = (vm_offset_t)device_data;
2345
2346 if(dpt != NULL) {
2347 /* Free the receive buffer */
2348 dpt->checked_out = 0;
2349 thread_wakeup(&dpt_array);
2350 }
2351 return KERN_SUCCESS;
2352 }
2353
2354 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2355
2356 kern_return_t
2357 ps_write_device(
2358 paging_segment_t ps,
2359 vm_offset_t offset,
2360 vm_offset_t addr,
2361 unsigned int size,
2362 struct vs_async *vsa)
2363 {
2364 recnum_t dev_offset;
2365 io_buf_len_t bytes_to_write, bytes_written;
2366 recnum_t records_written;
2367 kern_return_t kr;
2368 MACH_PORT_FACE reply_port;
2369
2370
2371
2372 clustered_writes[atop_32(size)]++;
2373
2374 dev_offset = (ps->ps_offset +
2375 (offset >> (vm_page_shift - ps->ps_record_shift)));
2376 bytes_to_write = size;
2377
2378 if (vsa) {
2379 /*
2380 * Asynchronous write.
2381 */
2382 reply_port = vsa->reply_port;
2383 ip_lock(reply_port);
2384 reply_port->ip_sorights++;
2385 ip_reference(reply_port);
2386 ip_unlock(reply_port);
2387 {
2388 device_t device;
2389 device = dev_port_lookup(ps->ps_device);
2390
2391 vsa->vsa_addr = addr;
2392 kr=ds_device_write_common(device,
2393 reply_port,
2394 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2395 (dev_mode_t) 0,
2396 dev_offset,
2397 (io_buf_ptr_t) addr,
2398 size,
2399 (IO_WRITE | IO_CALL),
2400 &bytes_written);
2401 }
2402 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2403 if (verbose)
2404 dprintf(("%s0x%x, addr=0x%x,"
2405 "size=0x%x,offset=0x%x\n",
2406 "device_write_request returned ",
2407 kr, addr, size, offset));
2408 BS_STAT(ps->ps_bs,
2409 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2410 /* do the completion notification to free resources */
2411 device_write_reply(reply_port, kr, 0);
2412 return PAGER_ERROR;
2413 }
2414 } else do {
2415 /*
2416 * Synchronous write.
2417 */
2418 {
2419 device_t device;
2420 device = dev_port_lookup(ps->ps_device);
2421 kr=ds_device_write_common(device,
2422 IP_NULL, 0,
2423 (dev_mode_t) 0,
2424 dev_offset,
2425 (io_buf_ptr_t) addr,
2426 size,
2427 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2428 &bytes_written);
2429 }
2430 if (kr != KERN_SUCCESS) {
2431 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2432 "device_write returned ",
2433 kr, addr, size, offset));
2434 BS_STAT(ps->ps_bs,
2435 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2436 return PAGER_ERROR;
2437 }
2438 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2439 Panic("fragmented write");
2440 records_written = (bytes_written >>
2441 (vm_page_shift - ps->ps_record_shift));
2442 dev_offset += records_written;
2443 #if 1
2444 if (bytes_written != bytes_to_write) {
2445 dprintf(("wrote only %d bytes out of %d\n",
2446 bytes_written, bytes_to_write));
2447 }
2448 #endif
2449 bytes_to_write -= bytes_written;
2450 addr += bytes_written;
2451 } while (bytes_to_write > 0);
2452
2453 return PAGER_SUCCESS;
2454 }
2455
2456
2457 #else /* !DEVICE_PAGING */
2458
2459 kern_return_t
2460 ps_read_device(
2461 paging_segment_t ps,
2462 vm_offset_t offset,
2463 vm_offset_t *bufferp,
2464 unsigned int size,
2465 unsigned int *residualp,
2466 int flags)
2467 {
2468 panic("ps_read_device not supported");
2469 }
2470
kern_return_t
2471 ps_write_device(
2472 paging_segment_t ps,
2473 vm_offset_t offset,
2474 vm_offset_t addr,
2475 unsigned int size,
2476 struct vs_async *vsa)
2477 {
2478 panic("ps_write_device not supported");
2479 }
2480
2481 #endif /* DEVICE_PAGING */
2482 void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
2483
2484 void
2485 pvs_object_data_provided(
2486 vstruct_t vs,
2487 upl_t upl,
2488 vm_offset_t offset,
2489 vm_size_t size)
2490 {
2491
2492 DEBUG(DEBUG_VS_INTERNAL,
2493 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2494 upl, offset, size));
2495
2496 ASSERT(size > 0);
2497 GSTAT(global_stats.gs_pages_in += atop_32(size));
2498
2499
2500 #if USE_PRECIOUS
2501 ps_clunmap(vs, offset, size);
2502 #endif /* USE_PRECIOUS */
2503
2504 }
2505
2506 kern_return_t
2507 pvs_cluster_read(
2508 vstruct_t vs,
2509 vm_offset_t vs_offset,
2510 vm_size_t cnt)
2511 {
2512 upl_t upl;
2513 kern_return_t error = KERN_SUCCESS;
2514 int size;
2515 unsigned int residual;
2516 unsigned int request_flags;
2517 int seg_index;
2518 int pages_in_cl;
2519 int cl_size;
2520 int cl_mask;
2521 int cl_index;
2522 int xfer_size;
2523 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2524 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2525 struct clmap clmap;
2526
2527 pages_in_cl = 1 << vs->vs_clshift;
2528 cl_size = pages_in_cl * vm_page_size;
2529 cl_mask = cl_size - 1;
2530
2531 /*
2532 * This loop will be executed multiple times until the entire
2533 * request has been satisfied... if the request spans cluster
2534 * boundaries, the clusters will be checked for logical continuity;
2535 * if contiguous, the I/O request will span multiple clusters, otherwise
2536 * it will be broken up into the minimal set of I/Os.
2537 *
2538 * If there are holes in a request (either unallocated pages in a paging
2539 * segment or an unallocated paging segment), we stop
2540 * reading at the hole, inform the VM of any data read, inform
2541 * the VM of an unavailable range, then loop again, hoping to
2542 * find valid pages later in the requested range. This continues until
2543 * the entire range has been examined, and read, if present.
2544 */
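/*
 * Illustrative first-pass sizing (not from the original source),
 * assuming vs_clshift == 2: pages_in_cl == 4, cl_size == 16 KB and
 * cl_mask == 0x3fff, so a large request starting at
 * vs_offset == 0x5000 has (vs_offset & cl_mask) == 0x1000 and the
 * first pass below is trimmed by that amount, leaving every later
 * pass cluster aligned.
 */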
2545
2546 #if USE_PRECIOUS
2547 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2548 #else
2549 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2550 #endif
2551 while (cnt && (error == KERN_SUCCESS)) {
2552 int ps_info_valid;
2553 int page_list_count;
2554
2555 if((vs_offset & cl_mask) &&
2556 (cnt > (VM_SUPER_CLUSTER -
2557 (vs_offset & cl_mask)))) {
2558 size = VM_SUPER_CLUSTER;
2559 size -= vs_offset & cl_mask;
2560 } else if (cnt > VM_SUPER_CLUSTER) {
2561 size = VM_SUPER_CLUSTER;
2562 } else {
2563 size = cnt;
2564 }
2565 cnt -= size;
2566
2567 ps_info_valid = 0;
2568 seg_index = 0;
2569
2570 while (size > 0 && error == KERN_SUCCESS) {
2571 int abort_size;
2572 int failed_size;
2573 int beg_pseg;
2574 int beg_indx;
2575 vm_offset_t cur_offset;
2576
2577
2578 if ( !ps_info_valid) {
2579 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2580 psp[seg_index] = CLMAP_PS(clmap);
2581 ps_info_valid = 1;
2582 }
2583 /*
2584 * skip over unallocated physical segments
2585 */
2586 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2587 abort_size = cl_size - (vs_offset & cl_mask);
2588 abort_size = MIN(abort_size, size);
2589
2590 page_list_count = 0;
2591 memory_object_super_upl_request(
2592 vs->vs_control,
2593 (memory_object_offset_t)vs_offset,
2594 abort_size, abort_size,
2595 &upl, NULL, &page_list_count,
2596 request_flags);
2597
2598 if (clmap.cl_error) {
2599 upl_abort(upl, UPL_ABORT_ERROR);
2600 } else {
2601 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2602 }
2603 upl_deallocate(upl);
2604
2605 size -= abort_size;
2606 vs_offset += abort_size;
2607
2608 seg_index++;
2609 ps_info_valid = 0;
2610 continue;
2611 }
2612 cl_index = (vs_offset & cl_mask) / vm_page_size;
2613
2614 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2615 /*
2616 * skip over unallocated pages
2617 */
2618 if (CLMAP_ISSET(clmap, cl_index))
2619 break;
2620 abort_size += vm_page_size;
2621 }
2622 if (abort_size) {
2623 /*
2624 * Let VM system know about holes in clusters.
2625 */
2626 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2627
2628 page_list_count = 0;
2629 memory_object_super_upl_request(
2630 vs->vs_control,
2631 (memory_object_offset_t)vs_offset,
2632 abort_size, abort_size,
2633 &upl, NULL, &page_list_count,
2634 request_flags);
2635
2636 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2637 upl_deallocate(upl);
2638
2639 size -= abort_size;
2640 vs_offset += abort_size;
2641
2642 if (cl_index == pages_in_cl) {
2643 /*
2644 * if we're at the end of this physical cluster
2645 * then bump to the next one and continue looking
2646 */
2647 seg_index++;
2648 ps_info_valid = 0;
2649 continue;
2650 }
2651 if (size == 0)
2652 break;
2653 }
2654 /*
2655 * remember the starting point of the first allocated page
2656 * for the I/O we're about to issue
2657 */
2658 beg_pseg = seg_index;
2659 beg_indx = cl_index;
2660 cur_offset = vs_offset;
2661
2662 /*
2663 * calculate the size of the I/O that we can do...
2664 * this may span multiple physical segments if
2665 * they are contiguous
2666 */
2667 for (xfer_size = 0; xfer_size < size; ) {
2668
2669 while (cl_index < pages_in_cl
2670 && xfer_size < size) {
2671 /*
2672 * accumulate allocated pages within
2673 * a physical segment
2674 */
2675 if (CLMAP_ISSET(clmap, cl_index)) {
2676 xfer_size += vm_page_size;
2677 cur_offset += vm_page_size;
2678 cl_index++;
2679
2680 BS_STAT(psp[seg_index]->ps_bs,
2681 psp[seg_index]->ps_bs->bs_pages_in++);
2682 } else
2683 break;
2684 }
2685 if (cl_index < pages_in_cl
2686 || xfer_size >= size) {
2687 /*
2688 * we've hit an unallocated page or
2689 * the end of this request... go fire
2690 * the I/O
2691 */
2692 break;
2693 }
2694 /*
2695 * we've hit the end of the current physical
2696 * segment and there's more to do, so try
2697 * moving to the next one
2698 */
2699 seg_index++;
2700
2701 ps_offset[seg_index] =
2702 ps_clmap(vs,
2703 cur_offset & ~cl_mask,
2704 &clmap, CL_FIND, 0, 0);
2705 psp[seg_index] = CLMAP_PS(clmap);
2706 ps_info_valid = 1;
2707
2708 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2709 /*
2710 * if the physical segment we're about
2711 * to step into is not contiguous to
2712 * the one we're currently in, or it's
2713 * in a different paging file, or
2714 * it hasn't been allocated....
2715 * we stop here and generate the I/O
2716 */
2717 break;
2718 }
2719 /*
2720 * start with first page of the next physical
2721 * segment
2722 */
2723 cl_index = 0;
2724 }
2725 if (xfer_size) {
2726 /*
2727 * we have a contiguous range of allocated pages
2728 * to read from
2729 */
2730 page_list_count = 0;
2731 memory_object_super_upl_request(vs->vs_control,
2732 (memory_object_offset_t)vs_offset,
2733 xfer_size, xfer_size,
2734 &upl, NULL, &page_list_count,
2735 request_flags | UPL_SET_INTERNAL);
2736
2737 error = ps_read_file(psp[beg_pseg],
2738 upl, (vm_offset_t) 0,
2739 ps_offset[beg_pseg] +
2740 (beg_indx * vm_page_size),
2741 xfer_size, &residual, 0);
2742 } else
2743 continue;
2744
2745 failed_size = 0;
2746
2747 /*
2748 * Adjust counts and send response to VM. Optimize
2749 * for the common case, i.e. no error and/or partial
2750 * data. If there was an error, then we need to error
2751 * the entire range, even if some data was successfully
2752 * read. If there was a partial read, we may supply some
2753 * data and error the remainder. In all cases the
2754 * VM must receive some notification for every page in the
2755 * range.
2756 */
2757 if ((error == KERN_SUCCESS) && (residual == 0)) {
2758 /*
2759 * Got everything we asked for, supply the data
2760 * to the VM. Note that as a side effect of
2761 * supplying the data, the buffer holding the
2762 * supplied data is deallocated from the pager's
2763 * address space.
2764 */
2765 pvs_object_data_provided(
2766 vs, upl, vs_offset, xfer_size);
2767 } else {
2768 failed_size = xfer_size;
2769
2770 if (error == KERN_SUCCESS) {
2771 if (residual == xfer_size) {
2772 /*
2773 * If a read operation returns no error
2774 * and no data moved, we turn it into
2775 * an error, assuming we're reading at
2776 * or beyond EOF.
2777 * Fall through and error the entire
2778 * range.
2779 */
2780 error = KERN_FAILURE;
2781 } else {
2782 /*
2783 * Otherwise, we have a partial read. If
2784 * the part read is an integral number
2785 * of pages supply it. Otherwise round
2786 * it up to a page boundary, zero fill
2787 * the unread part, and supply it.
2788 * Fall through and error the remainder
2789 * of the range, if any.
2790 */
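/*
 * Worked instance of the rounding below (illustrative, assuming
 * 4 KB pages): with xfer_size == 16384 and residual == 9216 the
 * read portion is 7168 bytes, fill == 9216 & 4095 == 1024, and
 * lsize == 8192, so two whole pages are supplied and the remaining
 * 8192 bytes fall through as failed_size with KERN_FAILURE.
 */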
2791 int fill, lsize;
2792
2793 fill = residual
2794 & (vm_page_size - 1);
2795 lsize = (xfer_size - residual)
2796 + fill;
2797 pvs_object_data_provided(
2798 vs, upl,
2799 vs_offset, lsize);
2800
2801 if (lsize < xfer_size) {
2802 failed_size =
2803 xfer_size - lsize;
2804 error = KERN_FAILURE;
2805 }
2806 }
2807 }
2808 }
2809 /*
2810 * If there was an error in any part of the range, tell
2811 * the VM. Note that error is explicitly checked again
2812 * since it can be modified above.
2813 */
2814 if (error != KERN_SUCCESS) {
2815 BS_STAT(psp[beg_pseg]->ps_bs,
2816 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2817 += atop_32(failed_size));
2818 }
2819 size -= xfer_size;
2820 vs_offset += xfer_size;
2821 }
2822
2823 } /* END while (cnt && (error == 0)) */
2824 return error;
2825 }
2826
2827 int vs_do_async_write = 1;
2828
2829 kern_return_t
2830 vs_cluster_write(
2831 vstruct_t vs,
2832 upl_t internal_upl,
2833 vm_offset_t offset,
2834 vm_size_t cnt,
2835 boolean_t dp_internal,
2836 int flags)
2837 {
2838 vm_offset_t size;
2839 vm_offset_t transfer_size;
2840 int error = 0;
2841 struct clmap clmap;
2842
2843 vm_offset_t actual_offset; /* Offset within paging segment */
2844 paging_segment_t ps;
2845 vm_offset_t subx_size;
2846 vm_offset_t mobj_base_addr;
2847 vm_offset_t mobj_target_addr;
2848 int mobj_size;
2849
2850 struct vs_async *vsa;
2851 vm_map_copy_t copy;
2852
2853 upl_t upl;
2854 upl_page_info_t *pl;
2855 int page_index;
2856 int list_size;
2857 int cl_size;
2858
2859 if (!dp_internal) {
2860 int page_list_count;
2861 int request_flags;
2862 int super_size;
2863 int first_dirty;
2864 int num_dirty;
2865 int num_of_pages;
2866 int seg_index;
2867 int pages_in_cl;
2868 int must_abort;
2869 vm_offset_t upl_offset;
2870 vm_offset_t seg_offset;
2871 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2872 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2873
2874
2875 pages_in_cl = 1 << vs->vs_clshift;
2876 cl_size = pages_in_cl * vm_page_size;
2877
2878 if (bs_low) {
2879 super_size = cl_size;
2880
2881 request_flags = UPL_NOBLOCK |
2882 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2883 UPL_NO_SYNC | UPL_SET_INTERNAL;
2884 } else {
2885 super_size = VM_SUPER_CLUSTER;
2886
2887 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2888 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2889 UPL_NO_SYNC | UPL_SET_INTERNAL;
2890 }
2891
2892 page_list_count = 0;
2893 memory_object_super_upl_request(vs->vs_control,
2894 (memory_object_offset_t)offset,
2895 cnt, super_size,
2896 &upl, NULL, &page_list_count,
2897 request_flags | UPL_FOR_PAGEOUT);
2898
2899 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2900
2901 for (seg_index = 0, transfer_size = upl->size;
2902 transfer_size > 0; ) {
2903
2904 ps_offset[seg_index] =
2905 ps_clmap(vs, upl->offset + (seg_index * cl_size),
2906 &clmap, CL_ALLOC,
2907 transfer_size < cl_size ?
2908 transfer_size : cl_size, 0);
2909
2910 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2911 upl_abort(upl, 0);
2912 upl_deallocate(upl);
2913
2914 return KERN_FAILURE;
2915
2916 }
2917 psp[seg_index] = CLMAP_PS(clmap);
2918
2919 if (transfer_size > cl_size) {
2920 transfer_size -= cl_size;
2921 seg_index++;
2922 } else
2923 transfer_size = 0;
2924 }
2925 for (page_index = 0,
2926 num_of_pages = upl->size / vm_page_size;
2927 page_index < num_of_pages; ) {
2928 /*
2929 * skip over non-dirty pages
2930 */
2931 for ( ; page_index < num_of_pages; page_index++) {
2932 if (UPL_DIRTY_PAGE(pl, page_index)
2933 || UPL_PRECIOUS_PAGE(pl, page_index))
2934 /*
2935 * this is a page we need to write
2936 * go see if we can buddy it up with
2937 * others that are contiguous to it
2938 */
2939 break;
2940 /*
2941 * if the page is not dirty but present, we
2942 * need to commit it... This is an unusual
2943 * case since we only asked for dirty pages
2944 */
2945 if (UPL_PAGE_PRESENT(pl, page_index)) {
2946 boolean_t empty = FALSE;
2947 upl_commit_range(upl,
2948 page_index * vm_page_size,
2949 vm_page_size,
2950 UPL_COMMIT_NOTIFY_EMPTY,
2951 pl,
2952 page_list_count,
2953 &empty);
2954 if (empty)
2955 upl_deallocate(upl);
2956 }
2957 }
2958 if (page_index == num_of_pages)
2959 /*
2960 * no more pages to look at, we're out of here
2961 */
2962 break;
2963
2964 /*
2965 * gather up contiguous dirty pages... we have at
2966 * least 1 otherwise we would have bailed above
2967 * make sure that each physical segment that we step
2968 * into is contiguous to the one we're currently in
2969 * if it's not, we have to stop and write what we have
2970 */
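/*
 * Illustrative run of the gathering loop below (not from the
 * original source), assuming pages_in_cl == 4: if UPL pages 2
 * through 8 are dirty and the clusters backing segments 0, 1 and 2
 * are contiguous in the same paging file, the loop ends with
 * first_dirty == 2 and page_index == 9 and a single 7-page write is
 * issued; had segment 2 not been contiguous, the loop would have
 * stopped at page_index == 8 and the remainder would go out in a
 * second write.
 */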
2971 for (first_dirty = page_index;
2972 page_index < num_of_pages; ) {
2973 if ( !UPL_DIRTY_PAGE(pl, page_index)
2974 && !UPL_PRECIOUS_PAGE(pl, page_index))
2975 break;
2976 page_index++;
2977 /*
2978 * if we just looked at the last page in the UPL
2979 * we don't need to check for physical segment
2980 * continuity
2981 */
2982 if (page_index < num_of_pages) {
2983 int cur_seg;
2984 int nxt_seg;
2985
2986 cur_seg =
2987 (page_index - 1) / pages_in_cl;
2988 nxt_seg = page_index / pages_in_cl;
2989
2990 if (cur_seg != nxt_seg) {
2991 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
2992 /*
2993 * if the segment we're about
2994 * to step into is not
2995 * contiguous to the one we're
2996 * currently in, or it's in a
2997 * different paging file....
2998 * we stop here and generate
2999 * the I/O
3000 */
3001 break;
3002 }
3003 }
3004 }
3005 num_dirty = page_index - first_dirty;
3006 must_abort = 1;
3007
3008 if (num_dirty) {
3009 upl_offset = first_dirty * vm_page_size;
3010 seg_index = first_dirty / pages_in_cl;
3011 seg_offset = upl_offset - (seg_index * cl_size);
3012 transfer_size = num_dirty * vm_page_size;
3013
3014
3015 while (transfer_size) {
3016 int seg_size;
3017
3018 if ((seg_size = cl_size -
3019 (upl_offset % cl_size))
3020 > transfer_size)
3021 seg_size = transfer_size;
3022
3023 ps_vs_write_complete(vs,
3024 upl->offset + upl_offset,
3025 seg_size, error);
3026
3027 transfer_size -= seg_size;
3028 upl_offset += seg_size;
3029 }
3030 upl_offset = first_dirty * vm_page_size;
3031 transfer_size = num_dirty * vm_page_size;
3032 error = ps_write_file(psp[seg_index],
3033 upl, upl_offset,
3034 ps_offset[seg_index]
3035 + seg_offset,
3036 transfer_size, flags);
3037 must_abort = 0;
3038 }
3039 if (must_abort) {
3040 boolean_t empty = FALSE;
3041 upl_abort_range(upl,
3042 first_dirty * vm_page_size,
3043 num_dirty * vm_page_size,
3044 UPL_ABORT_NOTIFY_EMPTY,
3045 &empty);
3046 if (empty)
3047 upl_deallocate(upl);
3048 }
3049 }
3050
3051 } else {
3052 assert(cnt <= (vm_page_size << vs->vs_clshift));
3053 list_size = cnt;
3054
3055 page_index = 0;
3056 /* The caller provides a mapped_data which is derived */
3057 /* from a temporary object. The targeted pages are */
3058 /* guaranteed to be set at offset 0 in the mapped_data */
3059 /* The actual offset however must still be derived */
3060 /* from the offset in the vs in question */
3061 mobj_base_addr = offset;
3062 mobj_target_addr = mobj_base_addr;
3063
3064 for (transfer_size = list_size; transfer_size != 0;) {
3065 actual_offset = ps_clmap(vs, mobj_target_addr,
3066 &clmap, CL_ALLOC,
3067 transfer_size < cl_size ?
3068 transfer_size : cl_size, 0);
3069 if(actual_offset == (vm_offset_t) -1) {
3070 error = 1;
3071 break;
3072 }
3073 cnt = MIN(transfer_size,
3074 CLMAP_NPGS(clmap) * vm_page_size);
3075 ps = CLMAP_PS(clmap);
3076 /* Assume that the caller has given us contiguous */
3077 /* pages */
3078 if(cnt) {
3079 ps_vs_write_complete(vs, mobj_target_addr,
3080 cnt, error);
3081 error = ps_write_file(ps, internal_upl,
3082 0, actual_offset,
3083 cnt, flags);
3084 if (error)
3085 break;
3086 }
3087 if (error)
3088 break;
3089 actual_offset += cnt;
3090 mobj_target_addr += cnt;
3091 transfer_size -= cnt;
3092 cnt = 0;
3093
3094 if (error)
3095 break;
3096 }
3097 }
3098 if(error)
3099 return KERN_FAILURE;
3100 else
3101 return KERN_SUCCESS;
3102 }
3103
3104 vm_size_t
3105 ps_vstruct_allocated_size(
3106 vstruct_t vs)
3107 {
3108 int num_pages;
3109 struct vs_map *vsmap;
3110 int i, j, k;
3111
3112 num_pages = 0;
3113 if (vs->vs_indirect) {
3114 /* loop on indirect maps */
3115 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3116 vsmap = vs->vs_imap[i];
3117 if (vsmap == NULL)
3118 continue;
3119 /* loop on clusters in this indirect map */
3120 for (j = 0; j < CLMAP_ENTRIES; j++) {
3121 if (VSM_ISCLR(vsmap[j]) ||
3122 VSM_ISERR(vsmap[j]))
3123 continue;
3124 /* loop on pages in this cluster */
3125 for (k = 0; k < VSCLSIZE(vs); k++) {
3126 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3127 num_pages++;
3128 }
3129 }
3130 }
3131 } else {
3132 vsmap = vs->vs_dmap;
3133 if (vsmap == NULL)
3134 return 0;
3135 /* loop on clusters in the direct map */
3136 for (j = 0; j < CLMAP_ENTRIES; j++) {
3137 if (VSM_ISCLR(vsmap[j]) ||
3138 VSM_ISERR(vsmap[j]))
3139 continue;
3140 /* loop on pages in this cluster */
3141 for (k = 0; k < VSCLSIZE(vs); k++) {
3142 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3143 num_pages++;
3144 }
3145 }
3146 }
3147
3148 return ptoa_32(num_pages);
3149 }
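/*
 * Illustrative count (not from the original source): with
 * VSCLSIZE(vs) == 4, a cluster whose VSM_BMAP() value is 0x0b
 * (binary 1011) contributes three pages to num_pages above, so a
 * vstruct consisting of just that cluster reports
 * ptoa_32(3) == 12 KB on 4 KB pages.
 */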
3150
3151 size_t
3152 ps_vstruct_allocated_pages(
3153 vstruct_t vs,
3154 default_pager_page_t *pages,
3155 size_t pages_size)
3156 {
3157 int num_pages;
3158 struct vs_map *vsmap;
3159 vm_offset_t offset;
3160 int i, j, k;
3161
3162 num_pages = 0;
3163 offset = 0;
3164 if (vs->vs_indirect) {
3165 /* loop on indirect maps */
3166 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3167 vsmap = vs->vs_imap[i];
3168 if (vsmap == NULL) {
3169 offset += (vm_page_size * CLMAP_ENTRIES *
3170 VSCLSIZE(vs));
3171 continue;
3172 }
3173 /* loop on clusters in this indirect map */
3174 for (j = 0; j < CLMAP_ENTRIES; j++) {
3175 if (VSM_ISCLR(vsmap[j]) ||
3176 VSM_ISERR(vsmap[j])) {
3177 offset += vm_page_size * VSCLSIZE(vs);
3178 continue;
3179 }
3180 /* loop on pages in this cluster */
3181 for (k = 0; k < VSCLSIZE(vs); k++) {
3182 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3183 num_pages++;
3184 if (num_pages < pages_size)
3185 pages++->dpp_offset =
3186 offset;
3187 }
3188 offset += vm_page_size;
3189 }
3190 }
3191 }
3192 } else {
3193 vsmap = vs->vs_dmap;
3194 if (vsmap == NULL)
3195 return 0;
3196 /* loop on clusters in the direct map */
3197 for (j = 0; j < CLMAP_ENTRIES; j++) {
3198 if (VSM_ISCLR(vsmap[j]) ||
3199 VSM_ISERR(vsmap[j])) {
3200 offset += vm_page_size * VSCLSIZE(vs);
3201 continue;
3202 }
3203 /* loop on pages in this cluster */
3204 for (k = 0; k < VSCLSIZE(vs); k++) {
3205 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3206 num_pages++;
3207 if (num_pages < pages_size)
3208 pages++->dpp_offset = offset;
3209 }
3210 offset += vm_page_size;
3211 }
3212 }
3213 }
3214
3215 return num_pages;
3216 }
3217
3218
3219 kern_return_t
3220 ps_vstruct_transfer_from_segment(
3221 vstruct_t vs,
3222 paging_segment_t segment,
3223 upl_t upl)
3224 {
3225 struct vs_map *vsmap;
3226 struct vs_map old_vsmap;
3227 struct vs_map new_vsmap;
3228 int i, j, k;
3229
3230 VS_LOCK(vs); /* block all work on this vstruct */
3231 /* can't allow the normal multiple write */
3232 /* semantic because writes may conflict */
3233 vs->vs_xfer_pending = TRUE;
3234 vs_wait_for_sync_writers(vs);
3235 vs_start_write(vs);
3236 vs_wait_for_readers(vs);
3237 /* we will unlock the vs to allow other writes while transferring */
3238 /* and will be guaranteed the persistence of the vs struct */
3239 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3240 /* vs_async_pending */
3241 /* OK we now have guaranteed no other parties are accessing this */
3242 /* vs. Now that we are also supporting simple lock versions of */
3243 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3244 /* our purpose in holding it before was the multiple write case */
3245 /* we now use the boolean xfer_pending to do that. We can use */
3246 /* a boolean instead of a count because we have guaranteed single */
3247 /* file access to this code in its caller */
3248 VS_UNLOCK(vs);
3249 vs_changed:
3250 if (vs->vs_indirect) {
3251 int vsmap_size;
3252 int clmap_off;
3253 /* loop on indirect maps */
3254 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3255 vsmap = vs->vs_imap[i];
3256 if (vsmap == NULL)
3257 continue;
3258 /* loop on clusters in this indirect map */
3259 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3260 VSCLSIZE(vs) * i);
3261 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3262 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3263 else
3264 vsmap_size = CLMAP_ENTRIES;
3265 for (j = 0; j < vsmap_size; j++) {
3266 if (VSM_ISCLR(vsmap[j]) ||
3267 VSM_ISERR(vsmap[j]) ||
3268 (VSM_PS(vsmap[j]) != segment))
3269 continue;
3270 if(vs_cluster_transfer(vs,
3271 (vm_page_size * (j << vs->vs_clshift))
3272 + clmap_off,
3273 vm_page_size << vs->vs_clshift,
3274 upl)
3275 != KERN_SUCCESS) {
3276 VS_LOCK(vs);
3277 vs->vs_xfer_pending = FALSE;
3278 VS_UNLOCK(vs);
3279 vs_finish_write(vs);
3280 return KERN_FAILURE;
3281 }
3282 /* allow other readers/writers during transfer*/
3283 VS_LOCK(vs);
3284 vs->vs_xfer_pending = FALSE;
3285 VS_UNLOCK(vs);
3286 vs_finish_write(vs);
3287 VS_LOCK(vs);
3288 vs->vs_xfer_pending = TRUE;
3289 vs_wait_for_sync_writers(vs);
3290 vs_start_write(vs);
3291 vs_wait_for_readers(vs);
3292 VS_UNLOCK(vs);
3293 if (!(vs->vs_indirect)) {
3294 goto vs_changed;
3295 }
3296 }
3297 }
3298 } else {
3299 vsmap = vs->vs_dmap;
3300 if (vsmap == NULL) {
3301 VS_LOCK(vs);
3302 vs->vs_xfer_pending = FALSE;
3303 VS_UNLOCK(vs);
3304 vs_finish_write(vs);
3305 return KERN_SUCCESS;
3306 }
3307 /* loop on clusters in the direct map */
3308 for (j = 0; j < vs->vs_size; j++) {
3309 if (VSM_ISCLR(vsmap[j]) ||
3310 VSM_ISERR(vsmap[j]) ||
3311 (VSM_PS(vsmap[j]) != segment))
3312 continue;
3313 if(vs_cluster_transfer(vs,
3314 vm_page_size * (j << vs->vs_clshift),
3315 vm_page_size << vs->vs_clshift,
3316 upl) != KERN_SUCCESS) {
3317 VS_LOCK(vs);
3318 vs->vs_xfer_pending = FALSE;
3319 VS_UNLOCK(vs);
3320 vs_finish_write(vs);
3321 return KERN_FAILURE;
3322 }
3323 /* allow other readers/writers during transfer*/
3324 VS_LOCK(vs);
3325 vs->vs_xfer_pending = FALSE;
3326 VS_UNLOCK(vs);
3327 vs_finish_write(vs);
3328 VS_LOCK(vs);
3329 vs->vs_xfer_pending = TRUE;
3330 VS_UNLOCK(vs);
3331 vs_wait_for_sync_writers(vs);
3332 vs_start_write(vs);
3333 vs_wait_for_readers(vs);
3334 if (vs->vs_indirect) {
3335 goto vs_changed;
3336 }
3337 }
3338 }
3339
3340 VS_LOCK(vs);
3341 vs->vs_xfer_pending = FALSE;
3342 VS_UNLOCK(vs);
3343 vs_finish_write(vs);
3344 return KERN_SUCCESS;
3345 }
3346
3347
3348
3349 vs_map_t
3350 vs_get_map_entry(
3351 vstruct_t vs,
3352 vm_offset_t offset)
3353 {
3354 struct vs_map *vsmap;
3355 vm_offset_t cluster;
3356
3357 cluster = atop_32(offset) >> vs->vs_clshift;
3358 if (vs->vs_indirect) {
3359 long ind_block = cluster/CLMAP_ENTRIES;
3360
3361 /* Is the indirect block allocated? */
3362 vsmap = vs->vs_imap[ind_block];
3363 if(vsmap == (vs_map_t) NULL)
3364 return vsmap;
3365 } else
3366 vsmap = vs->vs_dmap;
3367 vsmap += cluster%CLMAP_ENTRIES;
3368 return vsmap;
3369 }
3370
3371 kern_return_t
3372 vs_cluster_transfer(
3373 vstruct_t vs,
3374 vm_offset_t offset,
3375 vm_size_t cnt,
3376 upl_t upl)
3377 {
3378 vm_offset_t actual_offset;
3379 paging_segment_t ps;
3380 struct clmap clmap;
3381 kern_return_t error = KERN_SUCCESS;
3382 int size, size_wanted, i;
3383 unsigned int residual;
3384 int unavail_size;
3385 default_pager_thread_t *dpt;
3386 boolean_t dealloc;
3387 struct vs_map *vsmap_ptr;
3388 struct vs_map read_vsmap;
3389 struct vs_map original_read_vsmap;
3390 struct vs_map write_vsmap;
3391 upl_t sync_upl;
3392 vm_offset_t ioaddr;
3393
3394 /* vs_cluster_transfer reads in the pages of a cluster and
3395 * then writes these pages back to new backing store. The
3396 * segment the pages are being read from is assumed to have
3397 * been taken off-line and is no longer considered for new
3398 * space requests.
3399 */
3400
3401 /*
3402 * This loop will be executed once per cluster referenced.
3403 * Typically this means once, since it's unlikely that the
3404 * VM system will ask for anything spanning cluster boundaries.
3405 *
3406 * If there are holes in a cluster (in a paging segment), we stop
3407 * reading at the hole, then loop again, hoping to
3408 * find valid pages later in the cluster. This continues until
3409 * the entire range has been examined, and read, if present. The
3410 * pages are written as they are read. If a failure occurs after
3411 * some pages are written the unmap call at the bottom of the loop
3412 * recovers the backing store and the old backing store remains
3413 * in effect.
3414 */
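/*
 * Illustrative iteration (not from the original source), assuming
 * 16 KB clusters (vs_clshift == 2): if ps_clmap() reports no
 * backing for the cluster containing offset 0x9000, then
 * clmask == 0x3fff and local_size == 0x3000 (capped by cnt), so the
 * loop below skips the remaining three pages of that cluster and
 * retries at the next cluster boundary; where backing does exist,
 * the pages are read (ps_read_file() for file segments) and
 * immediately rewritten to the new segment through
 * vs_cluster_write().
 */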
3415
3416 VSM_CLR(write_vsmap);
3417 VSM_CLR(original_read_vsmap);
3418 /* grab the actual object's pages to sync with I/O */
3419 while (cnt && (error == KERN_SUCCESS)) {
3420 vsmap_ptr = vs_get_map_entry(vs, offset);
3421 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3422
3423 if (actual_offset == (vm_offset_t) -1) {
3424
3425 /*
3426 * Nothing left to write in this cluster at least
3427 * set write cluster information for any previous
3428 * write, clear for next cluster, if there is one
3429 */
3430 unsigned int local_size, clmask, clsize;
3431
3432 clsize = vm_page_size << vs->vs_clshift;
3433 clmask = clsize - 1;
3434 local_size = clsize - (offset & clmask);
3435 ASSERT(local_size);
3436 local_size = MIN(local_size, cnt);
3437
3438 /* This cluster has no data in it beyond what may */
3439 /* have been found on a previous iteration through */
3440 /* the loop "write_vsmap" */
3441 *vsmap_ptr = write_vsmap;
3442 VSM_CLR(write_vsmap);
3443 VSM_CLR(original_read_vsmap);
3444
3445 cnt -= local_size;
3446 offset += local_size;
3447 continue;
3448 }
3449
3450 /*
3451 * Count up contiguous available or unavailable
3452 * pages.
3453 */
3454 ps = CLMAP_PS(clmap);
3455 ASSERT(ps);
3456 size = 0;
3457 unavail_size = 0;
3458 for (i = 0;
3459 (size < cnt) && (unavail_size < cnt) &&
3460 (i < CLMAP_NPGS(clmap)); i++) {
3461 if (CLMAP_ISSET(clmap, i)) {
3462 if (unavail_size != 0)
3463 break;
3464 size += vm_page_size;
3465 BS_STAT(ps->ps_bs,
3466 ps->ps_bs->bs_pages_in++);
3467 } else {
3468 if (size != 0)
3469 break;
3470 unavail_size += vm_page_size;
3471 }
3472 }
3473
3474 if (size == 0) {
3475 ASSERT(unavail_size);
3476 cnt -= unavail_size;
3477 offset += unavail_size;
3478 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3479 == 0) {
3480 /* There is no more to transfer in this
3481 cluster
3482 */
3483 *vsmap_ptr = write_vsmap;
3484 VSM_CLR(write_vsmap);
3485 VSM_CLR(original_read_vsmap);
3486 }
3487 continue;
3488 }
3489
3490 if(VSM_ISCLR(original_read_vsmap))
3491 original_read_vsmap = *vsmap_ptr;
3492
3493 if(ps->ps_segtype == PS_PARTITION) {
3494 /*
3495 NEED TO ISSUE WITH SYNC & NO COMMIT
3496 error = ps_read_device(ps, actual_offset, &buffer,
3497 size, &residual, flags);
3498 */
3499 } else {
3500 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3501 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
3502 size, &residual,
3503 (UPL_IOSYNC | UPL_NOCOMMIT));
3504 }
3505
3506 read_vsmap = *vsmap_ptr;
3507
3508
3509 /*
3510 * Adjust counts and put data in new BS. Optimize for the
3511 * common case, i.e. no error and/or partial data.
3512 * If there was an error, then we need to error the entire
3513 * range, even if some data was successfully read.
3514 *
3515 */
3516 if ((error == KERN_SUCCESS) && (residual == 0)) {
3517 int page_list_count = 0;
3518
3519 /*
3520 * Got everything we asked for, supply the data to
3521 * the new BS. Note that as a side effect of supplying
3522 * the data, the buffer holding the supplied data is
3523 * deallocated from the pager's address space unless
3524 * the write is unsuccessful.
3525 */
3526
3527 /* note buffer will be cleaned up in all cases by */
3528 /* internal_cluster_write or if an error on write */
3529 /* the vm_map_copy_page_discard call */
3530 *vsmap_ptr = write_vsmap;
3531
3532 if(vs_cluster_write(vs, upl, offset,
3533 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3534 error = KERN_FAILURE;
3535 if(!(VSM_ISCLR(*vsmap_ptr))) {
3536 /* unmap the new backing store object */
3537 ps_clunmap(vs, offset, size);
3538 }
3539 /* original vsmap */
3540 *vsmap_ptr = original_read_vsmap;
3541 VSM_CLR(write_vsmap);
3542 } else {
3543 if((offset + size) &
3544 ((vm_page_size << vs->vs_clshift)
3545 - 1)) {
3546 /* There is more to transfer in this
3547 cluster
3548 */
3549 write_vsmap = *vsmap_ptr;
3550 *vsmap_ptr = read_vsmap;
3551 } else {
3552 /* discard the old backing object */
3553 write_vsmap = *vsmap_ptr;
3554 *vsmap_ptr = read_vsmap;
3555 ps_clunmap(vs, offset, size);
3556 *vsmap_ptr = write_vsmap;
3557 VSM_CLR(write_vsmap);
3558 VSM_CLR(original_read_vsmap);
3559 }
3560 }
3561 } else {
3562 size_wanted = size;
3563 if (error == KERN_SUCCESS) {
3564 if (residual == size) {
3565 /*
3566 * If a read operation returns no error
3567 * and no data moved, we turn it into
3568 * an error, assuming we're reading at
3569 * or beyond EOF.
3570 * Fall through and error the entire
3571 * range.
3572 */
3573 error = KERN_FAILURE;
3574 *vsmap_ptr = write_vsmap;
3575 if(!(VSM_ISCLR(*vsmap_ptr))) {
3576 /* unmap the new backing store object */
3577 ps_clunmap(vs, offset, size);
3578 }
3579 *vsmap_ptr = original_read_vsmap;
3580 VSM_CLR(write_vsmap);
3581 continue;
3582 } else {
3583 /*
3584 * Otherwise, we have partial read.
3585 * This is also considered an error
3586 * for the purposes of cluster transfer
3587 */
3588 error = KERN_FAILURE;
3589 *vsmap_ptr = write_vsmap;
3590 if(!(VSM_ISCLR(*vsmap_ptr))) {
3591 /* unmap the new backing store object */
3592 ps_clunmap(vs, offset, size);
3593 }
3594 *vsmap_ptr = original_read_vsmap;
3595 VSM_CLR(write_vsmap);
3596 continue;
3597 }
3598 }
3599
3600 }
3601 cnt -= size;
3602 offset += size;
3603
3604 } /* END while (cnt && (error == 0)) */
3605 if(!VSM_ISCLR(write_vsmap))
3606 *vsmap_ptr = write_vsmap;
3607
3608 return error;
3609 }
3610
3611 kern_return_t
3612 default_pager_add_file(MACH_PORT_FACE backing_store,
3613 int *vp,
3614 int record_size,
3615 long size)
3616 {
3617 backing_store_t bs;
3618 paging_segment_t ps;
3619 int i;
3620 int error;
3621
3622 if ((bs = backing_store_lookup(backing_store))
3623 == BACKING_STORE_NULL)
3624 return KERN_INVALID_ARGUMENT;
3625
3626 PSL_LOCK();
3627 for (i = 0; i <= paging_segment_max; i++) {
3628 ps = paging_segments[i];
3629 if (ps == PAGING_SEGMENT_NULL)
3630 continue;
3631 if (ps->ps_segtype != PS_FILE)
3632 continue;
3633
3634 /*
3635 * Check for overlap on same device.
3636 */
3637 if (ps->ps_vnode == (struct vnode *)vp) {
3638 PSL_UNLOCK();
3639 BS_UNLOCK(bs);
3640 return KERN_INVALID_ARGUMENT;
3641 }
3642 }
3643 PSL_UNLOCK();
3644
3645 /*
3646 * Set up the paging segment
3647 */
3648 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3649 if (ps == PAGING_SEGMENT_NULL) {
3650 BS_UNLOCK(bs);
3651 return KERN_RESOURCE_SHORTAGE;
3652 }
3653
3654 ps->ps_segtype = PS_FILE;
3655 ps->ps_vnode = (struct vnode *)vp;
3656 ps->ps_offset = 0;
3657 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3658 ps->ps_recnum = size;
3659 ps->ps_pgnum = size >> ps->ps_record_shift;
3660
3661 ps->ps_pgcount = ps->ps_pgnum;
3662 ps->ps_clshift = local_log2(bs->bs_clsize);
3663 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3664 ps->ps_hint = 0;
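/*
 * Worked example of the sizing arithmetic above (illustrative,
 * assuming 512-byte records, 4 KB pages and bs_clsize == 4):
 * ps_record_shift == 3, so a file of 262144 records holds
 * ps_pgnum == 32768 pages (128 MB); with ps_clshift == 2 that
 * yields ps_clcount == ps_ncls == 8192 clusters.
 */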
3665
3666 PS_LOCK_INIT(ps);
3667 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3668 if (!ps->ps_bmap) {
3669 kfree((vm_offset_t)ps, sizeof *ps);
3670 BS_UNLOCK(bs);
3671 return KERN_RESOURCE_SHORTAGE;
3672 }
3673 for (i = 0; i < ps->ps_ncls; i++) {
3674 clrbit(ps->ps_bmap, i);
3675 }
3676
3677 ps->ps_going_away = FALSE;
3678 ps->ps_bs = bs;
3679
3680 if ((error = ps_enter(ps)) != 0) {
3681 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3682 kfree((vm_offset_t)ps, sizeof *ps);
3683 BS_UNLOCK(bs);
3684 return KERN_RESOURCE_SHORTAGE;
3685 }
3686
3687 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3688 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3689 PSL_LOCK();
3690 dp_pages_free += ps->ps_pgcount;
3691 PSL_UNLOCK();
3692
3693 BS_UNLOCK(bs);
3694
3695 bs_more_space(ps->ps_clcount);
3696
3697 DEBUG(DEBUG_BS_INTERNAL,
3698 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3699 vp, ps->ps_offset, size, record_size,
3700 ps->ps_record_shift, ps->ps_pgnum));
3701
3702 return KERN_SUCCESS;
3703 }
3704
3705
3706
3707 kern_return_t
3708 ps_read_file(
3709 paging_segment_t ps,
3710 upl_t upl,
3711 vm_offset_t upl_offset,
3712 vm_offset_t offset,
3713 unsigned int size,
3714 unsigned int *residualp,
3715 int flags)
3716 {
3717 vm_object_offset_t f_offset;
3718 int error = 0;
3719 int result;
3720
3721
3722 clustered_reads[atop_32(size)]++;
3723
3724 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3725
3726 /* for transfer case we need to pass uploffset and flags */
3727 error = vnode_pagein(ps->ps_vnode,
3728 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3729
3730 /* The vnode_pagein semantic is somewhat at odds with the existing */
3731 /* device_read semantic. Partial reads are not experienced at this */
3732 /* level. It is up to the bit map code and cluster read code to */
3733 /* check that requested data locations are actually backed, and the */
3734 /* pagein code to either read all of the requested data or return an */
3735 /* error. */
3736
3737 if (error)
3738 result = KERN_FAILURE;
3739 else {
3740 *residualp = 0;
3741 result = KERN_SUCCESS;
3742 }
3743 return result;
3744 }
3745
3746 kern_return_t
3747 ps_write_file(
3748 paging_segment_t ps,
3749 upl_t upl,
3750 vm_offset_t upl_offset,
3751 vm_offset_t offset,
3752 unsigned int size,
3753 int flags)
3754 {
3755 vm_object_offset_t f_offset;
3756 kern_return_t result;
3757
3758 int error = 0;
3759
3760 clustered_writes[atop_32(size)]++;
3761 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3762
3763 if (vnode_pageout(ps->ps_vnode,
3764 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3765 result = KERN_FAILURE;
3766 else
3767 result = KERN_SUCCESS;
3768
3769 return result;
3770 }
3771
3772 kern_return_t
3773 default_pager_triggers(MACH_PORT_FACE default_pager,
3774 int hi_wat,
3775 int lo_wat,
3776 int flags,
3777 MACH_PORT_FACE trigger_port)
3778 {
3779 MACH_PORT_FACE release;
3780 kern_return_t kr;
3781
3782 PSL_LOCK();
3783 if (flags == HI_WAT_ALERT) {
3784 release = min_pages_trigger_port;
3785 min_pages_trigger_port = trigger_port;
3786 minimum_pages_remaining = hi_wat/vm_page_size;
3787 bs_low = FALSE;
3788 kr = KERN_SUCCESS;
3789 } else if (flags == LO_WAT_ALERT) {
3790 release = max_pages_trigger_port;
3791 max_pages_trigger_port = trigger_port;
3792 maximum_pages_free = lo_wat/vm_page_size;
3793 kr = KERN_SUCCESS;
3794 } else {
3795 release = trigger_port;
3796 kr = KERN_INVALID_ARGUMENT;
3797 }
3798 PSL_UNLOCK();
3799
3800 if (IP_VALID(release))
3801 ipc_port_release_send(release);
3802
3803 return kr;
3804 }