[apple/xnu.git] / osfmk / default_pager / dp_backing_store.c (xnu-517.9.5)
1 /*
2 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * @OSF_COPYRIGHT@
24 */
25 /*
26 * Mach Operating System
27 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
28 * All Rights Reserved.
29 *
30 * Permission to use, copy, modify and distribute this software and its
31 * documentation is hereby granted, provided that both the copyright
32 * notice and this permission notice appear in all copies of the
33 * software, derivative works or modified versions, and any portions
34 * thereof, and that both notices appear in supporting documentation.
35 *
36 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
37 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
38 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
39 *
40 * Carnegie Mellon requests users of this software to return to
41 *
42 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
43 * School of Computer Science
44 * Carnegie Mellon University
45 * Pittsburgh PA 15213-3890
46 *
47 * any improvements or extensions that they make and grant Carnegie Mellon
48 * the rights to redistribute these changes.
49 */
50
51 /*
52 * Default Pager.
53 * Paging File Management.
54 */
55
56 #include <mach/memory_object_control.h>
57 #include <mach/memory_object_server.h>
58 #include "default_pager_internal.h"
59 #include <default_pager/default_pager_alerts.h>
60 #include <ipc/ipc_port.h>
61 #include <ipc/ipc_space.h>
62 #include <kern/queue.h>
63 #include <kern/counters.h>
64 #include <kern/sched_prim.h>
65 #include <vm/vm_kern.h>
66 #include <vm/vm_pageout.h>
67 /* CDY CDY */
68 #include <vm/vm_map.h>
69
70 /*
71 * ALLOC_STRIDE... the maximum number of bytes allocated from
72 * a swap file before moving on to the next swap file... if
73 * all swap files reside on a single disk, this value should
74 * be very large (this is the default assumption)... if the
75 * swap files are spread across multiple disks, then this value
76 * should be small (128 * 1024)...
77 *
78 * This should be determined dynamically in the future
79 */
80
81 #define ALLOC_STRIDE (1024 * 1024 * 1024)
82 int physical_transfer_cluster_count = 0;
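
/*
 * ps_select_segment() implements the stride by bumping
 * physical_transfer_cluster_count on each allocation and advancing to
 * the next segment once the count reaches
 * ALLOC_STRIDE >> (ps_clshift + vm_page_shift), i.e. once roughly
 * ALLOC_STRIDE bytes worth of clusters have come from the current
 * segment.
 */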
83
84 #define VM_SUPER_CLUSTER 0x40000
85 #define VM_SUPER_PAGES 64
86
87 /*
88 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
89 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
90 */
91 #define VSTRUCT_DEF_CLSHIFT 2
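/*
 * Worked example (assuming the usual 4KB VM page): with
 * VSTRUCT_DEF_CLSHIFT == 2, a cluster covers (1 << 2) == 4 pages,
 * i.e. 16KB of backing store.
 */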
92 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
93 int default_pager_clsize = 0;
94
95 /* statistics */
96 unsigned int clustered_writes[VM_SUPER_PAGES+1];
97 unsigned int clustered_reads[VM_SUPER_PAGES+1];
98
99 /*
100 * Globals used for asynchronous paging operations:
101 * vs_async_list: head of list of to-be-completed I/O ops
102 * async_num_queued: number of pages completed, but not yet
103 * processed by async thread.
104 * async_requests_out: number of pages of requests not completed.
105 */
106
107 #if 0
108 struct vs_async *vs_async_list;
109 int async_num_queued;
110 int async_requests_out;
111 #endif
112
113
114 #define VS_ASYNC_REUSE 1
115 struct vs_async *vs_async_free_list;
116
117 mutex_t default_pager_async_lock; /* Protects globals above */
118
119
120 int vs_alloc_async_failed = 0; /* statistics */
121 int vs_alloc_async_count = 0; /* statistics */
122 struct vs_async *vs_alloc_async(void); /* forward */
123 void vs_free_async(struct vs_async *vsa); /* forward */
124
125
126 #define VS_ALLOC_ASYNC() vs_alloc_async()
127 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
128
129 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
130 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
131 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \
132 ETAP_IO_DEV_PAGEH)
133 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
134 /*
135 * Paging Space Hysteresis triggers and the target notification port
136 *
137 */
138
139 unsigned int minimum_pages_remaining = 0;
140 unsigned int maximum_pages_free = 0;
141 ipc_port_t min_pages_trigger_port = NULL;
142 ipc_port_t max_pages_trigger_port = NULL;
143
144 boolean_t bs_low = FALSE;
145 int backing_store_release_trigger_disable = 0;
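
/*
 * When a cluster allocation drives dp_pages_free below
 * minimum_pages_remaining, the allocating path takes
 * min_pages_trigger_port (clearing it) and sends a HI_WAT_ALERT via
 * default_pager_space_alert(); some of those paths also set bs_low.
 * See ps_select_segment() and ps_allocate_cluster().
 */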
146
147
148
149 /*
150 * Object sizes are rounded up to the next power of 2,
151 * unless they are bigger than a given maximum size.
152 */
153 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
154
155 /*
156 * List of all backing store and segments.
157 */
158 struct backing_store_list_head backing_store_list;
159 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
160 mutex_t paging_segments_lock;
161 int paging_segment_max = 0;
162 int paging_segment_count = 0;
163 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
164
165
166 /*
167 * Total pages free in system
168 * This differs from clusters committed/avail, which is a measure of the
169 * over-commitment of paging segments to backing store, an idea which is
170 * likely to be deprecated.
171 */
172 unsigned int dp_pages_free = 0;
173 unsigned int cluster_transfer_minimum = 100;
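
/*
 * cluster_transfer_minimum is the free-page floor for segment-delete
 * transfers: ps_delete() and default_pager_backing_store_delete()
 * refuse to drain a segment unless dp_pages_free can cover the
 * transfer (see the dp_pages_free checks in those routines).
 */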
174
175 kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */
176 kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
177
178
179 default_pager_thread_t *
180 get_read_buffer()
181 {
182 int i;
183
184 DPT_LOCK(dpt_lock);
185 while(TRUE) {
186 for (i=0; i<default_pager_internal_count; i++) {
187 if(dpt_array[i]->checked_out == FALSE) {
188 dpt_array[i]->checked_out = TRUE;
189 DPT_UNLOCK(dpt_lock);
190 return dpt_array[i];
191 }
192 }
193 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
194 }
195 }
196
197 void
198 bs_initialize(void)
199 {
200 int i;
201
202 /*
203 * List of all backing store.
204 */
205 BSL_LOCK_INIT();
206 queue_init(&backing_store_list.bsl_queue);
207 PSL_LOCK_INIT();
208
209 VS_ASYNC_LOCK_INIT();
210 #if VS_ASYNC_REUSE
211 vs_async_free_list = NULL;
212 #endif /* VS_ASYNC_REUSE */
213
214 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
215 clustered_writes[i] = 0;
216 clustered_reads[i] = 0;
217 }
218
219 }
220
221 /*
222 * When things do not quite work out...
223 */
224 void bs_no_paging_space(boolean_t); /* forward */
225
226 void
227 bs_no_paging_space(
228 boolean_t out_of_memory)
229 {
230
231 if (out_of_memory)
232 dprintf(("*** OUT OF MEMORY ***\n"));
233 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
234 }
235
236 void bs_more_space(int); /* forward */
237 void bs_commit(int); /* forward */
238
239 boolean_t user_warned = FALSE;
240 unsigned int clusters_committed = 0;
241 unsigned int clusters_available = 0;
242 unsigned int clusters_committed_peak = 0;
243
244 void
245 bs_more_space(
246 int nclusters)
247 {
248 BSL_LOCK();
249 /*
250 * Account for new paging space.
251 */
252 clusters_available += nclusters;
253
254 if (clusters_available >= clusters_committed) {
255 if (verbose && user_warned) {
256 printf("%s%s - %d excess clusters now.\n",
257 my_name,
258 "paging space is OK now",
259 clusters_available - clusters_committed);
260 user_warned = FALSE;
261 clusters_committed_peak = 0;
262 }
263 } else {
264 if (verbose && user_warned) {
265 printf("%s%s - still short of %d clusters.\n",
266 my_name,
267 "WARNING: paging space over-committed",
268 clusters_committed - clusters_available);
269 clusters_committed_peak -= nclusters;
270 }
271 }
272 BSL_UNLOCK();
273
274 return;
275 }
276
277 void
278 bs_commit(
279 int nclusters)
280 {
281 BSL_LOCK();
282 clusters_committed += nclusters;
283 if (clusters_committed > clusters_available) {
284 if (verbose && !user_warned) {
285 user_warned = TRUE;
286 printf("%s%s - short of %d clusters.\n",
287 my_name,
288 "WARNING: paging space over-committed",
289 clusters_committed - clusters_available);
290 }
291 if (clusters_committed > clusters_committed_peak) {
292 clusters_committed_peak = clusters_committed;
293 }
294 } else {
295 if (verbose && user_warned) {
296 printf("%s%s - was short of up to %d clusters.\n",
297 my_name,
298 "paging space is OK now",
299 clusters_committed_peak - clusters_available);
300 user_warned = FALSE;
301 clusters_committed_peak = 0;
302 }
303 }
304 BSL_UNLOCK();
305
306 return;
307 }
308
309 int default_pager_info_verbose = 1;
310
311 void
312 bs_global_info(
313 vm_size_t *totalp,
314 vm_size_t *freep)
315 {
316 vm_size_t pages_total, pages_free;
317 paging_segment_t ps;
318 int i;
319
320 PSL_LOCK();
321 pages_total = pages_free = 0;
322 for (i = 0; i <= paging_segment_max; i++) {
323 ps = paging_segments[i];
324 if (ps == PAGING_SEGMENT_NULL)
325 continue;
326
327 /*
328 * no need to lock: by the time this data
329 * gets back to any remote requestor it
330 * will be obsolete anyways
331 */
332 pages_total += ps->ps_pgnum;
333 pages_free += ps->ps_clcount << ps->ps_clshift;
334 DEBUG(DEBUG_BS_INTERNAL,
335 ("segment #%d: %d total, %d free\n",
336 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
337 }
338 *totalp = pages_total;
339 *freep = pages_free;
340 if (verbose && user_warned && default_pager_info_verbose) {
341 if (clusters_available < clusters_committed) {
342 printf("%s %d clusters committed, %d available.\n",
343 my_name,
344 clusters_committed,
345 clusters_available);
346 }
347 }
348 PSL_UNLOCK();
349 }
350
351 backing_store_t backing_store_alloc(void); /* forward */
352
353 backing_store_t
354 backing_store_alloc(void)
355 {
356 backing_store_t bs;
357
358 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
359 if (bs == BACKING_STORE_NULL)
360 panic("backing_store_alloc: no memory");
361
362 BS_LOCK_INIT(bs);
363 bs->bs_port = MACH_PORT_NULL;
364 bs->bs_priority = 0;
365 bs->bs_clsize = 0;
366 bs->bs_pages_total = 0;
367 bs->bs_pages_in = 0;
368 bs->bs_pages_in_fail = 0;
369 bs->bs_pages_out = 0;
370 bs->bs_pages_out_fail = 0;
371
372 return bs;
373 }
374
375 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
376
377 /* In both the component space and external versions of this pager, */
378 /* backing_store_lookup will be called from tasks in the application space. */
379 backing_store_t
380 backing_store_lookup(
381 MACH_PORT_FACE port)
382 {
383 backing_store_t bs;
384
385 /*
386 port is currently backed with a vs structure in the alias field;
387 we could create an ISBS alias and a port_is_bs call, but frankly
388 I see no reason for the test: the bs->bs_port == port check below
389 will work properly on junk entries.
390
391 if ((port == MACH_PORT_NULL) || port_is_vs(port))
392 */
393 if ((port == MACH_PORT_NULL))
394 return BACKING_STORE_NULL;
395
396 BSL_LOCK();
397 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
398 bs_links) {
399 BS_LOCK(bs);
400 if (bs->bs_port == port) {
401 BSL_UNLOCK();
402 /* Success, return it locked. */
403 return bs;
404 }
405 BS_UNLOCK(bs);
406 }
407 BSL_UNLOCK();
408 return BACKING_STORE_NULL;
409 }
410
411 void backing_store_add(backing_store_t); /* forward */
412
413 void
414 backing_store_add(
415 backing_store_t bs)
416 {
417 MACH_PORT_FACE port = bs->bs_port;
418 MACH_PORT_FACE pset = default_pager_default_set;
419 kern_return_t kr = KERN_SUCCESS;
420
421 if (kr != KERN_SUCCESS)
422 panic("backing_store_add: add to set");
423
424 }
425
426 /*
427 * Set up the default cluster shift, but only if not already
428 * set and argument is within range.
429 */
430 boolean_t
431 bs_set_default_clsize(unsigned int npages)
432 {
433 switch(npages){
434 case 1:
435 case 2:
436 case 4:
437 case 8:
438 if (default_pager_clsize == 0) /* if not yet set */
439 vstruct_def_clshift = local_log2(npages);
440 return(TRUE);
441 }
442 return(FALSE);
443 }
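/*
 * For example, bs_set_default_clsize(4) sets vstruct_def_clshift to
 * local_log2(4) == 2, provided default_pager_clsize has not yet been
 * fixed; any npages value other than 1, 2, 4 or 8 is rejected and
 * FALSE is returned.
 */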
444
445 int bs_get_global_clsize(int clsize); /* forward */
446
447 int
448 bs_get_global_clsize(
449 int clsize)
450 {
451 int i;
452 memory_object_default_t dmm;
453 kern_return_t kr;
454
455 /*
456 * Only allow setting of cluster size once. If called
457 * with no cluster size (default), we use the compiled-in default
458 * for the duration. The same cluster size is used for all
459 * paging segments.
460 */
461 if (default_pager_clsize == 0) {
462 /*
463 * Keep cluster size in bit shift because it's quicker
464 * arithmetic, and easier to keep at a power of 2.
465 */
466 if (clsize != NO_CLSIZE) {
467 for (i = 0; (1 << i) < clsize; i++);
468 if (i > MAX_CLUSTER_SHIFT)
469 i = MAX_CLUSTER_SHIFT;
470 vstruct_def_clshift = i;
471 }
472 default_pager_clsize = (1 << vstruct_def_clshift);
473
474 /*
475 * Let the user know the new (and definitive) cluster size.
476 */
477 if (verbose)
478 printf("%scluster size = %d page%s\n",
479 my_name, default_pager_clsize,
480 (default_pager_clsize == 1) ? "" : "s");
481
482 /*
483 * Let the kernel know too, in case it hasn't used the
484 * default value provided in main() yet.
485 */
486 dmm = default_pager_object;
487 clsize = default_pager_clsize * vm_page_size; /* in bytes */
488 kr = host_default_memory_manager(host_priv_self(),
489 &dmm,
490 clsize);
491 memory_object_default_deallocate(dmm);
492
493 if (kr != KERN_SUCCESS) {
494 panic("bs_get_global_cl_size:host_default_memory_manager");
495 }
496 if (dmm != default_pager_object) {
497 panic("bs_get_global_cl_size:there is another default pager");
498 }
499 }
500 ASSERT(default_pager_clsize > 0 &&
501 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
502
503 return default_pager_clsize;
504 }
505
506 kern_return_t
507 default_pager_backing_store_create(
508 memory_object_default_t pager,
509 int priority,
510 int clsize, /* in bytes */
511 MACH_PORT_FACE *backing_store)
512 {
513 backing_store_t bs;
514 MACH_PORT_FACE port;
515 kern_return_t kr;
516 struct vstruct_alias *alias_struct;
517
518 if (pager != default_pager_object)
519 return KERN_INVALID_ARGUMENT;
520
521 bs = backing_store_alloc();
522 port = ipc_port_alloc_kernel();
523 ipc_port_make_send(port);
524 assert (port != IP_NULL);
525
526 DEBUG(DEBUG_BS_EXTERNAL,
527 ("priority=%d clsize=%d bs_port=0x%x\n",
528 priority, clsize, (int) backing_store));
529
530 alias_struct = (struct vstruct_alias *)
531 kalloc(sizeof (struct vstruct_alias));
532 if(alias_struct != NULL) {
533 alias_struct->vs = (struct vstruct *)bs;
534 alias_struct->name = ISVS;
535 port->alias = (int) alias_struct;
536 }
537 else {
538 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
539 kfree((vm_offset_t)bs, sizeof (struct backing_store));
540 return KERN_RESOURCE_SHORTAGE;
541 }
542
543 bs->bs_port = port;
544 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
545 priority = BS_MAXPRI;
546 else if (priority == BS_NOPRI)
547 priority = BS_MAXPRI;
548 else
549 priority = BS_MINPRI;
550 bs->bs_priority = priority;
551
552 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
553
554 BSL_LOCK();
555 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
556 bs_links);
557 BSL_UNLOCK();
558
559 backing_store_add(bs);
560
561 *backing_store = port;
562 return KERN_SUCCESS;
563 }
564
565 kern_return_t
566 default_pager_backing_store_info(
567 MACH_PORT_FACE backing_store,
568 backing_store_flavor_t flavour,
569 backing_store_info_t info,
570 mach_msg_type_number_t *size)
571 {
572 backing_store_t bs;
573 backing_store_basic_info_t basic;
574 int i;
575 paging_segment_t ps;
576
577 if (flavour != BACKING_STORE_BASIC_INFO ||
578 *size < BACKING_STORE_BASIC_INFO_COUNT)
579 return KERN_INVALID_ARGUMENT;
580
581 basic = (backing_store_basic_info_t)info;
582 *size = BACKING_STORE_BASIC_INFO_COUNT;
583
584 VSTATS_LOCK(&global_stats.gs_lock);
585 basic->pageout_calls = global_stats.gs_pageout_calls;
586 basic->pagein_calls = global_stats.gs_pagein_calls;
587 basic->pages_in = global_stats.gs_pages_in;
588 basic->pages_out = global_stats.gs_pages_out;
589 basic->pages_unavail = global_stats.gs_pages_unavail;
590 basic->pages_init = global_stats.gs_pages_init;
591 basic->pages_init_writes= global_stats.gs_pages_init_writes;
592 VSTATS_UNLOCK(&global_stats.gs_lock);
593
594 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
595 return KERN_INVALID_ARGUMENT;
596
597 basic->bs_pages_total = bs->bs_pages_total;
598 PSL_LOCK();
599 bs->bs_pages_free = 0;
600 for (i = 0; i <= paging_segment_max; i++) {
601 ps = paging_segments[i];
602 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
603 PS_LOCK(ps);
604 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
605 PS_UNLOCK(ps);
606 }
607 }
608 PSL_UNLOCK();
609 basic->bs_pages_free = bs->bs_pages_free;
610 basic->bs_pages_in = bs->bs_pages_in;
611 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
612 basic->bs_pages_out = bs->bs_pages_out;
613 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
614
615 basic->bs_priority = bs->bs_priority;
616 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
617
618 BS_UNLOCK(bs);
619
620 return KERN_SUCCESS;
621 }
622
623 int ps_delete(paging_segment_t); /* forward */
624
625 int
626 ps_delete(
627 paging_segment_t ps)
628 {
629 vstruct_t vs;
630 kern_return_t error = KERN_SUCCESS;
631 int vs_count;
632
633 VSL_LOCK(); /* get the lock on the list of vs's */
634
635 /* The lock relationship and sequence are fairly complicated. */
636 /* this code looks at a live list, locking and unlocking the list */
637 /* as it traverses it. It depends on the locking behavior of */
638 /* default_pager_no_senders. no_senders always locks the vstruct */
639 /* targeted for removal before locking the vstruct list. However */
640 /* it will remove that member of the list without locking its */
641 /* neighbors. We can be sure when we hold a lock on a vstruct */
642 /* it cannot be removed from the list but we must hold the list */
643 /* lock to be sure that its pointers to its neighbors are valid. */
644 /* Also, we can hold off destruction of a vstruct when the list */
645 /* lock and the vs locks are not being held by bumping the */
646 /* vs_async_pending count. */
647
648
649 while(backing_store_release_trigger_disable != 0) {
650 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
651 }
652
653 /* we will choose instead to hold a send right */
654 vs_count = vstruct_list.vsl_count;
655 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
656 if(vs == (vstruct_t)&vstruct_list) {
657 VSL_UNLOCK();
658 return KERN_SUCCESS;
659 }
660 VS_LOCK(vs);
661 vs_async_wait(vs); /* wait for any pending async writes */
662 if ((vs_count != 0) && (vs != NULL))
663 vs->vs_async_pending += 1; /* hold parties calling */
664 /* vs_async_wait */
665 VS_UNLOCK(vs);
666 VSL_UNLOCK();
667 while((vs_count != 0) && (vs != NULL)) {
668 /* We take the count of AMO's before beginning the */
669 /* transfer of the target segment. */
670 /* We are guaranteed that the target segment cannot get */
671 /* more users. We also know that queue entries are */
672 /* made at the back of the list. If some of the entries */
673 /* we would check disappear while we are traversing the */
674 /* list then we will either check new entries which */
675 /* do not have any backing store in the target segment */
676 /* or re-check old entries. This might not be optimal */
677 /* but it will always be correct. The alternative is to */
678 /* take a snapshot of the list. */
679 vstruct_t next_vs;
680
681 if(dp_pages_free < cluster_transfer_minimum)
682 error = KERN_FAILURE;
683 else {
684 vm_object_t transfer_object;
685 int count;
686 upl_t upl;
687
688 transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
689 count = 0;
690 error = vm_object_upl_request(transfer_object,
691 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
692 &upl, NULL, &count,
693 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
694 | UPL_SET_INTERNAL);
695 if(error == KERN_SUCCESS) {
696 error = ps_vstruct_transfer_from_segment(
697 vs, ps, upl);
698 upl_commit(upl, NULL);
699 upl_deallocate(upl);
700 } else {
701 error = KERN_FAILURE;
702 }
703 vm_object_deallocate(transfer_object);
704 }
705 if(error) {
706 VS_LOCK(vs);
707 vs->vs_async_pending -= 1; /* release vs_async_wait */
708 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
709 vs->vs_waiting_async = FALSE;
710 VS_UNLOCK(vs);
711 thread_wakeup(&vs->vs_async_pending);
712 } else {
713 VS_UNLOCK(vs);
714 }
715 return KERN_FAILURE;
716 }
717
718 VSL_LOCK();
719
720 while(backing_store_release_trigger_disable != 0) {
721 VSL_SLEEP(&backing_store_release_trigger_disable,
722 THREAD_UNINT);
723 }
724
725 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
726 if((next_vs != (vstruct_t)&vstruct_list) &&
727 (vs != next_vs) && (vs_count != 1)) {
728 VS_LOCK(next_vs);
729 vs_async_wait(next_vs); /* wait for any */
730 /* pending async writes */
731 next_vs->vs_async_pending += 1; /* hold parties */
732 /* calling vs_async_wait */
733 VS_UNLOCK(next_vs);
734 }
735 VSL_UNLOCK();
736 VS_LOCK(vs);
737 vs->vs_async_pending -= 1;
738 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
739 vs->vs_waiting_async = FALSE;
740 VS_UNLOCK(vs);
741 thread_wakeup(&vs->vs_async_pending);
742 } else {
743 VS_UNLOCK(vs);
744 }
745 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
746 vs = NULL;
747 else
748 vs = next_vs;
749 vs_count--;
750 }
751 return KERN_SUCCESS;
752 }
753
754
755 kern_return_t
756 default_pager_backing_store_delete(
757 MACH_PORT_FACE backing_store)
758 {
759 backing_store_t bs;
760 int i;
761 paging_segment_t ps;
762 int error;
763 int interim_pages_removed = 0;
764 kern_return_t kr;
765
766 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
767 return KERN_INVALID_ARGUMENT;
768
769 #if 0
770 /* not implemented */
771 BS_UNLOCK(bs);
772 return KERN_FAILURE;
773 #endif
774
775 restart:
776 PSL_LOCK();
777 error = KERN_SUCCESS;
778 for (i = 0; i <= paging_segment_max; i++) {
779 ps = paging_segments[i];
780 if (ps != PAGING_SEGMENT_NULL &&
781 ps->ps_bs == bs &&
782 ! ps->ps_going_away) {
783 PS_LOCK(ps);
784 /* disable access to this segment */
785 ps->ps_going_away = TRUE;
786 PS_UNLOCK(ps);
787 /*
788 * The "ps" segment is "off-line" now,
789 * we can try and delete it...
790 */
791 if(dp_pages_free < (cluster_transfer_minimum
792 + ps->ps_pgcount)) {
793 error = KERN_FAILURE;
794 PSL_UNLOCK();
795 }
796 else {
797 /* remove all pages associated with the */
798 /* segment from the list of free pages */
799 /* when transfer is through, all target */
800 /* segment pages will appear to be free */
801
802 dp_pages_free -= ps->ps_pgcount;
803 interim_pages_removed += ps->ps_pgcount;
804 PSL_UNLOCK();
805 error = ps_delete(ps);
806 }
807 if (error != KERN_SUCCESS) {
808 /*
809 * We couldn't delete the segment,
810 * probably because there's not enough
811 * virtual memory left.
812 * Re-enable all the segments.
813 */
814 PSL_LOCK();
815 break;
816 }
817 goto restart;
818 }
819 }
820
821 if (error != KERN_SUCCESS) {
822 for (i = 0; i <= paging_segment_max; i++) {
823 ps = paging_segments[i];
824 if (ps != PAGING_SEGMENT_NULL &&
825 ps->ps_bs == bs &&
826 ps->ps_going_away) {
827 PS_LOCK(ps);
828 /* re-enable access to this segment */
829 ps->ps_going_away = FALSE;
830 PS_UNLOCK(ps);
831 }
832 }
833 dp_pages_free += interim_pages_removed;
834 PSL_UNLOCK();
835 BS_UNLOCK(bs);
836 return error;
837 }
838
839 for (i = 0; i <= paging_segment_max; i++) {
840 ps = paging_segments[i];
841 if (ps != PAGING_SEGMENT_NULL &&
842 ps->ps_bs == bs) {
843 if(ps->ps_going_away) {
844 paging_segments[i] = PAGING_SEGMENT_NULL;
845 paging_segment_count--;
846 PS_LOCK(ps);
847 kfree((vm_offset_t)ps->ps_bmap,
848 RMAPSIZE(ps->ps_ncls));
849 kfree((vm_offset_t)ps, sizeof *ps);
850 }
851 }
852 }
853
854 /* Scan the entire ps array separately to make certain we find the */
855 /* proper paging_segment_max */
856 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
857 if(paging_segments[i] != PAGING_SEGMENT_NULL)
858 paging_segment_max = i;
859 }
860
861 PSL_UNLOCK();
862
863 /*
864 * All the segments have been deleted.
865 * We can remove the backing store.
866 */
867
868 /*
869 * Disable lookups of this backing store.
870 */
871 if((void *)bs->bs_port->alias != NULL)
872 kfree((vm_offset_t) bs->bs_port->alias,
873 sizeof (struct vstruct_alias));
874 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
875 bs->bs_port = MACH_PORT_NULL;
876 BS_UNLOCK(bs);
877
878 /*
879 * Remove backing store from backing_store list.
880 */
881 BSL_LOCK();
882 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
883 bs_links);
884 BSL_UNLOCK();
885
886 /*
887 * Free the backing store structure.
888 */
889 kfree((vm_offset_t)bs, sizeof *bs);
890
891 return KERN_SUCCESS;
892 }
893
894 int ps_enter(paging_segment_t); /* forward */
895
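/*
 * ps_enter places a new paging segment in the first free slot of
 * paging_segments[].  It returns 0 on success (resetting the
 * priority's ps_select_array entry if it was BS_NOPRI or BS_FULLPRI),
 * or KERN_RESOURCE_SHORTAGE if the array is full.
 */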
896 int
897 ps_enter(
898 paging_segment_t ps)
899 {
900 int i;
901
902 PSL_LOCK();
903
904 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
905 if (paging_segments[i] == PAGING_SEGMENT_NULL)
906 break;
907 }
908
909 if (i < MAX_NUM_PAGING_SEGMENTS) {
910 paging_segments[i] = ps;
911 if (i > paging_segment_max)
912 paging_segment_max = i;
913 paging_segment_count++;
914 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
915 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
916 ps_select_array[ps->ps_bs->bs_priority] = 0;
917 i = 0;
918 } else {
919 PSL_UNLOCK();
920 return KERN_RESOURCE_SHORTAGE;
921 }
922
923 PSL_UNLOCK();
924 return i;
925 }
926
927 #ifdef DEVICE_PAGING
928 kern_return_t
929 default_pager_add_segment(
930 MACH_PORT_FACE backing_store,
931 MACH_PORT_FACE device,
932 recnum_t offset,
933 recnum_t count,
934 int record_size)
935 {
936 backing_store_t bs;
937 paging_segment_t ps;
938 int i;
939 int error;
940
941 if ((bs = backing_store_lookup(backing_store))
942 == BACKING_STORE_NULL)
943 return KERN_INVALID_ARGUMENT;
944
945 PSL_LOCK();
946 for (i = 0; i <= paging_segment_max; i++) {
947 ps = paging_segments[i];
948 if (ps == PAGING_SEGMENT_NULL)
949 continue;
950
951 /*
952 * Check for overlap on same device.
953 */
954 if (!(ps->ps_device != device
955 || offset >= ps->ps_offset + ps->ps_recnum
956 || offset + count <= ps->ps_offset)) {
957 PSL_UNLOCK();
958 BS_UNLOCK(bs);
959 return KERN_INVALID_ARGUMENT;
960 }
961 }
962 PSL_UNLOCK();
963
964 /*
965 * Set up the paging segment
966 */
967 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
968 if (ps == PAGING_SEGMENT_NULL) {
969 BS_UNLOCK(bs);
970 return KERN_RESOURCE_SHORTAGE;
971 }
972
973 ps->ps_segtype = PS_PARTITION;
974 ps->ps_device = device;
975 ps->ps_offset = offset;
976 ps->ps_record_shift = local_log2(vm_page_size / record_size);
977 ps->ps_recnum = count;
978 ps->ps_pgnum = count >> ps->ps_record_shift;
979
980 ps->ps_pgcount = ps->ps_pgnum;
981 ps->ps_clshift = local_log2(bs->bs_clsize);
982 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
983 ps->ps_hint = 0;
984
985 PS_LOCK_INIT(ps);
986 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
987 if (!ps->ps_bmap) {
988 kfree((vm_offset_t)ps, sizeof *ps);
989 BS_UNLOCK(bs);
990 return KERN_RESOURCE_SHORTAGE;
991 }
992 for (i = 0; i < ps->ps_ncls; i++) {
993 clrbit(ps->ps_bmap, i);
994 }
995
996 ps->ps_going_away = FALSE;
997 ps->ps_bs = bs;
998
999 if ((error = ps_enter(ps)) != 0) {
1000 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1001 kfree((vm_offset_t)ps, sizeof *ps);
1002 BS_UNLOCK(bs);
1003 return KERN_RESOURCE_SHORTAGE;
1004 }
1005
1006 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1007 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1008 BS_UNLOCK(bs);
1009
1010 PSL_LOCK();
1011 dp_pages_free += ps->ps_pgcount;
1012 PSL_UNLOCK();
1013
1014 bs_more_space(ps->ps_clcount);
1015
1016 DEBUG(DEBUG_BS_INTERNAL,
1017 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1018 device, offset, count, record_size,
1019 ps->ps_record_shift, ps->ps_pgnum));
1020
1021 return KERN_SUCCESS;
1022 }
1023
1024 boolean_t
1025 bs_add_device(
1026 char *dev_name,
1027 MACH_PORT_FACE master)
1028 {
1029 security_token_t null_security_token = {
1030 { 0, 0 }
1031 };
1032 MACH_PORT_FACE device;
1033 int info[DEV_GET_SIZE_COUNT];
1034 mach_msg_type_number_t info_count;
1035 MACH_PORT_FACE bs = MACH_PORT_NULL;
1036 unsigned int rec_size;
1037 recnum_t count;
1038 int clsize;
1039 MACH_PORT_FACE reply_port;
1040
1041 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1042 null_security_token, dev_name, &device))
1043 return FALSE;
1044
1045 info_count = DEV_GET_SIZE_COUNT;
1046 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1047 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1048 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1049 clsize = bs_get_global_clsize(0);
1050 if (!default_pager_backing_store_create(
1051 default_pager_object,
1052 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1053 (clsize * vm_page_size),
1054 &bs)) {
1055 if (!default_pager_add_segment(bs, device,
1056 0, count, rec_size)) {
1057 return TRUE;
1058 }
1059 ipc_port_release_receive(bs);
1060 }
1061 }
1062
1063 ipc_port_release_send(device);
1064 return FALSE;
1065 }
1066 #endif /* DEVICE_PAGING */
1067
1068 #if VS_ASYNC_REUSE
1069
1070 struct vs_async *
1071 vs_alloc_async(void)
1072 {
1073 struct vs_async *vsa;
1074 MACH_PORT_FACE reply_port;
1075 kern_return_t kr;
1076
1077 VS_ASYNC_LOCK();
1078 if (vs_async_free_list == NULL) {
1079 VS_ASYNC_UNLOCK();
1080 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1081 if (vsa != NULL) {
1082 /*
1083 * Try allocating a reply port named after the
1084 * address of the vs_async structure.
1085 */
1086 struct vstruct_alias *alias_struct;
1087
1088 reply_port = ipc_port_alloc_kernel();
1089 alias_struct = (struct vstruct_alias *)
1090 kalloc(sizeof (struct vstruct_alias));
1091 if(alias_struct != NULL) {
1092 alias_struct->vs = (struct vstruct *)vsa;
1093 alias_struct->name = ISVS;
1094 reply_port->alias = (int) alias_struct;
1095 vsa->reply_port = reply_port;
1096 vs_alloc_async_count++;
1097 }
1098 else {
1099 vs_alloc_async_failed++;
1100 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1101 (reply_port));
1102 kfree((vm_offset_t)vsa,
1103 sizeof (struct vs_async));
1104 vsa = NULL;
1105 }
1106 }
1107 } else {
1108 vsa = vs_async_free_list;
1109 vs_async_free_list = vs_async_free_list->vsa_next;
1110 VS_ASYNC_UNLOCK();
1111 }
1112
1113 return vsa;
1114 }
1115
1116 void
1117 vs_free_async(
1118 struct vs_async *vsa)
1119 {
1120 VS_ASYNC_LOCK();
1121 vsa->vsa_next = vs_async_free_list;
1122 vs_async_free_list = vsa;
1123 VS_ASYNC_UNLOCK();
1124 }
1125
1126 #else /* VS_ASYNC_REUSE */
1127
1128 struct vs_async *
1129 vs_alloc_async(void)
1130 {
1131 struct vs_async *vsa;
1132 MACH_PORT_FACE reply_port;
1133 kern_return_t kr;
1134
1135 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1136 	if (vsa != NULL) {
		struct vstruct_alias *alias_struct;

1137 /*
1138 * Try allocating a reply port named after the
1139 * address of the vs_async structure.
1140 */
1141 reply_port = ipc_port_alloc_kernel();
1142 		alias_struct = (struct vstruct_alias *)
1143 			kalloc(sizeof (struct vstruct_alias));
1144 		if(alias_struct != NULL) {
1145 			alias_struct->vs = (struct vstruct *)vsa;
1146 			alias_struct->name = ISVS;
1147 			reply_port->alias = (int) alias_struct;
1148 vsa->reply_port = reply_port;
1149 vs_alloc_async_count++;
1150 }
1151 else {
1152 vs_alloc_async_failed++;
1153 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1154 (reply_port));
1155 kfree((vm_offset_t) vsa,
1156 sizeof (struct vs_async));
1157 vsa = NULL;
1158 }
1159 }
1160
1161 return vsa;
1162 }
1163
1164 void
1165 vs_free_async(
1166 struct vs_async *vsa)
1167 {
1168 MACH_PORT_FACE reply_port;
1169 kern_return_t kr;
1170
1171 reply_port = vsa->reply_port;
1172 	kfree((vm_offset_t) reply_port->alias, sizeof (struct vstruct_alias));
1173 kfree((vm_offset_t) vsa, sizeof (struct vs_async));
1174 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1175 #if 0
1176 VS_ASYNC_LOCK();
1177 vs_alloc_async_count--;
1178 VS_ASYNC_UNLOCK();
1179 #endif
1180 }
1181
1182 #endif /* VS_ASYNC_REUSE */
1183
1184 zone_t vstruct_zone;
1185
1186 vstruct_t
1187 ps_vstruct_create(
1188 vm_size_t size)
1189 {
1190 vstruct_t vs;
1191 int i;
1192
1193 vs = (vstruct_t) zalloc(vstruct_zone);
1194 if (vs == VSTRUCT_NULL) {
1195 return VSTRUCT_NULL;
1196 }
1197
1198 VS_LOCK_INIT(vs);
1199
1200 /*
1201 * The following fields will be provided later.
1202 */
1203 vs->vs_mem_obj = NULL;
1204 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1205 vs->vs_references = 1;
1206 vs->vs_seqno = 0;
1207
1208 #ifdef MACH_KERNEL
1209 vs->vs_waiting_seqno = FALSE;
1210 vs->vs_waiting_read = FALSE;
1211 vs->vs_waiting_write = FALSE;
1212 vs->vs_waiting_async = FALSE;
1213 #else
1214 mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
1215 mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
1216 mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
1217 mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
1218 mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
1219 #endif
1220
1221 vs->vs_readers = 0;
1222 vs->vs_writers = 0;
1223
1224 vs->vs_errors = 0;
1225
1226 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1227 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1228 vs->vs_async_pending = 0;
1229
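	/*
	 * Worked example (assuming 4KB VM pages and the default cluster
	 * shift of 2): for size == 1MB, atop_32(round_page_32(size)) is
	 * 256 pages, so vs_size == ((256 - 1) >> 2) + 1 == 64 clusters.
	 */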
1230 /*
1231 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1232 * depending on the size of the memory object.
1233 */
1234 if (INDIRECT_CLMAP(vs->vs_size)) {
1235 vs->vs_imap = (struct vs_map **)
1236 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1237 vs->vs_indirect = TRUE;
1238 } else {
1239 vs->vs_dmap = (struct vs_map *)
1240 kalloc(CLMAP_SIZE(vs->vs_size));
1241 vs->vs_indirect = FALSE;
1242 }
1243 vs->vs_xfer_pending = FALSE;
1244 DEBUG(DEBUG_VS_INTERNAL,
1245 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1246
1247 /*
1248 * Check to see that we got the space.
1249 */
1250 if (!vs->vs_dmap) {
1251 kfree((vm_offset_t)vs, sizeof *vs);
1252 return VSTRUCT_NULL;
1253 }
1254
1255 /*
1256 * Zero the indirect pointers, or clear the direct pointers.
1257 */
1258 if (vs->vs_indirect)
1259 memset(vs->vs_imap, 0,
1260 INDIRECT_CLMAP_SIZE(vs->vs_size));
1261 else
1262 for (i = 0; i < vs->vs_size; i++)
1263 VSM_CLR(vs->vs_dmap[i]);
1264
1265 VS_MAP_LOCK_INIT(vs);
1266
1267 bs_commit(vs->vs_size);
1268
1269 return vs;
1270 }
1271
1272 paging_segment_t ps_select_segment(int, int *); /* forward */
1273
1274 paging_segment_t
1275 ps_select_segment(
1276 int shift,
1277 int *psindex)
1278 {
1279 paging_segment_t ps;
1280 int i;
1281 int j;
1282
1283 /*
1284 * Optimize case where there's only one segment.
1285 * paging_segment_max will index the one and only segment.
1286 */
1287
1288 PSL_LOCK();
1289 if (paging_segment_count == 1) {
1290 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1291 ipc_port_t trigger = IP_NULL;
1292
1293 ps = paging_segments[paging_segment_max];
1294 *psindex = paging_segment_max;
1295 PS_LOCK(ps);
1296 if (ps->ps_going_away) {
1297 /* this segment is being turned off */
1298 lps = PAGING_SEGMENT_NULL;
1299 } else {
1300 ASSERT(ps->ps_clshift >= shift);
1301 if (ps->ps_clcount) {
1302 ps->ps_clcount--;
1303 dp_pages_free -= 1 << ps->ps_clshift;
1304 if(min_pages_trigger_port &&
1305 (dp_pages_free < minimum_pages_remaining)) {
1306 trigger = min_pages_trigger_port;
1307 min_pages_trigger_port = NULL;
1308 bs_low = TRUE;
1309 }
1310 lps = ps;
1311 } else
1312 lps = PAGING_SEGMENT_NULL;
1313 }
1314 PS_UNLOCK(ps);
1315 PSL_UNLOCK();
1316
1317 if (trigger != IP_NULL) {
1318 default_pager_space_alert(trigger, HI_WAT_ALERT);
1319 ipc_port_release_send(trigger);
1320 }
1321 return lps;
1322 }
1323
1324 if (paging_segment_count == 0) {
1325 PSL_UNLOCK();
1326 return PAGING_SEGMENT_NULL;
1327 }
1328
1329 for (i = BS_MAXPRI;
1330 i >= BS_MINPRI; i--) {
1331 int start_index;
1332
1333 if ((ps_select_array[i] == BS_NOPRI) ||
1334 (ps_select_array[i] == BS_FULLPRI))
1335 continue;
1336 start_index = ps_select_array[i];
1337
1338 if(!(paging_segments[start_index])) {
1339 j = start_index+1;
1340 physical_transfer_cluster_count = 0;
1341 }
1342 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1343 (((paging_segments[start_index])->ps_clshift)
1344 + vm_page_shift))) {
1345 physical_transfer_cluster_count = 0;
1346 j = start_index + 1;
1347 } else {
1348 physical_transfer_cluster_count+=1;
1349 j = start_index;
1350 if(start_index == 0)
1351 start_index = paging_segment_max;
1352 else
1353 start_index = start_index - 1;
1354 }
1355
1356 while (1) {
1357 if (j > paging_segment_max)
1358 j = 0;
1359 if ((ps = paging_segments[j]) &&
1360 (ps->ps_bs->bs_priority == i)) {
1361 /*
1362 * Force the ps cluster size to be
1363 * >= that of the vstruct.
1364 */
1365 PS_LOCK(ps);
1366 if (ps->ps_going_away) {
1367 /* this segment is being turned off */
1368 } else if ((ps->ps_clcount) &&
1369 (ps->ps_clshift >= shift)) {
1370 ipc_port_t trigger = IP_NULL;
1371
1372 ps->ps_clcount--;
1373 dp_pages_free -= 1 << ps->ps_clshift;
1374 if(min_pages_trigger_port &&
1375 (dp_pages_free <
1376 minimum_pages_remaining)) {
1377 trigger = min_pages_trigger_port;
1378 min_pages_trigger_port = NULL;
1379 }
1380 PS_UNLOCK(ps);
1381 /*
1382 * found one, quit looking.
1383 */
1384 ps_select_array[i] = j;
1385 PSL_UNLOCK();
1386
1387 if (trigger != IP_NULL) {
1388 default_pager_space_alert(
1389 trigger,
1390 HI_WAT_ALERT);
1391 ipc_port_release_send(trigger);
1392 }
1393 *psindex = j;
1394 return ps;
1395 }
1396 PS_UNLOCK(ps);
1397 }
1398 if (j == start_index) {
1399 /*
1400 * none at this priority -- mark it full
1401 */
1402 ps_select_array[i] = BS_FULLPRI;
1403 break;
1404 }
1405 j++;
1406 }
1407 }
1408 PSL_UNLOCK();
1409 return PAGING_SEGMENT_NULL;
1410 }
1411
1412 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1413
1414 vm_offset_t
1415 ps_allocate_cluster(
1416 vstruct_t vs,
1417 int *psindex,
1418 paging_segment_t use_ps)
1419 {
1420 int byte_num;
1421 int bit_num = 0;
1422 paging_segment_t ps;
1423 vm_offset_t cluster;
1424 ipc_port_t trigger = IP_NULL;
1425
1426 /*
1427 * Find best paging segment.
1428 * ps_select_segment will decrement cluster count on ps.
1429 * Must pass cluster shift to find the most appropriate segment.
1430 */
1431 /* NOTE: The addition of paging segment delete capability threatened
1432 * to seriously complicate the treatment of paging segments in this
1433 * module and the ones that call it (notably ps_clmap), because of the
1434 * difficulty in assuring that the paging segment would continue to
1435 * exist between being unlocked and locked. This was
1436 * avoided because all calls to this module are based either in
1437 * dp_memory_object calls, which rely on the vs lock, or in
1438 * the transfer function, which is part of the segment delete path.
1439 * The transfer function which is part of paging segment delete is
1440 * protected from multiple callers by the backing store lock.
1441 * The paging segment delete function treats mappings to a paging
1442 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1443 * while data is transferred to the remaining segments. This is in
1444 * line with the view that incomplete or in-transition mappings between
1445 * data, a vstruct, and backing store are protected by the vs lock.
1446 * This and the ordering of the paging segment "going_away" bit setting
1447 * protect us.
1448 */
1449 if (use_ps != PAGING_SEGMENT_NULL) {
1450 ps = use_ps;
1451 PSL_LOCK();
1452 PS_LOCK(ps);
1453
1454 ASSERT(ps->ps_clcount != 0);
1455
1456 ps->ps_clcount--;
1457 dp_pages_free -= 1 << ps->ps_clshift;
1458 if(min_pages_trigger_port &&
1459 (dp_pages_free < minimum_pages_remaining)) {
1460 trigger = min_pages_trigger_port;
1461 min_pages_trigger_port = NULL;
1462 }
1463 PSL_UNLOCK();
1464 PS_UNLOCK(ps);
1465 if (trigger != IP_NULL) {
1466 default_pager_space_alert(trigger, HI_WAT_ALERT);
1467 ipc_port_release_send(trigger);
1468 }
1469
1470 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1471 PAGING_SEGMENT_NULL) {
1472 #if 0
1473 bs_no_paging_space(TRUE);
1474 #endif
1475 #if 0
1476 if (verbose)
1477 #endif
1478 dprintf(("no space in available paging segments; "
1479 "swapon suggested\n"));
1480 		/* the count may have drifted, so reset it to zero */
1481 PSL_LOCK();
1482 dp_pages_free = 0;
1483 if(min_pages_trigger_port) {
1484 trigger = min_pages_trigger_port;
1485 min_pages_trigger_port = NULL;
1486 bs_low = TRUE;
1487 }
1488 PSL_UNLOCK();
1489 if (trigger != IP_NULL) {
1490 default_pager_space_alert(trigger, HI_WAT_ALERT);
1491 ipc_port_release_send(trigger);
1492 }
1493 return (vm_offset_t) -1;
1494 }
1495
1496 /*
1497 * Look for an available cluster. At the end of the loop,
1498 * byte_num is the byte offset and bit_num is the bit offset of the
1499 * first zero bit in the paging segment bitmap.
1500 */
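	/*
	 * For example, with NBBY == 8: if ps_bmap[3] is the first byte
	 * that is not BYTEMASK and bit 5 is its first clear bit, the scan
	 * below yields cluster 3*8 + 5 == 29.
	 */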
1501 PS_LOCK(ps);
1502 byte_num = ps->ps_hint;
1503 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1504 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1505 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1506 if (isclr((ps->ps_bmap + byte_num), bit_num))
1507 break;
1508 }
1509 ASSERT(bit_num != NBBY);
1510 break;
1511 }
1512 }
1513 ps->ps_hint = byte_num;
1514 cluster = (byte_num*NBBY) + bit_num;
1515
1516 /* Space was reserved, so this must be true */
1517 ASSERT(cluster < ps->ps_ncls);
1518
1519 setbit(ps->ps_bmap, cluster);
1520 PS_UNLOCK(ps);
1521
1522 return cluster;
1523 }
1524
1525 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1526
1527 void
1528 ps_deallocate_cluster(
1529 paging_segment_t ps,
1530 vm_offset_t cluster)
1531 {
1532
1533 if (cluster >= (vm_offset_t) ps->ps_ncls)
1534 panic("ps_deallocate_cluster: Invalid cluster number");
1535
1536 /*
1537 * Lock the paging segment, clear the cluster's bit in the bitmap and
1538 * increment the number of free clusters.
1539 */
1540 PSL_LOCK();
1541 PS_LOCK(ps);
1542 clrbit(ps->ps_bmap, cluster);
1543 ++ps->ps_clcount;
1544 dp_pages_free += 1 << ps->ps_clshift;
1545 PSL_UNLOCK();
1546
1547 /*
1548 * Move the hint down to the freed cluster if it is
1549 * less than the current hint.
1550 */
1551 if ((cluster/NBBY) < ps->ps_hint) {
1552 ps->ps_hint = (cluster/NBBY);
1553 }
1554
1555 PS_UNLOCK(ps);
1556
1557 /*
1558 * If we're freeing space on a full priority, reset the array.
1559 */
1560 PSL_LOCK();
1561 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1562 ps_select_array[ps->ps_bs->bs_priority] = 0;
1563 PSL_UNLOCK();
1564
1565 return;
1566 }
1567
1568 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1569
1570 void
1571 ps_dealloc_vsmap(
1572 struct vs_map *vsmap,
1573 vm_size_t size)
1574 {
1575 int i;
1576 for (i = 0; i < size; i++)
1577 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1578 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1579 VSM_CLOFF(vsmap[i]));
1580 }
1581
1582 void
1583 ps_vstruct_dealloc(
1584 vstruct_t vs)
1585 {
1586 int i;
1587 spl_t s;
1588
1589 VS_MAP_LOCK(vs);
1590
1591 /*
1592 * If this is an indirect structure, then we walk through the valid
1593 * (non-zero) indirect pointers and deallocate the clusters
1594 * associated with each used map entry (via ps_dealloc_vsmap).
1595 * When all of the clusters in an indirect block have been
1596 * freed, we deallocate the block. When all of the indirect
1597 * blocks have been deallocated we deallocate the memory
1598 * holding the indirect pointers.
1599 */
1600 if (vs->vs_indirect) {
1601 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1602 if (vs->vs_imap[i] != NULL) {
1603 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1604 kfree((vm_offset_t)vs->vs_imap[i],
1605 CLMAP_THRESHOLD);
1606 }
1607 }
1608 kfree((vm_offset_t)vs->vs_imap,
1609 INDIRECT_CLMAP_SIZE(vs->vs_size));
1610 } else {
1611 /*
1612 * Direct map. Free used clusters, then memory.
1613 */
1614 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1615 kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1616 }
1617 VS_MAP_UNLOCK(vs);
1618
1619 bs_commit(- vs->vs_size);
1620
1621 zfree(vstruct_zone, (vm_offset_t)vs);
1622 }
1623
1624 int ps_map_extend(vstruct_t, int); /* forward */
1625
1626 int ps_map_extend(
1627 vstruct_t vs,
1628 int new_size)
1629 {
1630 struct vs_map **new_imap;
1631 struct vs_map *new_dmap = NULL;
1632 int newdsize;
1633 int i;
1634 void *old_map = NULL;
1635 int old_map_size = 0;
1636
1637 if (vs->vs_size >= new_size) {
1638 /*
1639 * Someone has already done the work.
1640 */
1641 return 0;
1642 }
1643
1644 /*
1645 * If the new size extends into the indirect range, then we have one
1646 * of two cases: we are going from indirect to indirect, or we are
1647 * going from direct to indirect. If we are going from indirect to
1648 * indirect, then it is possible that the new size will fit in the old
1649 * indirect map. If this is the case, then just reset the size of the
1650 * vstruct map and we are done. If the new size will not
1651 * fit into the old indirect map, then we have to allocate a new
1652 * indirect map and copy the old map pointers into this new map.
1653 *
1654 * If we are going from direct to indirect, then we have to allocate a
1655 * new indirect map and copy the old direct pages into the first
1656 * indirect page of the new map.
1657 * NOTE: allocating memory here is dangerous, as we're in the
1658 * pageout path.
1659 */
1660 if (INDIRECT_CLMAP(new_size)) {
1661 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1662
1663 /*
1664 * Get a new indirect map and zero it.
1665 */
1666 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1667 if (vs->vs_indirect &&
1668 (new_map_size == old_map_size)) {
1669 bs_commit(new_size - vs->vs_size);
1670 vs->vs_size = new_size;
1671 return 0;
1672 }
1673
1674 new_imap = (struct vs_map **)kalloc(new_map_size);
1675 if (new_imap == NULL) {
1676 return -1;
1677 }
1678 memset(new_imap, 0, new_map_size);
1679
1680 if (vs->vs_indirect) {
1681 /* Copy old entries into new map */
1682 memcpy(new_imap, vs->vs_imap, old_map_size);
1683 /* Arrange to free the old map */
1684 old_map = (void *) vs->vs_imap;
1685 newdsize = 0;
1686 } else { /* Old map was a direct map */
1687 /* Allocate an indirect page */
1688 if ((new_imap[0] = (struct vs_map *)
1689 kalloc(CLMAP_THRESHOLD)) == NULL) {
1690 kfree((vm_offset_t)new_imap, new_map_size);
1691 return -1;
1692 }
1693 new_dmap = new_imap[0];
1694 newdsize = CLMAP_ENTRIES;
1695 }
1696 } else {
1697 new_imap = NULL;
1698 newdsize = new_size;
1699 /*
1700 * If the new map is a direct map, then the old map must
1701 * also have been a direct map. All we have to do is
1702 * to allocate a new direct map, copy the old entries
1703 * into it and free the old map.
1704 */
1705 if ((new_dmap = (struct vs_map *)
1706 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1707 return -1;
1708 }
1709 }
1710 if (newdsize) {
1711
1712 /* Free the old map */
1713 old_map = (void *) vs->vs_dmap;
1714 old_map_size = CLMAP_SIZE(vs->vs_size);
1715
1716 /* Copy info from the old map into the new map */
1717 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1718
1719 /* Initialize the rest of the new map */
1720 for (i = vs->vs_size; i < newdsize; i++)
1721 VSM_CLR(new_dmap[i]);
1722 }
1723 if (new_imap) {
1724 vs->vs_imap = new_imap;
1725 vs->vs_indirect = TRUE;
1726 } else
1727 vs->vs_dmap = new_dmap;
1728 bs_commit(new_size - vs->vs_size);
1729 vs->vs_size = new_size;
1730 if (old_map)
1731 kfree((vm_offset_t)old_map, old_map_size);
1732 return 0;
1733 }
1734
1735 vm_offset_t
1736 ps_clmap(
1737 vstruct_t vs,
1738 vm_offset_t offset,
1739 struct clmap *clmap,
1740 int flag,
1741 vm_size_t size,
1742 int error)
1743 {
1744 vm_offset_t cluster; /* The cluster of offset. */
1745 vm_offset_t newcl; /* The new cluster allocated. */
1746 vm_offset_t newoff;
1747 int i;
1748 struct vs_map *vsmap;
1749
1750 VS_MAP_LOCK(vs);
1751
1752 ASSERT(vs->vs_dmap);
1753 cluster = atop_32(offset) >> vs->vs_clshift;
1754
1755 /*
1756 * Initialize cluster error value
1757 */
1758 clmap->cl_error = 0;
1759
1760 /*
1761 * If the object has grown, extend the page map.
1762 */
1763 if (cluster >= vs->vs_size) {
1764 if (flag == CL_FIND) {
1765 /* Do not allocate if just doing a lookup */
1766 VS_MAP_UNLOCK(vs);
1767 return (vm_offset_t) -1;
1768 }
1769 if (ps_map_extend(vs, cluster + 1)) {
1770 VS_MAP_UNLOCK(vs);
1771 return (vm_offset_t) -1;
1772 }
1773 }
1774
1775 /*
1776 * Look for the desired cluster. If the map is indirect, then we
1777 * have a two level lookup. First find the indirect block, then
1778 * find the actual cluster. If the indirect block has not yet
1779 * been allocated, then do so. If the cluster has not yet been
1780 * allocated, then do so.
1781 *
1782 * If any of the allocations fail, then return an error.
1783 * Don't allocate if just doing a lookup.
1784 */
1785 if (vs->vs_indirect) {
1786 long ind_block = cluster/CLMAP_ENTRIES;
1787
1788 /* Is the indirect block allocated? */
1789 vsmap = vs->vs_imap[ind_block];
1790 if (vsmap == NULL) {
1791 if (flag == CL_FIND) {
1792 VS_MAP_UNLOCK(vs);
1793 return (vm_offset_t) -1;
1794 }
1795
1796 /* Allocate the indirect block */
1797 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1798 if (vsmap == NULL) {
1799 VS_MAP_UNLOCK(vs);
1800 return (vm_offset_t) -1;
1801 }
1802 /* Initialize the cluster offsets */
1803 for (i = 0; i < CLMAP_ENTRIES; i++)
1804 VSM_CLR(vsmap[i]);
1805 vs->vs_imap[ind_block] = vsmap;
1806 }
1807 } else
1808 vsmap = vs->vs_dmap;
1809
1810 ASSERT(vsmap);
1811 vsmap += cluster%CLMAP_ENTRIES;
1812
1813 /*
1814 * At this point, vsmap points to the struct vs_map desired.
1815 *
1816 * Look in the map for the cluster, if there was an error on a
1817 * previous write, flag it and return. If it is not yet
1818 * allocated, then allocate it, if we're writing; if we're
1819 * doing a lookup and the cluster's not allocated, return error.
1820 */
1821 if (VSM_ISERR(*vsmap)) {
1822 clmap->cl_error = VSM_GETERR(*vsmap);
1823 VS_MAP_UNLOCK(vs);
1824 return (vm_offset_t) -1;
1825 } else if (VSM_ISCLR(*vsmap)) {
1826 int psindex;
1827
1828 if (flag == CL_FIND) {
1829 /*
1830 * If there's an error and the entry is clear, then
1831 * we've run out of swap space. Record the error
1832 * here and return.
1833 */
1834 if (error) {
1835 VSM_SETERR(*vsmap, error);
1836 }
1837 VS_MAP_UNLOCK(vs);
1838 return (vm_offset_t) -1;
1839 } else {
1840 /*
1841 * Attempt to allocate a cluster from the paging segment
1842 */
1843 newcl = ps_allocate_cluster(vs, &psindex,
1844 PAGING_SEGMENT_NULL);
1845 if (newcl == -1) {
1846 VS_MAP_UNLOCK(vs);
1847 return (vm_offset_t) -1;
1848 }
1849 VSM_CLR(*vsmap);
1850 VSM_SETCLOFF(*vsmap, newcl);
1851 VSM_SETPS(*vsmap, psindex);
1852 }
1853 } else
1854 newcl = VSM_CLOFF(*vsmap);
1855
1856 /*
1857 * Fill in pertinent fields of the clmap
1858 */
1859 clmap->cl_ps = VSM_PS(*vsmap);
1860 clmap->cl_numpages = VSCLSIZE(vs);
1861 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1862
1863 /*
1864 * Byte offset in paging segment is byte offset to cluster plus
1865 * byte offset within cluster. It looks ugly, but should be
1866 * relatively quick.
1867 */
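	/*
	 * For example, assuming 4KB pages (vm_page_shift == 12) and a
	 * cluster shift of 2, a cluster spans 1 << 14 == 16KB, so
	 * newoff == offset & 0x3fff and cluster newcl starts at byte
	 * newcl * 16KB in the paging segment.
	 */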
1868 ASSERT(trunc_page(offset) == offset);
1869 newcl = ptoa_32(newcl) << vs->vs_clshift;
1870 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1871 if (flag == CL_ALLOC) {
1872 /*
1873 * set bits in the allocation bitmap according to which
1874 * pages were requested. size is in bytes.
1875 */
1876 i = atop_32(newoff);
1877 while ((size > 0) && (i < VSCLSIZE(vs))) {
1878 VSM_SETALLOC(*vsmap, i);
1879 i++;
1880 size -= vm_page_size;
1881 }
1882 }
1883 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1884 if (newoff) {
1885 /*
1886 * Offset is not cluster aligned, so number of pages
1887 * and bitmaps must be adjusted
1888 */
1889 clmap->cl_numpages -= atop_32(newoff);
1890 CLMAP_SHIFT(clmap, vs);
1891 CLMAP_SHIFTALLOC(clmap, vs);
1892 }
1893
1894 /*
1895 *
1896 * The setting of valid bits and handling of write errors
1897 * must be done here, while we hold the lock on the map.
1898 * It logically should be done in ps_vs_write_complete().
1899 * The size and error information has been passed from
1900 * ps_vs_write_complete(). If the size parameter is non-zero,
1901 * then there is work to be done. If error is also non-zero,
1902 * then the error number is recorded in the cluster and the
1903 * entire cluster is in error.
1904 */
1905 if (size && flag == CL_FIND) {
1906 vm_offset_t off = (vm_offset_t) 0;
1907
1908 if (!error) {
1909 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1910 i++) {
1911 VSM_SETPG(*vsmap, i);
1912 size -= vm_page_size;
1913 }
1914 ASSERT(i <= VSCLSIZE(vs));
1915 } else {
1916 BS_STAT(clmap->cl_ps->ps_bs,
1917 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1918 atop_32(size));
1919 off = VSM_CLOFF(*vsmap);
1920 VSM_SETERR(*vsmap, error);
1921 }
1922 /*
1923 * Deallocate cluster if error, and no valid pages
1924 * already present.
1925 */
1926 if (off != (vm_offset_t) 0)
1927 ps_deallocate_cluster(clmap->cl_ps, off);
1928 VS_MAP_UNLOCK(vs);
1929 return (vm_offset_t) 0;
1930 } else
1931 VS_MAP_UNLOCK(vs);
1932
1933 DEBUG(DEBUG_VS_INTERNAL,
1934 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1935 newcl+newoff, (int) vs, (int) vsmap, flag));
1936 DEBUG(DEBUG_VS_INTERNAL,
1937 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1938 (int) clmap->cl_ps, clmap->cl_numpages,
1939 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1940
1941 return (newcl + newoff);
1942 }
1943
1944 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1945
1946 void
1947 ps_clunmap(
1948 vstruct_t vs,
1949 vm_offset_t offset,
1950 vm_size_t length)
1951 {
1952 vm_offset_t cluster; /* The cluster number of offset */
1953 struct vs_map *vsmap;
1954
1955 VS_MAP_LOCK(vs);
1956
1957 /*
1958 * Loop through all clusters in this range, freeing paging segment
1959 * clusters and map entries as encountered.
1960 */
1961 while (length > 0) {
1962 vm_offset_t newoff;
1963 int i;
1964
1965 cluster = atop_32(offset) >> vs->vs_clshift;
1966 if (vs->vs_indirect) /* indirect map */
1967 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
1968 else
1969 vsmap = vs->vs_dmap;
1970 if (vsmap == NULL) {
1971 VS_MAP_UNLOCK(vs);
1972 return;
1973 }
1974 vsmap += cluster%CLMAP_ENTRIES;
1975 if (VSM_ISCLR(*vsmap)) {
1976 length -= vm_page_size;
1977 offset += vm_page_size;
1978 continue;
1979 }
1980 /*
1981 * We've got a valid mapping. Clear it and deallocate
1982 * paging segment cluster pages.
1983 * Optimize for entire cluster clearing.
1984 */
1985 if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
1986 /*
1987 * Not cluster aligned.
1988 */
1989 ASSERT(trunc_page(newoff) == newoff);
1990 i = atop_32(newoff);
1991 } else
1992 i = 0;
1993 while ((i < VSCLSIZE(vs)) && (length > 0)) {
1994 VSM_CLRPG(*vsmap, i);
1995 VSM_CLRALLOC(*vsmap, i);
1996 length -= vm_page_size;
1997 offset += vm_page_size;
1998 i++;
1999 }
2000
2001 /*
2002 * If map entry is empty, clear and deallocate cluster.
2003 */
2004 if (!VSM_ALLOC(*vsmap)) {
2005 ps_deallocate_cluster(VSM_PS(*vsmap),
2006 VSM_CLOFF(*vsmap));
2007 VSM_CLR(*vsmap);
2008 }
2009 }
2010
2011 VS_MAP_UNLOCK(vs);
2012 }
2013
2014 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2015
2016 void
2017 ps_vs_write_complete(
2018 vstruct_t vs,
2019 vm_offset_t offset,
2020 vm_size_t size,
2021 int error)
2022 {
2023 struct clmap clmap;
2024
2025 /*
2026 * Get the struct vsmap for this cluster.
2027 * Use CL_FIND, even though it was written, because the
2028 * cluster MUST be present, unless there was an error
2029 * in the original ps_clmap (e.g. no space), in which
2030 * case, nothing happens.
2031 *
2032 * Must pass enough information to ps_clmap to allow it
2033 * to set the vs_map structure bitmap under lock.
2034 */
2035 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2036 }
2037
2038 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2039
2040 void
2041 vs_cl_write_complete(
2042 vstruct_t vs,
2043 paging_segment_t ps,
2044 vm_offset_t offset,
2045 vm_offset_t addr,
2046 vm_size_t size,
2047 boolean_t async,
2048 int error)
2049 {
2050 kern_return_t kr;
2051
2052 if (error) {
2053 /*
2054 * For internal objects, the error is recorded on a
2055 * per-cluster basis by ps_clmap() which is called
2056 * by ps_vs_write_complete() below.
2057 */
2058 dprintf(("write failed error = 0x%x\n", error));
2059 /* add upl_abort code here */
2060 } else
2061 GSTAT(global_stats.gs_pages_out += atop_32(size));
2062 /*
2063 * Notify the vstruct mapping code, so it can do its accounting.
2064 */
2065 ps_vs_write_complete(vs, offset, size, error);
2066
2067 if (async) {
2068 VS_LOCK(vs);
2069 ASSERT(vs->vs_async_pending > 0);
2070 vs->vs_async_pending -= size;
2071 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2072 vs->vs_waiting_async = FALSE;
2073 VS_UNLOCK(vs);
2074 /* mutex_unlock(&vs->vs_waiting_async); */
2075 thread_wakeup(&vs->vs_async_pending);
2076 } else {
2077 VS_UNLOCK(vs);
2078 }
2079 }
2080 }
2081
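/*
 * Minimal sketch (illustrative only, not part of the original file) of
 * the waiter side that pairs with the wakeup above: a thread that needs
 * all asynchronous writes drained would sleep on &vs->vs_async_pending
 * after setting vs_waiting_async, using the same assert_wait/thread_block
 * convention seen elsewhere in this file.
 */
#if 0	/* example only, never compiled */
static void
example_wait_for_async_writes(vstruct_t vs)
{
	VS_LOCK(vs);
	while (vs->vs_async_pending > 0) {
		vs->vs_waiting_async = TRUE;
		assert_wait(&vs->vs_async_pending, THREAD_UNINT);
		VS_UNLOCK(vs);
		thread_block(THREAD_CONTINUE_NULL);
		VS_LOCK(vs);
	}
	VS_UNLOCK(vs);
}
#endif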
2082 #ifdef DEVICE_PAGING
2083 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2084
2085 kern_return_t
2086 device_write_reply(
2087 MACH_PORT_FACE reply_port,
2088 kern_return_t device_code,
2089 io_buf_len_t bytes_written)
2090 {
2091 struct vs_async *vsa;
2092
2093 vsa = (struct vs_async *)
2094 ((struct vstruct_alias *)(reply_port->alias))->vs;
2095
2096 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2097 device_code = KERN_FAILURE;
2098 }
2099
2100 vsa->vsa_error = device_code;
2101
2102
2103 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2104 if(vsa->vsa_flags & VSA_TRANSFER) {
2105 /* revisit when async disk segments redone */
2106 if(vsa->vsa_error) {
2107 /* need to consider error condition. re-write data or */
2108 /* throw it away here. */
2109 vm_offset_t ioaddr;
2110 if(vm_map_copyout(kernel_map, &ioaddr,
2111 (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
2112 panic("vs_cluster_write: unable to copy source list\n");
2113 vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
2114 }
2115 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2116 vsa->vsa_size, vsa->vsa_error);
2117 } else {
2118 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2119 vsa->vsa_addr, vsa->vsa_size, TRUE,
2120 vsa->vsa_error);
2121 }
2122 VS_FREE_ASYNC(vsa);
2123
2124 return KERN_SUCCESS;
2125 }
2126
2127 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2128 kern_return_t
2129 device_write_reply_inband(
2130 MACH_PORT_FACE reply_port,
2131 kern_return_t return_code,
2132 io_buf_len_t bytes_written)
2133 {
2134 panic("device_write_reply_inband: illegal");
2135 return KERN_SUCCESS;
2136 }
2137
2138 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2139 kern_return_t
2140 device_read_reply(
2141 MACH_PORT_FACE reply_port,
2142 kern_return_t return_code,
2143 io_buf_ptr_t data,
2144 mach_msg_type_number_t dataCnt)
2145 {
2146 struct vs_async *vsa;
2147 vsa = (struct vs_async *)
2148 ((struct vstruct_alias *)(reply_port->alias))->vs;
2149 vsa->vsa_addr = (vm_offset_t)data;
2150 vsa->vsa_size = (vm_size_t)dataCnt;
2151 vsa->vsa_error = return_code;
2152 thread_wakeup(&vsa->vsa_lock);
2153 return KERN_SUCCESS;
2154 }
2155
2156 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2157 kern_return_t
2158 device_read_reply_inband(
2159 MACH_PORT_FACE reply_port,
2160 kern_return_t return_code,
2161 io_buf_ptr_inband_t data,
2162 mach_msg_type_number_t dataCnt)
2163 {
2164 panic("device_read_reply_inband: illegal");
2165 return KERN_SUCCESS;
2166 }
2167
2168 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2169 kern_return_t
2170 device_read_reply_overwrite(
2171 MACH_PORT_FACE reply_port,
2172 kern_return_t return_code,
2173 io_buf_len_t bytes_read)
2174 {
2175 panic("device_read_reply_overwrite: illegal\n");
2176 return KERN_SUCCESS;
2177 }
2178
2179 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2180 kern_return_t
2181 device_open_reply(
2182 MACH_PORT_FACE reply_port,
2183 kern_return_t return_code,
2184 MACH_PORT_FACE device_port)
2185 {
2186 panic("device_open_reply: illegal\n");
2187 return KERN_SUCCESS;
2188 }
2189
2190 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2191
2192 kern_return_t
2193 ps_read_device(
2194 paging_segment_t ps,
2195 vm_offset_t offset,
2196 vm_offset_t *bufferp,
2197 unsigned int size,
2198 unsigned int *residualp,
2199 int flags)
2200 {
2201 kern_return_t kr;
2202 recnum_t dev_offset;
2203 unsigned int bytes_wanted;
2204 unsigned int bytes_read;
2205 unsigned int total_read;
2206 vm_offset_t dev_buffer;
2207 vm_offset_t buf_ptr;
2208 unsigned int records_read;
2209 struct vs_async *vsa;
2210 mutex_t vs_waiting_read_reply;
2211
2212 device_t device;
2213 vm_map_copy_t device_data = NULL;
2214 default_pager_thread_t *dpt = NULL;
2215
2216 device = dev_port_lookup(ps->ps_device);
2217 clustered_reads[atop_32(size)]++;
2218
2219 dev_offset = (ps->ps_offset +
2220 (offset >> (vm_page_shift - ps->ps_record_shift)));
2221 bytes_wanted = size;
2222 total_read = 0;
2223 *bufferp = (vm_offset_t)NULL;
2224
2225 do {
2226 vsa = VS_ALLOC_ASYNC();
2227 if (vsa) {
2228 vsa->vsa_vs = NULL;
2229 vsa->vsa_addr = 0;
2230 vsa->vsa_offset = 0;
2231 vsa->vsa_size = 0;
2232 vsa->vsa_ps = NULL;
2233 }
2234 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2235 ip_lock(vsa->reply_port);
2236 vsa->reply_port->ip_sorights++;
2237 ip_reference(vsa->reply_port);
2238 ip_unlock(vsa->reply_port);
2239 kr = ds_device_read_common(device,
2240 vsa->reply_port,
2241 (mach_msg_type_name_t)
2242 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2243 (dev_mode_t) 0,
2244 dev_offset,
2245 bytes_wanted,
2246 (IO_READ | IO_CALL),
2247 (io_buf_ptr_t *) &dev_buffer,
2248 (mach_msg_type_number_t *) &bytes_read);
2249 if(kr == MIG_NO_REPLY) {
2250 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2251 thread_block(THREAD_CONTINUE_NULL);
2252
2253 dev_buffer = vsa->vsa_addr;
2254 bytes_read = (unsigned int)vsa->vsa_size;
2255 kr = vsa->vsa_error;
2256 }
2257 VS_FREE_ASYNC(vsa);
2258 if (kr != KERN_SUCCESS || bytes_read == 0) {
2259 break;
2260 }
2261 total_read += bytes_read;
2262
2263 /*
2264 * If we got the entire range, use the returned dev_buffer.
2265 */
2266 if (bytes_read == size) {
2267 *bufferp = (vm_offset_t)dev_buffer;
2268 break;
2269 }
2270
2271 #if 1
2272 dprintf(("read only %d bytes out of %d\n",
2273 bytes_read, bytes_wanted));
2274 #endif
2275 if(dpt == NULL) {
2276 dpt = get_read_buffer();
2277 buf_ptr = dpt->dpt_buffer;
2278 *bufferp = (vm_offset_t)buf_ptr;
2279 }
2280 /*
2281 * Otherwise, copy the data into the provided buffer (*bufferp)
2282 * and append the rest of the range as it comes in.
2283 */
2284 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2285 buf_ptr += bytes_read;
2286 bytes_wanted -= bytes_read;
2287 records_read = (bytes_read >>
2288 (vm_page_shift - ps->ps_record_shift));
2289 dev_offset += records_read;
2290 DEBUG(DEBUG_VS_INTERNAL,
2291 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2292 dev_buffer, bytes_read));
2293 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2294 != KERN_SUCCESS)
2295 Panic("dealloc buf");
2296 } while (bytes_wanted);
2297
2298 *residualp = size - total_read;
2299 if((dev_buffer != *bufferp) && (total_read != 0)) {
2300 vm_offset_t temp_buffer;
2301 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2302 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2303 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2304 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2305 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2306 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2307 (vm_map_copy_t *)&device_data, FALSE))
2308 panic("ps_read_device: cannot copyin locally provided buffer\n");
2309 }
2310 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2311 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2312 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2313 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2314 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2315 (vm_map_copy_t *)&device_data, FALSE))
2316 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2317 }
2318 else {
2319 device_data = NULL;
2320 }
2321 *bufferp = (vm_offset_t)device_data;
2322
2323 if(dpt != NULL) {
2324 /* Free the receive buffer */
2325 dpt->checked_out = 0;
2326 thread_wakeup(&dpt_array);
2327 }
2328 return KERN_SUCCESS;
2329 }
2330
2331 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2332
2333 kern_return_t
2334 ps_write_device(
2335 paging_segment_t ps,
2336 vm_offset_t offset,
2337 vm_offset_t addr,
2338 unsigned int size,
2339 struct vs_async *vsa)
2340 {
2341 recnum_t dev_offset;
2342 io_buf_len_t bytes_to_write, bytes_written;
2343 recnum_t records_written;
2344 kern_return_t kr;
2345 MACH_PORT_FACE reply_port;
2346
2347
2348
2349 clustered_writes[atop_32(size)]++;
2350
2351 dev_offset = (ps->ps_offset +
2352 (offset >> (vm_page_shift - ps->ps_record_shift)));
2353 bytes_to_write = size;
2354
2355 if (vsa) {
2356 /*
2357 * Asynchronous write.
2358 */
2359 reply_port = vsa->reply_port;
2360 ip_lock(reply_port);
2361 reply_port->ip_sorights++;
2362 ip_reference(reply_port);
2363 ip_unlock(reply_port);
2364 {
2365 device_t device;
2366 device = dev_port_lookup(ps->ps_device);
2367
2368 vsa->vsa_addr = addr;
2369 kr=ds_device_write_common(device,
2370 reply_port,
2371 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2372 (dev_mode_t) 0,
2373 dev_offset,
2374 (io_buf_ptr_t) addr,
2375 size,
2376 (IO_WRITE | IO_CALL),
2377 &bytes_written);
2378 }
2379 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2380 if (verbose)
2381 dprintf(("%s0x%x, addr=0x%x,"
2382 "size=0x%x,offset=0x%x\n",
2383 "device_write_request returned ",
2384 kr, addr, size, offset));
2385 BS_STAT(ps->ps_bs,
2386 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2387 /* do the completion notification to free resources */
2388 device_write_reply(reply_port, kr, 0);
2389 return PAGER_ERROR;
2390 }
2391 } else do {
2392 /*
2393 * Synchronous write.
2394 */
2395 {
2396 device_t device;
2397 device = dev_port_lookup(ps->ps_device);
2398 kr=ds_device_write_common(device,
2399 IP_NULL, 0,
2400 (dev_mode_t) 0,
2401 dev_offset,
2402 (io_buf_ptr_t) addr,
2403 size,
2404 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2405 &bytes_written);
2406 }
2407 if (kr != KERN_SUCCESS) {
2408 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2409 "device_write returned ",
2410 kr, addr, size, offset));
2411 BS_STAT(ps->ps_bs,
2412 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2413 return PAGER_ERROR;
2414 }
2415 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2416 Panic("fragmented write");
2417 records_written = (bytes_written >>
2418 (vm_page_shift - ps->ps_record_shift));
2419 dev_offset += records_written;
2420 #if 1
2421 if (bytes_written != bytes_to_write) {
2422 dprintf(("wrote only %d bytes out of %d\n",
2423 bytes_written, bytes_to_write));
2424 }
2425 #endif
2426 bytes_to_write -= bytes_written;
2427 addr += bytes_written;
2428 } while (bytes_to_write > 0);
2429
2430 return PAGER_SUCCESS;
2431 }
2432
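/*
 * Worked example of the record arithmetic above (illustrative only,
 * assuming 4K pages and 512-byte device records): ps_record_shift is
 * log2(4096/512) = 3, so a byte offset becomes a record number via
 * offset >> (vm_page_shift - ps_record_shift) = offset >> 9.  A paging
 * segment offset of 0x6000 therefore starts at device record
 * ps_offset + 48, and each partial transfer advances dev_offset by
 * bytes >> 9 records.
 */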
2433
2434 #else /* !DEVICE_PAGING */
2435
2436 kern_return_t
2437 ps_read_device(
2438 paging_segment_t ps,
2439 vm_offset_t offset,
2440 vm_offset_t *bufferp,
2441 unsigned int size,
2442 unsigned int *residualp,
2443 int flags)
2444 {
2445 panic("ps_read_device not supported");
2446 }
2447 kern_return_t
2448 ps_write_device(
2449 paging_segment_t ps,
2450 vm_offset_t offset,
2451 vm_offset_t addr,
2452 unsigned int size,
2453 struct vs_async *vsa)
2454 {
2455 panic("ps_write_device not supported");
2456 }
2457
2458 #endif /* DEVICE_PAGING */
2459 void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
2460
2461 void
2462 pvs_object_data_provided(
2463 vstruct_t vs,
2464 upl_t upl,
2465 vm_offset_t offset,
2466 vm_size_t size)
2467 {
2468
2469 DEBUG(DEBUG_VS_INTERNAL,
2470 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2471 upl, offset, size));
2472
2473 ASSERT(size > 0);
2474 GSTAT(global_stats.gs_pages_in += atop_32(size));
2475
2476
2477 #if USE_PRECIOUS
2478 ps_clunmap(vs, offset, size);
2479 #endif /* USE_PRECIOUS */
2480
2481 }
2482
2483 kern_return_t
2484 pvs_cluster_read(
2485 vstruct_t vs,
2486 vm_offset_t vs_offset,
2487 vm_size_t cnt)
2488 {
2489 upl_t upl;
2490 kern_return_t error = KERN_SUCCESS;
2491 int size;
2492 unsigned int residual;
2493 unsigned int request_flags;
2494 int seg_index;
2495 int pages_in_cl;
2496 int cl_size;
2497 int cl_mask;
2498 int cl_index;
2499 int xfer_size;
2500 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2501 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2502 struct clmap clmap;
2503
2504 pages_in_cl = 1 << vs->vs_clshift;
2505 cl_size = pages_in_cl * vm_page_size;
2506 cl_mask = cl_size - 1;
2507
2508 /*
2509 * This loop will be executed multiple times until the entire
2510 * request has been satisfied... if the request spans cluster
2511 * boundaries, the clusters will be checked for logical continuity;
2512 * if contiguous, the I/O request will span multiple clusters, otherwise
2513 * it will be broken up into the minimal set of I/Os.
2514 *
2515 * If there are holes in a request (either unallocated pages in a paging
2516 * segment or an unallocated paging segment), we stop
2517 * reading at the hole, inform the VM of any data read, inform
2518 * the VM of an unavailable range, then loop again, hoping to
2519 * find valid pages later in the requested range. This continues until
2520 * the entire range has been examined, and read, if present.
2521 */
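/*
 * Worked example (illustrative only, assuming 4K pages, vs_clshift == 2
 * and VM_SUPER_CLUSTER == 0x40000): pages_in_cl = 4, cl_size = 0x4000,
 * cl_mask = 0x3fff.  A request of cnt = 0x50000 bytes starting at
 * vs_offset = 0x3000 is first trimmed to
 * VM_SUPER_CLUSTER - (0x3000 & 0x3fff) = 0x3d000 bytes; the remaining
 * 0x13000 bytes are handled on the next pass of the outer loop below.
 */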
2522
2523 #if USE_PRECIOUS
2524 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2525 #else
2526 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2527 #endif
2528 while (cnt && (error == KERN_SUCCESS)) {
2529 int ps_info_valid;
2530 int page_list_count;
2531
2532 if((vs_offset & cl_mask) &&
2533 (cnt > (VM_SUPER_CLUSTER -
2534 (vs_offset & cl_mask)))) {
2535 size = VM_SUPER_CLUSTER;
2536 size -= vs_offset & cl_mask;
2537 } else if (cnt > VM_SUPER_CLUSTER) {
2538 size = VM_SUPER_CLUSTER;
2539 } else {
2540 size = cnt;
2541 }
2542 cnt -= size;
2543
2544 ps_info_valid = 0;
2545 seg_index = 0;
2546
2547 while (size > 0 && error == KERN_SUCCESS) {
2548 int abort_size;
2549 int failed_size;
2550 int beg_pseg;
2551 int beg_indx;
2552 vm_offset_t cur_offset;
2553
2554
2555 if ( !ps_info_valid) {
2556 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2557 psp[seg_index] = CLMAP_PS(clmap);
2558 ps_info_valid = 1;
2559 }
2560 /*
2561 * skip over unallocated physical segments
2562 */
2563 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2564 abort_size = cl_size - (vs_offset & cl_mask);
2565 abort_size = MIN(abort_size, size);
2566
2567 page_list_count = 0;
2568 memory_object_super_upl_request(
2569 vs->vs_control,
2570 (memory_object_offset_t)vs_offset,
2571 abort_size, abort_size,
2572 &upl, NULL, &page_list_count,
2573 request_flags);
2574
2575 if (clmap.cl_error) {
2576 upl_abort(upl, UPL_ABORT_ERROR);
2577 } else {
2578 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2579 }
2580 upl_deallocate(upl);
2581
2582 size -= abort_size;
2583 vs_offset += abort_size;
2584
2585 seg_index++;
2586 ps_info_valid = 0;
2587 continue;
2588 }
2589 cl_index = (vs_offset & cl_mask) / vm_page_size;
2590
2591 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2592 /*
2593 * skip over unallocated pages
2594 */
2595 if (CLMAP_ISSET(clmap, cl_index))
2596 break;
2597 abort_size += vm_page_size;
2598 }
2599 if (abort_size) {
2600 /*
2601 * Let VM system know about holes in clusters.
2602 */
2603 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2604
2605 page_list_count = 0;
2606 memory_object_super_upl_request(
2607 vs->vs_control,
2608 (memory_object_offset_t)vs_offset,
2609 abort_size, abort_size,
2610 &upl, NULL, &page_list_count,
2611 request_flags);
2612
2613 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2614 upl_deallocate(upl);
2615
2616 size -= abort_size;
2617 vs_offset += abort_size;
2618
2619 if (cl_index == pages_in_cl) {
2620 /*
2621 * if we're at the end of this physical cluster
2622 * then bump to the next one and continue looking
2623 */
2624 seg_index++;
2625 ps_info_valid = 0;
2626 continue;
2627 }
2628 if (size == 0)
2629 break;
2630 }
2631 /*
2632 * remember the starting point of the first allocated page
2633 * for the I/O we're about to issue
2634 */
2635 beg_pseg = seg_index;
2636 beg_indx = cl_index;
2637 cur_offset = vs_offset;
2638
2639 /*
2640 * calculate the size of the I/O that we can do...
2641 * this may span multiple physical segments if
2642 * they are contiguous
2643 */
2644 for (xfer_size = 0; xfer_size < size; ) {
2645
2646 while (cl_index < pages_in_cl
2647 && xfer_size < size) {
2648 /*
2649 * accumulate allocated pages within
2650 * a physical segment
2651 */
2652 if (CLMAP_ISSET(clmap, cl_index)) {
2653 xfer_size += vm_page_size;
2654 cur_offset += vm_page_size;
2655 cl_index++;
2656
2657 BS_STAT(psp[seg_index]->ps_bs,
2658 psp[seg_index]->ps_bs->bs_pages_in++);
2659 } else
2660 break;
2661 }
2662 if (cl_index < pages_in_cl
2663 || xfer_size >= size) {
2664 /*
2665 * we've hit an unallocated page or
2666 * the end of this request... go fire
2667 * the I/O
2668 */
2669 break;
2670 }
2671 /*
2672 * we've hit the end of the current physical
2673 * segment and there's more to do, so try
2674 * moving to the next one
2675 */
2676 seg_index++;
2677
2678 ps_offset[seg_index] =
2679 ps_clmap(vs,
2680 cur_offset & ~cl_mask,
2681 &clmap, CL_FIND, 0, 0);
2682 psp[seg_index] = CLMAP_PS(clmap);
2683 ps_info_valid = 1;
2684
2685 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2686 /*
2687 * if the physical segment we're about
2688 * to step into is not contiguous to
2689 * the one we're currently in, or it's
2690 * in a different paging file, or
2691 * it hasn't been allocated....
2692 * we stop here and generate the I/O
2693 */
2694 break;
2695 }
2696 /*
2697 * start with first page of the next physical
2698 * segment
2699 */
2700 cl_index = 0;
2701 }
2702 if (xfer_size) {
2703 /*
2704 * we have a contiguous range of allocated pages
2705 * to read from
2706 */
2707 page_list_count = 0;
2708 memory_object_super_upl_request(vs->vs_control,
2709 (memory_object_offset_t)vs_offset,
2710 xfer_size, xfer_size,
2711 &upl, NULL, &page_list_count,
2712 request_flags | UPL_SET_INTERNAL);
2713
2714 error = ps_read_file(psp[beg_pseg],
2715 upl, (vm_offset_t) 0,
2716 ps_offset[beg_pseg] +
2717 (beg_indx * vm_page_size),
2718 xfer_size, &residual, 0);
2719 } else
2720 continue;
2721
2722 failed_size = 0;
2723
2724 /*
2725 * Adjust counts and send response to VM. Optimize
2726 * for the common case, i.e. no error and/or partial
2727 * data. If there was an error, then we need to error
2728 * the entire range, even if some data was successfully
2729 * read. If there was a partial read we may supply some
2730 * data and may error some as well. In all cases the
2731 * VM must receive some notification for every page
2732 * in the range.
2733 */
2734 if ((error == KERN_SUCCESS) && (residual == 0)) {
2735 /*
2736 * Got everything we asked for, supply the data
2737 * to the VM. Note that as a side effect of
2738 * supplying the data, the buffer holding the
2739 * supplied data is deallocated from the pager's
2740 * address space.
2741 */
2742 pvs_object_data_provided(
2743 vs, upl, vs_offset, xfer_size);
2744 } else {
2745 failed_size = xfer_size;
2746
2747 if (error == KERN_SUCCESS) {
2748 if (residual == xfer_size) {
2749 /*
2750 * If a read operation returns no error
2751 * and no data moved, we turn it into
2752 * an error, assuming we're reading at
2753 * or beyond EOF.
2754 * Fall through and error the entire
2755 * range.
2756 */
2757 error = KERN_FAILURE;
2758 } else {
2759 /*
2760 * Otherwise, we have partial read. If
2761 * the part read is an integral number
2762 * of pages supply it. Otherwise round
2763 * it up to a page boundary, zero fill
2764 * the unread part, and supply it.
2765 * Fall through and error the remainder
2766 * of the range, if any.
2767 */
2768 int fill, lsize;
2769
2770 fill = residual
2771 & ~vm_page_size;
2772 lsize = (xfer_size - residual)
2773 + fill;
2774 pvs_object_data_provided(
2775 vs, upl,
2776 vs_offset, lsize);
2777
2778 if (lsize < xfer_size) {
2779 failed_size =
2780 xfer_size - lsize;
2781 error = KERN_FAILURE;
2782 }
2783 }
2784 }
2785 }
2786 /*
2787 * If there was an error in any part of the range, tell
2788 * the VM. Note that error is explicitly checked again
2789 * since it can be modified above.
2790 */
2791 if (error != KERN_SUCCESS) {
2792 BS_STAT(psp[beg_pseg]->ps_bs,
2793 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2794 += atop_32(failed_size));
2795 }
2796 size -= xfer_size;
2797 vs_offset += xfer_size;
2798 }
2799
2800 } /* END while (cnt && (error == 0)) */
2801 return error;
2802 }
2803
2804 int vs_do_async_write = 1;
2805
2806 kern_return_t
2807 vs_cluster_write(
2808 vstruct_t vs,
2809 upl_t internal_upl,
2810 vm_offset_t offset,
2811 vm_size_t cnt,
2812 boolean_t dp_internal,
2813 int flags)
2814 {
2815 vm_offset_t size;
2816 vm_offset_t transfer_size;
2817 int error = 0;
2818 struct clmap clmap;
2819
2820 vm_offset_t actual_offset; /* Offset within paging segment */
2821 paging_segment_t ps;
2822 vm_offset_t subx_size;
2823 vm_offset_t mobj_base_addr;
2824 vm_offset_t mobj_target_addr;
2825 int mobj_size;
2826
2827 struct vs_async *vsa;
2828 vm_map_copy_t copy;
2829
2830 upl_t upl;
2831 upl_page_info_t *pl;
2832 int page_index;
2833 int list_size;
2834 int pages_in_cl;
2835 int cl_size;
2836 int base_index;
2837 int seg_size;
2838
2839 pages_in_cl = 1 << vs->vs_clshift;
2840 cl_size = pages_in_cl * vm_page_size;
2841
2842 if (!dp_internal) {
2843 int page_list_count;
2844 int request_flags;
2845 int super_size;
2846 int first_dirty;
2847 int num_dirty;
2848 int num_of_pages;
2849 int seg_index;
2850 vm_offset_t upl_offset;
2851 vm_offset_t seg_offset;
2852 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2853 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2854
2855
2856 if (bs_low) {
2857 super_size = cl_size;
2858
2859 request_flags = UPL_NOBLOCK |
2860 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2861 UPL_NO_SYNC | UPL_SET_INTERNAL;
2862 } else {
2863 super_size = VM_SUPER_CLUSTER;
2864
2865 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2866 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2867 UPL_NO_SYNC | UPL_SET_INTERNAL;
2868 }
2869
2870 page_list_count = 0;
2871 memory_object_super_upl_request(vs->vs_control,
2872 (memory_object_offset_t)offset,
2873 cnt, super_size,
2874 &upl, NULL, &page_list_count,
2875 request_flags | UPL_FOR_PAGEOUT);
2876
2877 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2878
2879 seg_size = cl_size - (upl->offset % cl_size);
2880 upl_offset = upl->offset & ~(cl_size - 1);
2881
2882 for (seg_index = 0, transfer_size = upl->size;
2883 transfer_size > 0; ) {
2884 ps_offset[seg_index] =
2885 ps_clmap(vs,
2886 upl_offset,
2887 &clmap, CL_ALLOC,
2888 cl_size, 0);
2889
2890 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2891 upl_abort(upl, 0);
2892 upl_deallocate(upl);
2893
2894 return KERN_FAILURE;
2895
2896 }
2897 psp[seg_index] = CLMAP_PS(clmap);
2898
2899 if (transfer_size > seg_size) {
2900 transfer_size -= seg_size;
2901 upl_offset += cl_size;
2902 seg_size = cl_size;
2903 seg_index++;
2904 } else
2905 transfer_size = 0;
2906 }
2907 /*
2908 * Ignore any non-present pages at the end of the
2909 * UPL.
2910 */
2911 for (page_index = upl->size / vm_page_size; page_index > 0;)
2912 if (UPL_PAGE_PRESENT(pl, --page_index))
2913 break;
2914 num_of_pages = page_index + 1;
2915
2916 base_index = (upl->offset % cl_size) / PAGE_SIZE;
2917
2918 for (page_index = 0; page_index < num_of_pages; ) {
2919 /*
2920 * skip over non-dirty pages
2921 */
2922 for ( ; page_index < num_of_pages; page_index++) {
2923 if (UPL_DIRTY_PAGE(pl, page_index)
2924 || UPL_PRECIOUS_PAGE(pl, page_index))
2925 /*
2926 * this is a page we need to write
2927 * go see if we can buddy it up with
2928 * others that are contiguous to it
2929 */
2930 break;
2931 /*
2932 * if the page is not dirty but is present, we
2933 * need to commit it... This is an unusual
2934 * case since we only asked for dirty pages
2935 */
2936 if (UPL_PAGE_PRESENT(pl, page_index)) {
2937 boolean_t empty = FALSE;
2938 upl_commit_range(upl,
2939 page_index * vm_page_size,
2940 vm_page_size,
2941 UPL_COMMIT_NOTIFY_EMPTY,
2942 pl,
2943 page_list_count,
2944 &empty);
2945 if (empty) {
2946 assert(page_index ==
2947 num_of_pages - 1);
2948 upl_deallocate(upl);
2949 }
2950 }
2951 }
2952 if (page_index == num_of_pages)
2953 /*
2954 * no more pages to look at, we're out of here
2955 */
2956 break;
2957
2958 /*
2959 * gather up contiguous dirty pages... we have at
2960 * least 1, otherwise we would have bailed above...
2961 * make sure that each physical segment that we step
2962 * into is contiguous to the one we're currently in;
2963 * if it's not, we have to stop and write what we have
2964 */
2965 for (first_dirty = page_index;
2966 page_index < num_of_pages; ) {
2967 if ( !UPL_DIRTY_PAGE(pl, page_index)
2968 && !UPL_PRECIOUS_PAGE(pl, page_index))
2969 break;
2970 page_index++;
2971 /*
2972 * if we just looked at the last page in the UPL
2973 * we don't need to check for physical segment
2974 * continuity
2975 */
2976 if (page_index < num_of_pages) {
2977 int cur_seg;
2978 int nxt_seg;
2979
2980 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
2981 nxt_seg = (base_index + page_index)/pages_in_cl;
2982
2983 if (cur_seg != nxt_seg) {
2984 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
2985 /*
2986 * if the segment we're about
2987 * to step into is not
2988 * contiguous to the one we're
2989 * currently in, or it's in a
2990 * different paging file....
2991 * we stop here and generate
2992 * the I/O
2993 */
2994 break;
2995 }
2996 }
2997 }
2998 num_dirty = page_index - first_dirty;
2999
3000 if (num_dirty) {
3001 upl_offset = first_dirty * vm_page_size;
3002 transfer_size = num_dirty * vm_page_size;
3003
3004 while (transfer_size) {
3005
3006 if ((seg_size = cl_size -
3007 ((upl->offset + upl_offset) % cl_size))
3008 > transfer_size)
3009 seg_size = transfer_size;
3010
3011 ps_vs_write_complete(vs,
3012 upl->offset + upl_offset,
3013 seg_size, error);
3014
3015 transfer_size -= seg_size;
3016 upl_offset += seg_size;
3017 }
3018 upl_offset = first_dirty * vm_page_size;
3019 transfer_size = num_dirty * vm_page_size;
3020
3021 seg_index = (base_index + first_dirty) / pages_in_cl;
3022 seg_offset = (upl->offset + upl_offset) % cl_size;
3023
3024 error = ps_write_file(psp[seg_index],
3025 upl, upl_offset,
3026 ps_offset[seg_index]
3027 + seg_offset,
3028 transfer_size, flags);
3029 } else {
3030 boolean_t empty = FALSE;
3031 upl_abort_range(upl,
3032 first_dirty * vm_page_size,
3033 num_dirty * vm_page_size,
3034 UPL_ABORT_NOTIFY_EMPTY,
3035 &empty);
3036 if (empty) {
3037 assert(page_index == num_of_pages);
3038 upl_deallocate(upl);
3039 }
3040 }
3041 }
3042
3043 } else {
3044 assert(cnt <= (vm_page_size << vs->vs_clshift));
3045 list_size = cnt;
3046
3047 page_index = 0;
3048 /* The caller provides a mapped_data which is derived */
3049 /* from a temporary object. The targeted pages are */
3050 /* guaranteed to be set at offset 0 in the mapped_data */
3051 /* The actual offset however must still be derived */
3052 /* from the offset in the vs in question */
3053 mobj_base_addr = offset;
3054 mobj_target_addr = mobj_base_addr;
3055
3056 for (transfer_size = list_size; transfer_size != 0;) {
3057 actual_offset = ps_clmap(vs, mobj_target_addr,
3058 &clmap, CL_ALLOC,
3059 transfer_size < cl_size ?
3060 transfer_size : cl_size, 0);
3061 if(actual_offset == (vm_offset_t) -1) {
3062 error = 1;
3063 break;
3064 }
3065 cnt = MIN(transfer_size,
3066 CLMAP_NPGS(clmap) * vm_page_size);
3067 ps = CLMAP_PS(clmap);
3068 /* Assume that the caller has given us contiguous */
3069 /* pages */
3070 if(cnt) {
3071 ps_vs_write_complete(vs, mobj_target_addr,
3072 cnt, error);
3073 error = ps_write_file(ps, internal_upl,
3074 0, actual_offset,
3075 cnt, flags);
3076 if (error)
3077 break;
3078 }
3079 if (error)
3080 break;
3081 actual_offset += cnt;
3082 mobj_target_addr += cnt;
3083 transfer_size -= cnt;
3084 cnt = 0;
3085
3086 if (error)
3087 break;
3088 }
3089 }
3090 if(error)
3091 return KERN_FAILURE;
3092 else
3093 return KERN_SUCCESS;
3094 }
3095
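/*
 * Illustrative sketch (not part of the original file) of the physical
 * segment continuity test applied by both pvs_cluster_read() and
 * vs_cluster_write() above: two adjacent clusters may be covered by a
 * single I/O only if they live on the same paging segment and their
 * segment offsets differ by exactly one cluster.
 */
#if 0	/* example only, never compiled */
static boolean_t
example_clusters_contiguous(
	paging_segment_t prev_ps,
	paging_segment_t next_ps,
	vm_offset_t prev_ps_offset,
	vm_offset_t next_ps_offset,
	int cl_size)
{
	return (prev_ps == next_ps) &&
	       (next_ps_offset == prev_ps_offset + cl_size);
}
#endif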
3096 vm_size_t
3097 ps_vstruct_allocated_size(
3098 vstruct_t vs)
3099 {
3100 int num_pages;
3101 struct vs_map *vsmap;
3102 int i, j, k;
3103
3104 num_pages = 0;
3105 if (vs->vs_indirect) {
3106 /* loop on indirect maps */
3107 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3108 vsmap = vs->vs_imap[i];
3109 if (vsmap == NULL)
3110 continue;
3111 /* loop on clusters in this indirect map */
3112 for (j = 0; j < CLMAP_ENTRIES; j++) {
3113 if (VSM_ISCLR(vsmap[j]) ||
3114 VSM_ISERR(vsmap[j]))
3115 continue;
3116 /* loop on pages in this cluster */
3117 for (k = 0; k < VSCLSIZE(vs); k++) {
3118 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3119 num_pages++;
3120 }
3121 }
3122 }
3123 } else {
3124 vsmap = vs->vs_dmap;
3125 if (vsmap == NULL)
3126 return 0;
3127 /* loop on clusters in the direct map */
3128 for (j = 0; j < CLMAP_ENTRIES; j++) {
3129 if (VSM_ISCLR(vsmap[j]) ||
3130 VSM_ISERR(vsmap[j]))
3131 continue;
3132 /* loop on pages in this cluster */
3133 for (k = 0; k < VSCLSIZE(vs); k++) {
3134 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3135 num_pages++;
3136 }
3137 }
3138 }
3139
3140 return ptoa_32(num_pages);
3141 }
3142
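/*
 * Worked example (illustrative only): with VSCLSIZE(vs) == 4, a cluster
 * whose VSM_BMAP() value is 0xB (binary 1011) contributes three pages to
 * num_pages above; the final total is converted to bytes by ptoa_32().
 */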
3143 size_t
3144 ps_vstruct_allocated_pages(
3145 vstruct_t vs,
3146 default_pager_page_t *pages,
3147 size_t pages_size)
3148 {
3149 int num_pages;
3150 struct vs_map *vsmap;
3151 vm_offset_t offset;
3152 int i, j, k;
3153
3154 num_pages = 0;
3155 offset = 0;
3156 if (vs->vs_indirect) {
3157 /* loop on indirect maps */
3158 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3159 vsmap = vs->vs_imap[i];
3160 if (vsmap == NULL) {
3161 offset += (vm_page_size * CLMAP_ENTRIES *
3162 VSCLSIZE(vs));
3163 continue;
3164 }
3165 /* loop on clusters in this indirect map */
3166 for (j = 0; j < CLMAP_ENTRIES; j++) {
3167 if (VSM_ISCLR(vsmap[j]) ||
3168 VSM_ISERR(vsmap[j])) {
3169 offset += vm_page_size * VSCLSIZE(vs);
3170 continue;
3171 }
3172 /* loop on pages in this cluster */
3173 for (k = 0; k < VSCLSIZE(vs); k++) {
3174 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3175 num_pages++;
3176 if (num_pages < pages_size)
3177 pages++->dpp_offset =
3178 offset;
3179 }
3180 offset += vm_page_size;
3181 }
3182 }
3183 }
3184 } else {
3185 vsmap = vs->vs_dmap;
3186 if (vsmap == NULL)
3187 return 0;
3188 /* loop on clusters in the direct map */
3189 for (j = 0; j < CLMAP_ENTRIES; j++) {
3190 if (VSM_ISCLR(vsmap[j]) ||
3191 VSM_ISERR(vsmap[j])) {
3192 offset += vm_page_size * VSCLSIZE(vs);
3193 continue;
3194 }
3195 /* loop on pages in this cluster */
3196 for (k = 0; k < VSCLSIZE(vs); k++) {
3197 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3198 num_pages++;
3199 if (num_pages < pages_size)
3200 pages++->dpp_offset = offset;
3201 }
3202 offset += vm_page_size;
3203 }
3204 }
3205 }
3206
3207 return num_pages;
3208 }
3209
3210
3211 kern_return_t
3212 ps_vstruct_transfer_from_segment(
3213 vstruct_t vs,
3214 paging_segment_t segment,
3215 upl_t upl)
3216 {
3217 struct vs_map *vsmap;
3218 struct vs_map old_vsmap;
3219 struct vs_map new_vsmap;
3220 int i, j, k;
3221
3222 VS_LOCK(vs); /* block all work on this vstruct */
3223 /* can't allow the normal multiple write */
3224 /* semantic because writes may conflict */
3225 vs->vs_xfer_pending = TRUE;
3226 vs_wait_for_sync_writers(vs);
3227 vs_start_write(vs);
3228 vs_wait_for_readers(vs);
3229 /* we will unlock the vs to allow other writes while transferring */
3230 /* and will be guaranteed the persistence of the vs struct */
3231 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3232 /* vs_async_pending */
3233 /* OK we now have guaranteed no other parties are accessing this */
3234 /* vs. Now that we are also supporting simple lock versions of */
3235 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3236 /* our purpose in holding it before was the multiple write case */
3237 /* we now use the boolean xfer_pending to do that. We can use */
3238 /* a boolean instead of a count because we have guaranteed single */
3239 /* file access to this code in its caller */
3240 VS_UNLOCK(vs);
3241 vs_changed:
3242 if (vs->vs_indirect) {
3243 int vsmap_size;
3244 int clmap_off;
3245 /* loop on indirect maps */
3246 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3247 vsmap = vs->vs_imap[i];
3248 if (vsmap == NULL)
3249 continue;
3250 /* loop on clusters in this indirect map */
3251 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3252 VSCLSIZE(vs) * i);
3253 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3254 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3255 else
3256 vsmap_size = CLMAP_ENTRIES;
3257 for (j = 0; j < vsmap_size; j++) {
3258 if (VSM_ISCLR(vsmap[j]) ||
3259 VSM_ISERR(vsmap[j]) ||
3260 (VSM_PS(vsmap[j]) != segment))
3261 continue;
3262 if(vs_cluster_transfer(vs,
3263 (vm_page_size * (j << vs->vs_clshift))
3264 + clmap_off,
3265 vm_page_size << vs->vs_clshift,
3266 upl)
3267 != KERN_SUCCESS) {
3268 VS_LOCK(vs);
3269 vs->vs_xfer_pending = FALSE;
3270 VS_UNLOCK(vs);
3271 vs_finish_write(vs);
3272 return KERN_FAILURE;
3273 }
3274 /* allow other readers/writers during transfer*/
3275 VS_LOCK(vs);
3276 vs->vs_xfer_pending = FALSE;
3277 VS_UNLOCK(vs);
3278 vs_finish_write(vs);
3279 VS_LOCK(vs);
3280 vs->vs_xfer_pending = TRUE;
3281 vs_wait_for_sync_writers(vs);
3282 vs_start_write(vs);
3283 vs_wait_for_readers(vs);
3284 VS_UNLOCK(vs);
3285 if (!(vs->vs_indirect)) {
3286 goto vs_changed;
3287 }
3288 }
3289 }
3290 } else {
3291 vsmap = vs->vs_dmap;
3292 if (vsmap == NULL) {
3293 VS_LOCK(vs);
3294 vs->vs_xfer_pending = FALSE;
3295 VS_UNLOCK(vs);
3296 vs_finish_write(vs);
3297 return KERN_SUCCESS;
3298 }
3299 /* loop on clusters in the direct map */
3300 for (j = 0; j < vs->vs_size; j++) {
3301 if (VSM_ISCLR(vsmap[j]) ||
3302 VSM_ISERR(vsmap[j]) ||
3303 (VSM_PS(vsmap[j]) != segment))
3304 continue;
3305 if(vs_cluster_transfer(vs,
3306 vm_page_size * (j << vs->vs_clshift),
3307 vm_page_size << vs->vs_clshift,
3308 upl) != KERN_SUCCESS) {
3309 VS_LOCK(vs);
3310 vs->vs_xfer_pending = FALSE;
3311 VS_UNLOCK(vs);
3312 vs_finish_write(vs);
3313 return KERN_FAILURE;
3314 }
3315 /* allow other readers/writers during transfer*/
3316 VS_LOCK(vs);
3317 vs->vs_xfer_pending = FALSE;
3318 VS_UNLOCK(vs);
3319 vs_finish_write(vs);
3320 VS_LOCK(vs);
3321 vs->vs_xfer_pending = TRUE;
3322 VS_UNLOCK(vs);
3323 vs_wait_for_sync_writers(vs);
3324 vs_start_write(vs);
3325 vs_wait_for_readers(vs);
3326 if (vs->vs_indirect) {
3327 goto vs_changed;
3328 }
3329 }
3330 }
3331
3332 VS_LOCK(vs);
3333 vs->vs_xfer_pending = FALSE;
3334 VS_UNLOCK(vs);
3335 vs_finish_write(vs);
3336 return KERN_SUCCESS;
3337 }
3338
3339
3340
3341 vs_map_t
3342 vs_get_map_entry(
3343 vstruct_t vs,
3344 vm_offset_t offset)
3345 {
3346 struct vs_map *vsmap;
3347 vm_offset_t cluster;
3348
3349 cluster = atop_32(offset) >> vs->vs_clshift;
3350 if (vs->vs_indirect) {
3351 long ind_block = cluster/CLMAP_ENTRIES;
3352
3353 /* Is the indirect block allocated? */
3354 vsmap = vs->vs_imap[ind_block];
3355 if(vsmap == (vs_map_t) NULL)
3356 return vsmap;
3357 } else
3358 vsmap = vs->vs_dmap;
3359 vsmap += cluster%CLMAP_ENTRIES;
3360 return vsmap;
3361 }
3362
3363 kern_return_t
3364 vs_cluster_transfer(
3365 vstruct_t vs,
3366 vm_offset_t offset,
3367 vm_size_t cnt,
3368 upl_t upl)
3369 {
3370 vm_offset_t actual_offset;
3371 paging_segment_t ps;
3372 struct clmap clmap;
3373 kern_return_t error = KERN_SUCCESS;
3374 int size, size_wanted, i;
3375 unsigned int residual;
3376 int unavail_size;
3377 default_pager_thread_t *dpt;
3378 boolean_t dealloc;
3379 struct vs_map *vsmap_ptr;
3380 struct vs_map read_vsmap;
3381 struct vs_map original_read_vsmap;
3382 struct vs_map write_vsmap;
3383 upl_t sync_upl;
3384 vm_offset_t ioaddr;
3385
3386 /* vs_cluster_transfer reads in the pages of a cluster and
3387 * then writes these pages back to new backing store. The
3388 * segment the pages are being read from is assumed to have
3389 * been taken off-line and is no longer considered for new
3390 * space requests.
3391 */
3392
3393 /*
3394 * This loop will be executed once per cluster referenced.
3395 * Typically this means once, since it's unlikely that the
3396 * VM system will ask for anything spanning cluster boundaries.
3397 *
3398 * If there are holes in a cluster (in a paging segment), we stop
3399 * reading at the hole, then loop again, hoping to
3400 * find valid pages later in the cluster. This continues until
3401 * the entire range has been examined, and read, if present. The
3402 * pages are written as they are read. If a failure occurs after
3403 * some pages are written the unmap call at the bottom of the loop
3404 * recovers the backing store and the old backing store remains
3405 * in effect.
3406 */
3407
3408 VSM_CLR(write_vsmap);
3409 VSM_CLR(original_read_vsmap);
3410 /* grab the actual object's pages to sync with I/O */
3411 while (cnt && (error == KERN_SUCCESS)) {
3412 vsmap_ptr = vs_get_map_entry(vs, offset);
3413 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3414
3415 if (actual_offset == (vm_offset_t) -1) {
3416
3417 /*
3418 * Nothing left to write in this cluster at least
3419 * set write cluster information for any previous
3420 * write, clear for next cluster, if there is one
3421 */
3422 unsigned int local_size, clmask, clsize;
3423
3424 clsize = vm_page_size << vs->vs_clshift;
3425 clmask = clsize - 1;
3426 local_size = clsize - (offset & clmask);
3427 ASSERT(local_size);
3428 local_size = MIN(local_size, cnt);
3429
3430 /* This cluster has no data in it beyond what may */
3431 /* have been found on a previous iteration through */
3432 /* the loop "write_vsmap" */
3433 *vsmap_ptr = write_vsmap;
3434 VSM_CLR(write_vsmap);
3435 VSM_CLR(original_read_vsmap);
3436
3437 cnt -= local_size;
3438 offset += local_size;
3439 continue;
3440 }
3441
3442 /*
3443 * Count up contiguous available or unavailable
3444 * pages.
3445 */
3446 ps = CLMAP_PS(clmap);
3447 ASSERT(ps);
3448 size = 0;
3449 unavail_size = 0;
3450 for (i = 0;
3451 (size < cnt) && (unavail_size < cnt) &&
3452 (i < CLMAP_NPGS(clmap)); i++) {
3453 if (CLMAP_ISSET(clmap, i)) {
3454 if (unavail_size != 0)
3455 break;
3456 size += vm_page_size;
3457 BS_STAT(ps->ps_bs,
3458 ps->ps_bs->bs_pages_in++);
3459 } else {
3460 if (size != 0)
3461 break;
3462 unavail_size += vm_page_size;
3463 }
3464 }
3465
3466 if (size == 0) {
3467 ASSERT(unavail_size);
3468 cnt -= unavail_size;
3469 offset += unavail_size;
3470 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3471 == 0) {
3472 /* There is no more to transfer in this
3473 cluster
3474 */
3475 *vsmap_ptr = write_vsmap;
3476 VSM_CLR(write_vsmap);
3477 VSM_CLR(original_read_vsmap);
3478 }
3479 continue;
3480 }
3481
3482 if(VSM_ISCLR(original_read_vsmap))
3483 original_read_vsmap = *vsmap_ptr;
3484
3485 if(ps->ps_segtype == PS_PARTITION) {
3486 /*
3487 NEED TO ISSUE WITH SYNC & NO COMMIT
3488 error = ps_read_device(ps, actual_offset, &buffer,
3489 size, &residual, flags);
3490 */
3491 } else {
3492 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3493 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
3494 size, &residual,
3495 (UPL_IOSYNC | UPL_NOCOMMIT));
3496 }
3497
3498 read_vsmap = *vsmap_ptr;
3499
3500
3501 /*
3502 * Adjust counts and put data in new BS. Optimize for the
3503 * common case, i.e. no error and/or partial data.
3504 * If there was an error, then we need to error the entire
3505 * range, even if some data was successfully read.
3506 *
3507 */
3508 if ((error == KERN_SUCCESS) && (residual == 0)) {
3509 int page_list_count = 0;
3510
3511 /*
3512 * Got everything we asked for, supply the data to
3513 * the new BS. Note that as a side effect of supplying
3514 * the data, the buffer holding the supplied data is
3515 * deallocated from the pager's address space unless
3516 * the write is unsuccessful.
3517 */
3518
3519 /* note buffer will be cleaned up in all cases by */
3520 /* internal_cluster_write or if an error on write */
3521 /* the vm_map_copy_page_discard call */
3522 *vsmap_ptr = write_vsmap;
3523
3524 if(vs_cluster_write(vs, upl, offset,
3525 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3526 error = KERN_FAILURE;
3527 if(!(VSM_ISCLR(*vsmap_ptr))) {
3528 /* unmap the new backing store object */
3529 ps_clunmap(vs, offset, size);
3530 }
3531 /* original vsmap */
3532 *vsmap_ptr = original_read_vsmap;
3533 VSM_CLR(write_vsmap);
3534 } else {
3535 if((offset + size) &
3536 ((vm_page_size << vs->vs_clshift)
3537 - 1)) {
3538 /* There is more to transfer in this
3539 cluster
3540 */
3541 write_vsmap = *vsmap_ptr;
3542 *vsmap_ptr = read_vsmap;
3543 } else {
3544 /* discard the old backing object */
3545 write_vsmap = *vsmap_ptr;
3546 *vsmap_ptr = read_vsmap;
3547 ps_clunmap(vs, offset, size);
3548 *vsmap_ptr = write_vsmap;
3549 VSM_CLR(write_vsmap);
3550 VSM_CLR(original_read_vsmap);
3551 }
3552 }
3553 } else {
3554 size_wanted = size;
3555 if (error == KERN_SUCCESS) {
3556 if (residual == size) {
3557 /*
3558 * If a read operation returns no error
3559 * and no data moved, we turn it into
3560 * an error, assuming we're reading at
3561 * or beyond EOF.
3562 * Fall through and error the entire
3563 * range.
3564 */
3565 error = KERN_FAILURE;
3566 *vsmap_ptr = write_vsmap;
3567 if(!(VSM_ISCLR(*vsmap_ptr))) {
3568 /* unmap the new backing store object */
3569 ps_clunmap(vs, offset, size);
3570 }
3571 *vsmap_ptr = original_read_vsmap;
3572 VSM_CLR(write_vsmap);
3573 continue;
3574 } else {
3575 /*
3576 * Otherwise, we have partial read.
3577 * This is also considered an error
3578 * for the purposes of cluster transfer
3579 */
3580 error = KERN_FAILURE;
3581 *vsmap_ptr = write_vsmap;
3582 if(!(VSM_ISCLR(*vsmap_ptr))) {
3583 /* unmap the new backing store object */
3584 ps_clunmap(vs, offset, size);
3585 }
3586 *vsmap_ptr = original_read_vsmap;
3587 VSM_CLR(write_vsmap);
3588 continue;
3589 }
3590 }
3591
3592 }
3593 cnt -= size;
3594 offset += size;
3595
3596 } /* END while (cnt && (error == 0)) */
3597 if(!VSM_ISCLR(write_vsmap))
3598 *vsmap_ptr = write_vsmap;
3599
3600 return error;
3601 }
3602
3603 kern_return_t
3604 default_pager_add_file(MACH_PORT_FACE backing_store,
3605 int *vp,
3606 int record_size,
3607 long size)
3608 {
3609 backing_store_t bs;
3610 paging_segment_t ps;
3611 int i;
3612 int error;
3613
3614 if ((bs = backing_store_lookup(backing_store))
3615 == BACKING_STORE_NULL)
3616 return KERN_INVALID_ARGUMENT;
3617
3618 PSL_LOCK();
3619 for (i = 0; i <= paging_segment_max; i++) {
3620 ps = paging_segments[i];
3621 if (ps == PAGING_SEGMENT_NULL)
3622 continue;
3623 if (ps->ps_segtype != PS_FILE)
3624 continue;
3625
3626 /*
3627 * Check for overlap on same device.
3628 */
3629 if (ps->ps_vnode == (struct vnode *)vp) {
3630 PSL_UNLOCK();
3631 BS_UNLOCK(bs);
3632 return KERN_INVALID_ARGUMENT;
3633 }
3634 }
3635 PSL_UNLOCK();
3636
3637 /*
3638 * Set up the paging segment
3639 */
3640 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3641 if (ps == PAGING_SEGMENT_NULL) {
3642 BS_UNLOCK(bs);
3643 return KERN_RESOURCE_SHORTAGE;
3644 }
3645
3646 ps->ps_segtype = PS_FILE;
3647 ps->ps_vnode = (struct vnode *)vp;
3648 ps->ps_offset = 0;
3649 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3650 ps->ps_recnum = size;
3651 ps->ps_pgnum = size >> ps->ps_record_shift;
3652
3653 ps->ps_pgcount = ps->ps_pgnum;
3654 ps->ps_clshift = local_log2(bs->bs_clsize);
3655 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3656 ps->ps_hint = 0;
3657
3658 PS_LOCK_INIT(ps);
3659 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3660 if (!ps->ps_bmap) {
3661 kfree((vm_offset_t)ps, sizeof *ps);
3662 BS_UNLOCK(bs);
3663 return KERN_RESOURCE_SHORTAGE;
3664 }
3665 for (i = 0; i < ps->ps_ncls; i++) {
3666 clrbit(ps->ps_bmap, i);
3667 }
3668
3669 ps->ps_going_away = FALSE;
3670 ps->ps_bs = bs;
3671
3672 if ((error = ps_enter(ps)) != 0) {
3673 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3674 kfree((vm_offset_t)ps, sizeof *ps);
3675 BS_UNLOCK(bs);
3676 return KERN_RESOURCE_SHORTAGE;
3677 }
3678
3679 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3680 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3681 PSL_LOCK();
3682 dp_pages_free += ps->ps_pgcount;
3683 PSL_UNLOCK();
3684
3685 BS_UNLOCK(bs);
3686
3687 bs_more_space(ps->ps_clcount);
3688
3689 DEBUG(DEBUG_BS_INTERNAL,
3690 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3691 device, offset, size, record_size,
3692 ps->ps_record_shift, ps->ps_pgnum));
3693
3694 return KERN_SUCCESS;
3695 }
3696
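/*
 * Worked example of the sizing arithmetic above (illustrative only,
 * assuming 4K pages, 512-byte records and a backing store cluster size
 * of 4 pages): record_size = 512 gives ps_record_shift = log2(4096/512)
 * = 3; a file of size = 0x40000 records then yields ps_pgnum =
 * 0x40000 >> 3 = 0x8000 pages, ps_clcount = 0x8000 >> 2 = 0x2000
 * clusters, and bs_pages_free and dp_pages_free each grow by 0x8000 pages.
 */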
3697
3698
3699 kern_return_t
3700 ps_read_file(
3701 paging_segment_t ps,
3702 upl_t upl,
3703 vm_offset_t upl_offset,
3704 vm_offset_t offset,
3705 unsigned int size,
3706 unsigned int *residualp,
3707 int flags)
3708 {
3709 vm_object_offset_t f_offset;
3710 int error = 0;
3711 int result;
3712
3713
3714 clustered_reads[atop_32(size)]++;
3715
3716 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3717
3718 /* for transfer case we need to pass uploffset and flags */
3719 error = vnode_pagein(ps->ps_vnode,
3720 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3721
3722 /* The vnode_pagein semantic is somewhat at odds with the existing */
3723 /* device_read semantic. Partial reads are not experienced at this */
3724 /* level. It is up to the bit map code and cluster read code to */
3725 /* check that requested data locations are actually backed, and the */
3726 /* pagein code to either read all of the requested data or return an */
3727 /* error. */
3728
3729 if (error)
3730 result = KERN_FAILURE;
3731 else {
3732 *residualp = 0;
3733 result = KERN_SUCCESS;
3734 }
3735 return result;
3736 }
3737
3738 kern_return_t
3739 ps_write_file(
3740 paging_segment_t ps,
3741 upl_t upl,
3742 vm_offset_t upl_offset,
3743 vm_offset_t offset,
3744 unsigned int size,
3745 int flags)
3746 {
3747 vm_object_offset_t f_offset;
3748 kern_return_t result;
3749
3750 int error = 0;
3751
3752 clustered_writes[atop_32(size)]++;
3753 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3754
3755 if (vnode_pageout(ps->ps_vnode,
3756 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3757 result = KERN_FAILURE;
3758 else
3759 result = KERN_SUCCESS;
3760
3761 return result;
3762 }
3763
3764 kern_return_t
3765 default_pager_triggers(MACH_PORT_FACE default_pager,
3766 int hi_wat,
3767 int lo_wat,
3768 int flags,
3769 MACH_PORT_FACE trigger_port)
3770 {
3771 MACH_PORT_FACE release;
3772 kern_return_t kr;
3773
3774 PSL_LOCK();
3775 if (flags == HI_WAT_ALERT) {
3776 release = min_pages_trigger_port;
3777 min_pages_trigger_port = trigger_port;
3778 minimum_pages_remaining = hi_wat/vm_page_size;
3779 bs_low = FALSE;
3780 kr = KERN_SUCCESS;
3781 } else if (flags == LO_WAT_ALERT) {
3782 release = max_pages_trigger_port;
3783 max_pages_trigger_port = trigger_port;
3784 maximum_pages_free = lo_wat/vm_page_size;
3785 kr = KERN_SUCCESS;
3786 } else {
3787 release = trigger_port;
3788 kr = KERN_INVALID_ARGUMENT;
3789 }
3790 PSL_UNLOCK();
3791
3792 if (IP_VALID(release))
3793 ipc_port_release_send(release);
3794
3795 return kr;
3796 }
3797
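/*
 * Worked example (illustrative only, assuming 4K pages): a HI_WAT_ALERT
 * with hi_wat = 64MB sets minimum_pages_remaining to 64MB/4096 = 16384
 * pages, while a LO_WAT_ALERT with lo_wat = 128MB sets maximum_pages_free
 * to 32768 pages -- the threshold consulted by
 * default_pager_backing_store_monitor() below.
 */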
3798 /*
3799 * Monitor the amount of available backing store vs. the amount of
3800 * required backing store, notify a listener (if present) when
3801 * backing store may safely be removed.
3802 *
3803 * We attempt to avoid the situation where backing store is
3804 * discarded en masse, as this can lead to thrashing as the
3805 * backing store is compacted.
3806 */
3807
3808 #define PF_INTERVAL 3 /* time between free level checks */
3809 #define PF_LATENCY 10 /* number of intervals before release */
3810
3811 static int dp_pages_free_low_count = 0;
3812
3813 void
3814 default_pager_backing_store_monitor(thread_call_param_t p1, thread_call_param_t p2)
3815 {
3816 unsigned long long average;
3817 ipc_port_t trigger;
3818 uint64_t deadline;
3819
3820 /*
3821 * We determine whether it will be safe to release some
3822 * backing store by watching the free page level. If
3823 * it remains below the maximum_pages_free threshold for
3824 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3825 * then we deem it safe.
3826 *
3827 * Note that this establishes a maximum rate at which backing
3828 * store will be released, as each notification (currently)
3829 * only results in a single backing store object being
3830 * released.
3831 */
3832 if (dp_pages_free > maximum_pages_free) {
3833 dp_pages_free_low_count++;
3834 } else {
3835 dp_pages_free_low_count = 0;
3836 }
3837
3838 /* decide whether to send notification */
3839 trigger = IP_NULL;
3840 if (max_pages_trigger_port &&
3841 (backing_store_release_trigger_disable == 0) &&
3842 (dp_pages_free_low_count > PF_LATENCY)) {
3843 trigger = max_pages_trigger_port;
3844 max_pages_trigger_port = NULL;
3845 }
3846
3847 /* send notification */
3848 if (trigger != IP_NULL) {
3849 VSL_LOCK();
3850 if(backing_store_release_trigger_disable != 0) {
3851 assert_wait((event_t)
3852 &backing_store_release_trigger_disable,
3853 THREAD_UNINT);
3854 VSL_UNLOCK();
3855 thread_block(THREAD_CONTINUE_NULL);
3856 } else {
3857 VSL_UNLOCK();
3858 }
3859 default_pager_space_alert(trigger, LO_WAT_ALERT);
3860 ipc_port_release_send(trigger);
3861 dp_pages_free_low_count = 0;
3862 }
3863
3864 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
3865 thread_call_func_delayed(default_pager_backing_store_monitor, NULL, deadline);
3866 }