]> git.saurik.com Git - apple/xnu.git/blob - osfmk/default_pager/dp_backing_store.c
xnu-201.tar.gz
[apple/xnu.git] / osfmk / default_pager / dp_backing_store.c
1
2 /*
3 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 *
5 * @APPLE_LICENSE_HEADER_START@
6 *
7 * The contents of this file constitute Original Code as defined in and
8 * are subject to the Apple Public Source License Version 1.1 (the
9 * "License"). You may not use this file except in compliance with the
10 * License. Please obtain a copy of the License at
11 * http://www.apple.com/publicsource and read it before using this file.
12 *
13 * This Original Code and all software distributed under the License are
14 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
18 * License for the specific language governing rights and limitations
19 * under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 * @OSF_COPYRIGHT@
25 */
26 /*
27 * Mach Operating System
28 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
29 * All Rights Reserved.
30 *
31 * Permission to use, copy, modify and distribute this software and its
32 * documentation is hereby granted, provided that both the copyright
33 * notice and this permission notice appear in all copies of the
34 * software, derivative works or modified versions, and any portions
35 * thereof, and that both notices appear in supporting documentation.
36 *
37 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
38 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
39 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
40 *
41 * Carnegie Mellon requests users of this software to return to
42 *
43 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
44 * School of Computer Science
45 * Carnegie Mellon University
46 * Pittsburgh PA 15213-3890
47 *
48 * any improvements or extensions that they make and grant Carnegie Mellon
49 * the rights to redistribute these changes.
50 */
51
52 /*
53 * Default Pager.
54 * Paging File Management.
55 */
56
57 #include <mach/memory_object_control.h>
58 #include <mach/memory_object_server.h>
59 #include "default_pager_internal.h"
60 #include <default_pager/default_pager_alerts.h>
61 #include <ipc/ipc_port.h>
62 #include <ipc/ipc_space.h>
63 #include <kern/queue.h>
64 #include <kern/counters.h>
65 #include <kern/sched_prim.h>
66 #include <vm/vm_kern.h>
67 #include <vm/vm_pageout.h>
68 /* CDY CDY */
69 #include <vm/vm_map.h>
70
71 /*
72 * ALLOC_STRIDE... the maximum number of bytes allocated from
73 * a swap file before moving on to the next swap file... if
74 * all swap files reside on a single disk, this value should
75 * be very large (this is the default assumption)... if the
76 * swap files are spread across multiple disks, then this value
77 * should be small (128 * 1024)...
78 *
79 * This should be determined dynamically in the future
80 */
81
82 #define ALLOC_STRIDE (1024 * 1024 * 1024)
83 int physical_transfer_cluster_count = 0;
84
85 #define VM_SUPER_CLUSTER 0x20000
86 #define VM_SUPER_PAGES 32
87
88 /*
89 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
90 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
91 */
92 #define VSTRUCT_DEF_CLSHIFT 2
93 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
94 int default_pager_clsize = 0;
95
96 /* statistics */
97 unsigned int clustered_writes[VM_SUPER_PAGES+1];
98 unsigned int clustered_reads[VM_SUPER_PAGES+1];
99
100 /*
101 * Globals used for asynchronous paging operations:
102 * vs_async_list: head of list of to-be-completed I/O ops
103 * async_num_queued: number of pages completed, but not yet
104 * processed by async thread.
105 * async_requests_out: number of pages of requests not completed.
106 */
107
108 #if 0
109 struct vs_async *vs_async_list;
110 int async_num_queued;
111 int async_requests_out;
112 #endif
113
114
115 #define VS_ASYNC_REUSE 1
116 struct vs_async *vs_async_free_list;
117
118 mutex_t default_pager_async_lock; /* Protects globals above */
119
120
121 int vs_alloc_async_failed = 0; /* statistics */
122 int vs_alloc_async_count = 0; /* statistics */
123 struct vs_async *vs_alloc_async(void); /* forward */
124 void vs_free_async(struct vs_async *vsa); /* forward */
125
126
127 #define VS_ALLOC_ASYNC() vs_alloc_async()
128 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
129
130 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
131 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
132 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \
133 ETAP_IO_DEV_PAGEH)
134 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
135 /*
136 * Paging Space Hysteresis triggers and the target notification port
137 *
138 */
139
140 unsigned int minimum_pages_remaining = 0;
141 unsigned int maximum_pages_free = 0;
142 ipc_port_t min_pages_trigger_port = NULL;
143 ipc_port_t max_pages_trigger_port = NULL;
144
145 boolean_t bs_low = FALSE;
146 int backing_store_release_trigger_disable = 0;
147
148
149
150 /*
151 * Object sizes are rounded up to the next power of 2,
152 * unless they are bigger than a given maximum size.
153 */
154 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
155
156 /*
157 * List of all backing store and segments.
158 */
159 struct backing_store_list_head backing_store_list; /* bsl_queue holds every backing store entered by default_pager_backing_store_create() */
160 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS]; /* slot table; an unused slot holds PAGING_SEGMENT_NULL */
161 mutex_t paging_segments_lock; /* NOTE(review): presumably the mutex behind PSL_LOCK()/PSL_UNLOCK() - confirm in the header */
162 int paging_segment_max = 0; /* highest slot index handed out by ps_enter(); scans run with i <= paging_segment_max */
163 int paging_segment_count = 0; /* number of occupied slots: bumped by ps_enter(), dropped when a segment is freed */
164 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 }; /* indexed by backing store priority; ps_enter() resets an entry to 0 when it is BS_NOPRI or BS_FULLPRI */
165
166
167 /*
168 * Total pages free in system
169 * This differs from clusters committed/avail which is a measure of the
170 * over commitment of paging segments to backing store. An idea which is
171 * likely to be deprecated.
172 */
173 unsigned int dp_pages_free = 0; /* grows by ps_pgcount when a segment is added, shrinks when a segment is taken off-line for deletion */
174 unsigned int cluster_transfer_minimum = 100; /* dp_pages_free must be at least this before a segment transfer is attempted */
175
176 kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */
177 kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
178
179
180 default_pager_thread_t *
181 get_read_buffer()
182 {
183 int i;
184
185 DPT_LOCK(dpt_lock);
186 while(TRUE) {
187 for (i=0; i<default_pager_internal_count; i++) {
188 if(dpt_array[i]->checked_out == FALSE) {
189 dpt_array[i]->checked_out = TRUE;
190 DPT_UNLOCK(dpt_lock);
191 return dpt_array[i];
192 }
193 }
194 assert_wait(&dpt_array, THREAD_UNINT);
195 DPT_UNLOCK(dpt_lock);
196 thread_block((void(*)(void))0);
197 }
198 }
199
200 void
201 bs_initialize(void)
202 {
203 int i;
204
205 /*
206 * List of all backing store.
207 */
208 BSL_LOCK_INIT();
209 queue_init(&backing_store_list.bsl_queue);
210 PSL_LOCK_INIT();
211
212 VS_ASYNC_LOCK_INIT();
213 #if VS_ASYNC_REUSE
214 vs_async_free_list = NULL;
215 #endif /* VS_ASYNC_REUSE */
216
217 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
218 clustered_writes[i] = 0;
219 clustered_reads[i] = 0;
220 }
221
222 }
223
224 /*
225 * When things do not quite work out...
226 */
227 void bs_no_paging_space(boolean_t); /* forward */
228
229 void
230 bs_no_paging_space(
231 boolean_t out_of_memory)
232 {
233
234 if (out_of_memory)
235 dprintf(("*** OUT OF MEMORY ***\n"));
236 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
237 }
238
239 void bs_more_space(int); /* forward */
240 void bs_commit(int); /* forward */
241
242 boolean_t user_warned = FALSE; /* TRUE after bs_commit() has printed the over-commit warning; cleared once space is reported OK again */
243 unsigned int clusters_committed = 0; /* running total of clusters passed to bs_commit() */
244 unsigned int clusters_available = 0; /* running total of clusters passed to bs_more_space() */
245 unsigned int clusters_committed_peak = 0; /* largest clusters_committed seen while over-committed; reset to 0 with user_warned */
246
247 void
248 bs_more_space(
249 int nclusters)
250 {
251 BSL_LOCK();
252 /*
253 * Account for new paging space.
254 */
255 clusters_available += nclusters;
256
257 if (clusters_available >= clusters_committed) {
258 if (verbose && user_warned) {
259 printf("%s%s - %d excess clusters now.\n",
260 my_name,
261 "paging space is OK now",
262 clusters_available - clusters_committed);
263 user_warned = FALSE;
264 clusters_committed_peak = 0;
265 }
266 } else {
267 if (verbose && user_warned) {
268 printf("%s%s - still short of %d clusters.\n",
269 my_name,
270 "WARNING: paging space over-committed",
271 clusters_committed - clusters_available);
272 clusters_committed_peak -= nclusters;
273 }
274 }
275 BSL_UNLOCK();
276
277 return;
278 }
279
280 void
281 bs_commit(
282 int nclusters)
283 {
284 BSL_LOCK();
285 clusters_committed += nclusters;
286 if (clusters_committed > clusters_available) {
287 if (verbose && !user_warned) {
288 user_warned = TRUE;
289 printf("%s%s - short of %d clusters.\n",
290 my_name,
291 "WARNING: paging space over-committed",
292 clusters_committed - clusters_available);
293 }
294 if (clusters_committed > clusters_committed_peak) {
295 clusters_committed_peak = clusters_committed;
296 }
297 } else {
298 if (verbose && user_warned) {
299 printf("%s%s - was short of up to %d clusters.\n",
300 my_name,
301 "paging space is OK now",
302 clusters_committed_peak - clusters_available);
303 user_warned = FALSE;
304 clusters_committed_peak = 0;
305 }
306 }
307 BSL_UNLOCK();
308
309 return;
310 }
311
312 int default_pager_info_verbose = 1;
313
314 void
315 bs_global_info(
316 vm_size_t *totalp,
317 vm_size_t *freep)
318 {
319 vm_size_t pages_total, pages_free;
320 paging_segment_t ps;
321 int i;
322
323 PSL_LOCK();
324 pages_total = pages_free = 0;
325 for (i = 0; i <= paging_segment_max; i++) {
326 ps = paging_segments[i];
327 if (ps == PAGING_SEGMENT_NULL)
328 continue;
329
330 /*
331 * no need to lock: by the time this data
332 * gets back to any remote requestor it
333 * will be obsolete anyways
334 */
335 pages_total += ps->ps_pgnum;
336 pages_free += ps->ps_clcount << ps->ps_clshift;
337 DEBUG(DEBUG_BS_INTERNAL,
338 ("segment #%d: %d total, %d free\n",
339 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
340 }
341 *totalp = pages_total;
342 *freep = pages_free;
343 if (verbose && user_warned && default_pager_info_verbose) {
344 if (clusters_available < clusters_committed) {
345 printf("%s %d clusters committed, %d available.\n",
346 my_name,
347 clusters_committed,
348 clusters_available);
349 }
350 }
351 PSL_UNLOCK();
352 }
353
354 backing_store_t backing_store_alloc(void); /* forward */
355
356 backing_store_t
357 backing_store_alloc(void)
358 {
359 backing_store_t bs;
360
361 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
362 if (bs == BACKING_STORE_NULL)
363 panic("backing_store_alloc: no memory");
364
365 BS_LOCK_INIT(bs);
366 bs->bs_port = MACH_PORT_NULL;
367 bs->bs_priority = 0;
368 bs->bs_clsize = 0;
369 bs->bs_pages_total = 0;
370 bs->bs_pages_in = 0;
371 bs->bs_pages_in_fail = 0;
372 bs->bs_pages_out = 0;
373 bs->bs_pages_out_fail = 0;
374
375 return bs;
376 }
377
378 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
379
380 /* Even in both the component space and external versions of this pager, */
381 /* backing_store_lookup will be called from tasks in the application space */
382 backing_store_t
383 backing_store_lookup(
384 MACH_PORT_FACE port)
385 {
386 backing_store_t bs;
387
388 /*
389 port is currently backed with a vs structure in the alias field
390 we could create an ISBS alias and a port_is_bs call but frankly
391 I see no reason for the test, the bs->port == port check below
392 will work properly on junk entries.
393
394 if ((port == MACH_PORT_NULL) || port_is_vs(port))
395 */
396 if ((port == MACH_PORT_NULL))
397 return BACKING_STORE_NULL;
398
399 BSL_LOCK();
400 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
401 bs_links) {
402 BS_LOCK(bs);
403 if (bs->bs_port == port) {
404 BSL_UNLOCK();
405 /* Success, return it locked. */
406 return bs;
407 }
408 BS_UNLOCK(bs);
409 }
410 BSL_UNLOCK();
411 return BACKING_STORE_NULL;
412 }
413
414 void backing_store_add(backing_store_t); /* forward */
415
416 void
417 backing_store_add(
418 backing_store_t bs)
419 {
420 MACH_PORT_FACE port = bs->bs_port;
421 MACH_PORT_FACE pset = default_pager_default_set;
422 kern_return_t kr = KERN_SUCCESS;
423
424 if (kr != KERN_SUCCESS)
425 panic("backing_store_add: add to set");
426
427 }
428
429 /*
430 * Set up default page shift, but only if not already
431 * set and argument is within range.
432 */
433 boolean_t
434 bs_set_default_clsize(unsigned int npages)
435 {
436 switch(npages){
437 case 1:
438 case 2:
439 case 4:
440 case 8:
441 if (default_pager_clsize == 0) /* if not yet set */
442 vstruct_def_clshift = local_log2(npages);
443 return(TRUE);
444 }
445 return(FALSE);
446 }
447
448 int bs_get_global_clsize(int clsize); /* forward */
449
450 int
451 bs_get_global_clsize(
452 int clsize)
453 {
454 int i;
455 memory_object_default_t dmm;
456 kern_return_t kr;
457
458 /*
459 * Only allow setting of cluster size once. If called
460 * with no cluster size (default), we use the compiled-in default
461 * for the duration. The same cluster size is used for all
462 * paging segments.
463 */
464 if (default_pager_clsize == 0) {
465 /*
466 * Keep cluster size in bit shift because it's quicker
467 * arithmetic, and easier to keep at a power of 2.
468 */
469 if (clsize != NO_CLSIZE) {
470 for (i = 0; (1 << i) < clsize; i++);
471 if (i > MAX_CLUSTER_SHIFT)
472 i = MAX_CLUSTER_SHIFT;
473 vstruct_def_clshift = i;
474 }
475 default_pager_clsize = (1 << vstruct_def_clshift);
476
477 /*
478 * Let the user know the new (and definitive) cluster size.
479 */
480 if (verbose)
481 printf("%scluster size = %d page%s\n",
482 my_name, default_pager_clsize,
483 (default_pager_clsize == 1) ? "" : "s");
484
485 /*
486 * Let the kernel know too, in case it hasn't used the
487 * default value provided in main() yet.
488 */
489 dmm = default_pager_object;
490 clsize = default_pager_clsize * vm_page_size; /* in bytes */
491 kr = host_default_memory_manager(host_priv_self(),
492 &dmm,
493 clsize);
494 memory_object_default_deallocate(dmm);
495
496 if (kr != KERN_SUCCESS) {
497 panic("bs_get_global_cl_size:host_default_memory_manager");
498 }
499 if (dmm != default_pager_object) {
500 panic("bs_get_global_cl_size:there is another default pager");
501 }
502 }
503 ASSERT(default_pager_clsize > 0 &&
504 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
505
506 return default_pager_clsize;
507 }
508
509 kern_return_t
510 default_pager_backing_store_create(
511 memory_object_default_t pager,
512 int priority,
513 int clsize, /* in bytes */
514 MACH_PORT_FACE *backing_store)
515 {
516 backing_store_t bs;
517 MACH_PORT_FACE port;
518 kern_return_t kr;
519 struct vstruct_alias *alias_struct;
520
521 if (pager != default_pager_object)
522 return KERN_INVALID_ARGUMENT;
523
524 bs = backing_store_alloc();
525 port = ipc_port_alloc_kernel();
526 ipc_port_make_send(port);
527 assert (port != IP_NULL);
528
529 DEBUG(DEBUG_BS_EXTERNAL,
530 ("priority=%d clsize=%d bs_port=0x%x\n",
531 priority, clsize, (int) backing_store));
532
533 alias_struct = (struct vstruct_alias *)
534 kalloc(sizeof (struct vstruct_alias));
535 if(alias_struct != NULL) {
536 alias_struct->vs = (struct vstruct *)bs;
537 alias_struct->name = ISVS;
538 port->alias = (int) alias_struct;
539 }
540 else {
541 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
542 kfree((vm_offset_t)bs, sizeof (struct backing_store));
543 return KERN_RESOURCE_SHORTAGE;
544 }
545
546 bs->bs_port = port;
547 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
548 priority = BS_MAXPRI;
549 else if (priority == BS_NOPRI)
550 priority = BS_MAXPRI;
551 else
552 priority = BS_MINPRI;
553 bs->bs_priority = priority;
554
555 bs->bs_clsize = bs_get_global_clsize(atop(clsize));
556
557 BSL_LOCK();
558 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
559 bs_links);
560 BSL_UNLOCK();
561
562 backing_store_add(bs);
563
564 *backing_store = port;
565 return KERN_SUCCESS;
566 }
567
568 kern_return_t
569 default_pager_backing_store_info(
570 MACH_PORT_FACE backing_store,
571 backing_store_flavor_t flavour,
572 backing_store_info_t info,
573 mach_msg_type_number_t *size)
574 {
575 backing_store_t bs;
576 backing_store_basic_info_t basic;
577 int i;
578 paging_segment_t ps;
579
580 if (flavour != BACKING_STORE_BASIC_INFO ||
581 *size < BACKING_STORE_BASIC_INFO_COUNT)
582 return KERN_INVALID_ARGUMENT;
583
584 basic = (backing_store_basic_info_t)info;
585 *size = BACKING_STORE_BASIC_INFO_COUNT;
586
587 VSTATS_LOCK(&global_stats.gs_lock);
588 basic->pageout_calls = global_stats.gs_pageout_calls;
589 basic->pagein_calls = global_stats.gs_pagein_calls;
590 basic->pages_in = global_stats.gs_pages_in;
591 basic->pages_out = global_stats.gs_pages_out;
592 basic->pages_unavail = global_stats.gs_pages_unavail;
593 basic->pages_init = global_stats.gs_pages_init;
594 basic->pages_init_writes= global_stats.gs_pages_init_writes;
595 VSTATS_UNLOCK(&global_stats.gs_lock);
596
597 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
598 return KERN_INVALID_ARGUMENT;
599
600 basic->bs_pages_total = bs->bs_pages_total;
601 PSL_LOCK();
602 bs->bs_pages_free = 0;
603 for (i = 0; i <= paging_segment_max; i++) {
604 ps = paging_segments[i];
605 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
606 PS_LOCK(ps);
607 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
608 PS_UNLOCK(ps);
609 }
610 }
611 PSL_UNLOCK();
612 basic->bs_pages_free = bs->bs_pages_free;
613 basic->bs_pages_in = bs->bs_pages_in;
614 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
615 basic->bs_pages_out = bs->bs_pages_out;
616 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
617
618 basic->bs_priority = bs->bs_priority;
619 basic->bs_clsize = ptoa(bs->bs_clsize); /* in bytes */
620
621 BS_UNLOCK(bs);
622
623 return KERN_SUCCESS;
624 }
625
626 int ps_delete(paging_segment_t); /* forward */
627
628 int
629 ps_delete(
630 paging_segment_t ps)
631 {
632 vstruct_t vs;
633 kern_return_t error = KERN_SUCCESS;
634 int vs_count;
635
636 VSL_LOCK(); /* get the lock on the list of vs's */
637
638 /* The lock relationship and sequence is farily complicated */
639 /* this code looks at a live list, locking and unlocking the list */
640 /* as it traverses it. It depends on the locking behavior of */
641 /* default_pager_no_senders. no_senders always locks the vstruct */
642 /* targeted for removal before locking the vstruct list. However */
643 /* it will remove that member of the list without locking its */
644 /* neighbors. We can be sure when we hold a lock on a vstruct */
645 /* it cannot be removed from the list but we must hold the list */
646 /* lock to be sure that its pointers to its neighbors are valid. */
647 /* Also, we can hold off destruction of a vstruct when the list */
648 /* lock and the vs locks are not being held by bumping the */
649 /* vs_async_pending count. */
650
651
652 while(backing_store_release_trigger_disable != 0) {
653 assert_wait((event_t)
654 &backing_store_release_trigger_disable,
655 THREAD_UNINT);
656 VSL_UNLOCK();
657 thread_block((void (*)(void)) 0);
658 VSL_LOCK();
659 }
660
661 /* we will choose instead to hold a send right */
662 vs_count = vstruct_list.vsl_count;
663 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
664 if(vs == (vstruct_t)&vstruct_list) {
665 VSL_UNLOCK();
666 return KERN_SUCCESS;
667 }
668 VS_LOCK(vs);
669 vs_async_wait(vs); /* wait for any pending async writes */
670 if ((vs_count != 0) && (vs != NULL))
671 vs->vs_async_pending += 1; /* hold parties calling */
672 /* vs_async_wait */
673 VS_UNLOCK(vs);
674 VSL_UNLOCK();
675 while((vs_count != 0) && (vs != NULL)) {
676 /* We take the count of AMO's before beginning the */
677 /* transfer of of the target segment. */
678 /* We are guaranteed that the target segment cannot get */
679 /* more users. We also know that queue entries are */
680 /* made at the back of the list. If some of the entries */
681 /* we would check disappear while we are traversing the */
682 /* list then we will either check new entries which */
683 /* do not have any backing store in the target segment */
684 /* or re-check old entries. This might not be optimal */
685 /* but it will always be correct. The alternative is to */
686 /* take a snapshot of the list. */
687 vstruct_t next_vs;
688
689 if(dp_pages_free < cluster_transfer_minimum)
690 error = KERN_FAILURE;
691 else {
692 vm_object_t transfer_object;
693 int count;
694 upl_t upl;
695
696 transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
697 count = 0;
698 error = vm_object_upl_request(transfer_object,
699 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
700 &upl, NULL, &count,
701 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
702 | UPL_SET_INTERNAL);
703 if(error == KERN_SUCCESS) {
704 #ifndef ubc_sync_working
705 upl_commit(upl, NULL);
706 upl_deallocate(upl);
707 error = ps_vstruct_transfer_from_segment(
708 vs, ps, transfer_object);
709 #else
710 error = ps_vstruct_transfer_from_segment(
711 vs, ps, upl);
712 upl_commit(upl, NULL);
713 upl_deallocate(upl);
714 #endif
715 vm_object_deallocate(transfer_object);
716 } else {
717 vm_object_deallocate(transfer_object);
718 error = KERN_FAILURE;
719 }
720 }
721 if(error) {
722 VS_LOCK(vs);
723 vs->vs_async_pending -= 1; /* release vs_async_wait */
724 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
725 vs->vs_waiting_async = FALSE;
726 VS_UNLOCK(vs);
727 thread_wakeup(&vs->vs_async_pending);
728 } else {
729 VS_UNLOCK(vs);
730 }
731 return KERN_FAILURE;
732 }
733
734 VSL_LOCK();
735
736 while(backing_store_release_trigger_disable != 0) {
737 assert_wait((event_t)
738 &backing_store_release_trigger_disable,
739 THREAD_UNINT);
740 VSL_UNLOCK();
741 thread_block((void (*)(void)) 0);
742 VSL_LOCK();
743 }
744
745 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
746 if((next_vs != (vstruct_t)&vstruct_list) &&
747 (vs != next_vs) && (vs_count != 1)) {
748 VS_LOCK(next_vs);
749 vs_async_wait(next_vs); /* wait for any */
750 /* pending async writes */
751 next_vs->vs_async_pending += 1; /* hold parties */
752 /* calling vs_async_wait */
753 VS_UNLOCK(next_vs);
754 }
755 VSL_UNLOCK();
756 VS_LOCK(vs);
757 vs->vs_async_pending -= 1;
758 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
759 vs->vs_waiting_async = FALSE;
760 VS_UNLOCK(vs);
761 thread_wakeup(&vs->vs_async_pending);
762 } else {
763 VS_UNLOCK(vs);
764 }
765 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
766 vs = NULL;
767 else
768 vs = next_vs;
769 vs_count--;
770 }
771 return KERN_SUCCESS;
772 }
773
774
775 kern_return_t
776 default_pager_backing_store_delete(
777 MACH_PORT_FACE backing_store)
778 {
779 backing_store_t bs;
780 int i;
781 paging_segment_t ps;
782 int error;
783 int interim_pages_removed = 0;
784 kern_return_t kr;
785
786 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
787 return KERN_INVALID_ARGUMENT;
788
789 #if 0
790 /* not implemented */
791 BS_UNLOCK(bs);
792 return KERN_FAILURE;
793 #endif
794
795 restart:
796 PSL_LOCK();
797 error = KERN_SUCCESS;
798 for (i = 0; i <= paging_segment_max; i++) {
799 ps = paging_segments[i];
800 if (ps != PAGING_SEGMENT_NULL &&
801 ps->ps_bs == bs &&
802 ! ps->ps_going_away) {
803 PS_LOCK(ps);
804 /* disable access to this segment */
805 ps->ps_going_away = TRUE;
806 PS_UNLOCK(ps);
807 /*
808 * The "ps" segment is "off-line" now,
809 * we can try and delete it...
810 */
811 if(dp_pages_free < (cluster_transfer_minimum
812 + ps->ps_pgcount)) {
813 error = KERN_FAILURE;
814 PSL_UNLOCK();
815 }
816 else {
817 /* remove all pages associated with the */
818 /* segment from the list of free pages */
819 /* when transfer is through, all target */
820 /* segment pages will appear to be free */
821
822 dp_pages_free -= ps->ps_pgcount;
823 interim_pages_removed += ps->ps_pgcount;
824 PSL_UNLOCK();
825 error = ps_delete(ps);
826 }
827 if (error != KERN_SUCCESS) {
828 /*
829 * We couldn't delete the segment,
830 * probably because there's not enough
831 * virtual memory left.
832 * Re-enable all the segments.
833 */
834 PSL_LOCK();
835 break;
836 }
837 goto restart;
838 }
839 }
840
841 if (error != KERN_SUCCESS) {
842 for (i = 0; i <= paging_segment_max; i++) {
843 ps = paging_segments[i];
844 if (ps != PAGING_SEGMENT_NULL &&
845 ps->ps_bs == bs &&
846 ps->ps_going_away) {
847 PS_LOCK(ps);
848 /* re-enable access to this segment */
849 ps->ps_going_away = FALSE;
850 PS_UNLOCK(ps);
851 }
852 }
853 dp_pages_free += interim_pages_removed;
854 PSL_UNLOCK();
855 BS_UNLOCK(bs);
856 return error;
857 }
858
859 for (i = 0; i <= paging_segment_max; i++) {
860 ps = paging_segments[i];
861 if (ps != PAGING_SEGMENT_NULL &&
862 ps->ps_bs == bs) {
863 if(ps->ps_going_away) {
864 paging_segments[i] = PAGING_SEGMENT_NULL;
865 paging_segment_count--;
866 PS_LOCK(ps);
867 kfree((vm_offset_t)ps->ps_bmap,
868 RMAPSIZE(ps->ps_ncls));
869 kfree((vm_offset_t)ps, sizeof *ps);
870 }
871 }
872 }
873
874 /* Scan the entire ps array separately to make certain we find the */
875 /* proper paging_segment_max */
876 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
877 if(paging_segments[i] != PAGING_SEGMENT_NULL)
878 paging_segment_max = i;
879 }
880
881 PSL_UNLOCK();
882
883 /*
884 * All the segments have been deleted.
885 * We can remove the backing store.
886 */
887
888 /*
889 * Disable lookups of this backing store.
890 */
891 if((void *)bs->bs_port->alias != NULL)
892 kfree((vm_offset_t) bs->bs_port->alias,
893 sizeof (struct vstruct_alias));
894 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
895 bs->bs_port = MACH_PORT_NULL;
896 BS_UNLOCK(bs);
897
898 /*
899 * Remove backing store from backing_store list.
900 */
901 BSL_LOCK();
902 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
903 bs_links);
904 BSL_UNLOCK();
905
906 /*
907 * Free the backing store structure.
908 */
909 kfree((vm_offset_t)bs, sizeof *bs);
910
911 return KERN_SUCCESS;
912 }
913
914 int ps_enter(paging_segment_t); /* forward */
915
916 int
917 ps_enter(
918 paging_segment_t ps)
919 {
920 int i;
921
922 PSL_LOCK();
923
924 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
925 if (paging_segments[i] == PAGING_SEGMENT_NULL)
926 break;
927 }
928
929 if (i < MAX_NUM_PAGING_SEGMENTS) {
930 paging_segments[i] = ps;
931 if (i > paging_segment_max)
932 paging_segment_max = i;
933 paging_segment_count++;
934 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
935 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
936 ps_select_array[ps->ps_bs->bs_priority] = 0;
937 i = 0;
938 } else {
939 PSL_UNLOCK();
940 return KERN_RESOURCE_SHORTAGE;
941 }
942
943 PSL_UNLOCK();
944 return i;
945 }
946
947 #ifdef DEVICE_PAGING
948 kern_return_t
949 default_pager_add_segment(
950 MACH_PORT_FACE backing_store,
951 MACH_PORT_FACE device,
952 recnum_t offset,
953 recnum_t count,
954 int record_size)
955 {
956 backing_store_t bs;
957 paging_segment_t ps;
958 int i;
959 int error;
960
961 if ((bs = backing_store_lookup(backing_store))
962 == BACKING_STORE_NULL)
963 return KERN_INVALID_ARGUMENT;
964
965 PSL_LOCK();
966 for (i = 0; i <= paging_segment_max; i++) {
967 ps = paging_segments[i];
968 if (ps == PAGING_SEGMENT_NULL)
969 continue;
970
971 /*
972 * Check for overlap on same device.
973 */
974 if (!(ps->ps_device != device
975 || offset >= ps->ps_offset + ps->ps_recnum
976 || offset + count <= ps->ps_offset)) {
977 PSL_UNLOCK();
978 BS_UNLOCK(bs);
979 return KERN_INVALID_ARGUMENT;
980 }
981 }
982 PSL_UNLOCK();
983
984 /*
985 * Set up the paging segment
986 */
987 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
988 if (ps == PAGING_SEGMENT_NULL) {
989 BS_UNLOCK(bs);
990 return KERN_RESOURCE_SHORTAGE;
991 }
992
993 ps->ps_segtype = PS_PARTITION;
994 ps->ps_device = device;
995 ps->ps_offset = offset;
996 ps->ps_record_shift = local_log2(vm_page_size / record_size);
997 ps->ps_recnum = count;
998 ps->ps_pgnum = count >> ps->ps_record_shift;
999
1000 ps->ps_pgcount = ps->ps_pgnum;
1001 ps->ps_clshift = local_log2(bs->bs_clsize);
1002 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1003 ps->ps_hint = 0;
1004
1005 PS_LOCK_INIT(ps);
1006 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1007 if (!ps->ps_bmap) {
1008 kfree((vm_offset_t)ps, sizeof *ps);
1009 BS_UNLOCK(bs);
1010 return KERN_RESOURCE_SHORTAGE;
1011 }
1012 for (i = 0; i < ps->ps_ncls; i++) {
1013 clrbit(ps->ps_bmap, i);
1014 }
1015
1016 ps->ps_going_away = FALSE;
1017 ps->ps_bs = bs;
1018
1019 if ((error = ps_enter(ps)) != 0) {
1020 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1021 kfree((vm_offset_t)ps, sizeof *ps);
1022 BS_UNLOCK(bs);
1023 return KERN_RESOURCE_SHORTAGE;
1024 }
1025
1026 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1027 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1028 BS_UNLOCK(bs);
1029
1030 PSL_LOCK();
1031 dp_pages_free += ps->ps_pgcount;
1032 PSL_UNLOCK();
1033
1034 bs_more_space(ps->ps_clcount);
1035
1036 DEBUG(DEBUG_BS_INTERNAL,
1037 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1038 device, offset, count, record_size,
1039 ps->ps_record_shift, ps->ps_pgnum));
1040
1041 return KERN_SUCCESS;
1042 }
1043
1044 boolean_t
1045 bs_add_device(
1046 char *dev_name,
1047 MACH_PORT_FACE master)
1048 {
1049 security_token_t null_security_token = {
1050 { 0, 0 }
1051 };
1052 MACH_PORT_FACE device;
1053 int info[DEV_GET_SIZE_COUNT];
1054 mach_msg_type_number_t info_count;
1055 MACH_PORT_FACE bs = MACH_PORT_NULL;
1056 unsigned int rec_size;
1057 recnum_t count;
1058 int clsize;
1059 MACH_PORT_FACE reply_port;
1060
1061 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1062 null_security_token, dev_name, &device))
1063 return FALSE;
1064
1065 info_count = DEV_GET_SIZE_COUNT;
1066 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1067 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1068 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1069 clsize = bs_get_global_clsize(0);
1070 if (!default_pager_backing_store_create(
1071 default_pager_object,
1072 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1073 (clsize * vm_page_size),
1074 &bs)) {
1075 if (!default_pager_add_segment(bs, device,
1076 0, count, rec_size)) {
1077 return TRUE;
1078 }
1079 ipc_port_release_receive(bs);
1080 }
1081 }
1082
1083 ipc_port_release_send(device);
1084 return FALSE;
1085 }
1086 #endif /* DEVICE_PAGING */
1087
1088 #if VS_ASYNC_REUSE
1089
1090 struct vs_async *
1091 vs_alloc_async(void)
1092 {
1093 struct vs_async *vsa;
1094 MACH_PORT_FACE reply_port;
1095 kern_return_t kr;
1096
1097 VS_ASYNC_LOCK();
1098 if (vs_async_free_list == NULL) {
1099 VS_ASYNC_UNLOCK();
1100 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1101 if (vsa != NULL) {
1102 /*
1103 * Try allocating a reply port named after the
1104 * address of the vs_async structure.
1105 */
1106 struct vstruct_alias *alias_struct;
1107
1108 reply_port = ipc_port_alloc_kernel();
1109 alias_struct = (struct vstruct_alias *)
1110 kalloc(sizeof (struct vstruct_alias));
1111 if(alias_struct != NULL) {
1112 alias_struct->vs = (struct vstruct *)vsa;
1113 alias_struct->name = ISVS;
1114 reply_port->alias = (int) alias_struct;
1115 vsa->reply_port = reply_port;
1116 vs_alloc_async_count++;
1117 }
1118 else {
1119 vs_alloc_async_failed++;
1120 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1121 (reply_port));
1122 kfree((vm_offset_t)vsa,
1123 sizeof (struct vs_async));
1124 vsa = NULL;
1125 }
1126 }
1127 } else {
1128 vsa = vs_async_free_list;
1129 vs_async_free_list = vs_async_free_list->vsa_next;
1130 VS_ASYNC_UNLOCK();
1131 }
1132
1133 return vsa;
1134 }
1135
1136 void
1137 vs_free_async(
1138 struct vs_async *vsa)
1139 {
1140 VS_ASYNC_LOCK();
1141 vsa->vsa_next = vs_async_free_list;
1142 vs_async_free_list = vsa;
1143 VS_ASYNC_UNLOCK();
1144 }
1145
1146 #else /* VS_ASYNC_REUSE */
1147
1148 struct vs_async *
1149 vs_alloc_async(void)
1150 {
1151 struct vs_async *vsa;
1152 MACH_PORT_FACE reply_port;
1153 kern_return_t kr;
1154
1155 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1156 if (vsa != NULL) {
1157 /*
1158 * Try allocating a reply port named after the
1159 * address of the vs_async structure.
1160 */
1161 reply_port = ipc_port_alloc_kernel();
1162 alias_struct = (vstruct_alias *)
1163 kalloc(sizeof (struct vstruct_alias));
1164 if(alias_struct != NULL) {
1165 alias_struct->vs = reply_port;
1166 alias_struct->name = ISVS;
1167 reply_port->alias = (int) vsa;
1168 vsa->reply_port = reply_port;
1169 vs_alloc_async_count++;
1170 }
1171 else {
1172 vs_alloc_async_failed++;
1173 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1174 (reply_port));
1175 kfree((vm_offset_t) vsa,
1176 sizeof (struct vs_async));
1177 vsa = NULL;
1178 }
1179 }
1180
1181 return vsa;
1182 }
1183
1184 void
1185 vs_free_async(
1186 struct vs_async *vsa)
1187 {
1188 MACH_PORT_FACE reply_port;
1189 kern_return_t kr;
1190
1191 reply_port = vsa->reply_port;
1192 kfree((vm_offset_t) reply_port->alias, sizeof (struct vstuct_alias));
1193 kfree((vm_offset_t) vsa, sizeof (struct vs_async));
1194 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1195 #if 0
1196 VS_ASYNC_LOCK();
1197 vs_alloc_async_count--;
1198 VS_ASYNC_UNLOCK();
1199 #endif
1200 }
1201
1202 #endif /* VS_ASYNC_REUSE */
1203
1204 zone_t vstruct_zone;
1205
1206 vstruct_t
1207 ps_vstruct_create(
1208 vm_size_t size)
1209 {
1210 vstruct_t vs;
1211 int i;
1212
1213 vs = (vstruct_t) zalloc(vstruct_zone);
1214 if (vs == VSTRUCT_NULL) {
1215 return VSTRUCT_NULL;
1216 }
1217
1218 VS_LOCK_INIT(vs);
1219
1220 /*
1221 * The following fields will be provided later.
1222 */
1223 vs->vs_mem_obj = NULL;
1224 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1225 vs->vs_references = 1;
1226 vs->vs_seqno = 0;
1227
1228 #ifdef MACH_KERNEL
1229 vs->vs_waiting_seqno = FALSE;
1230 vs->vs_waiting_read = FALSE;
1231 vs->vs_waiting_write = FALSE;
1232 vs->vs_waiting_async = FALSE;
1233 #else
1234 mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
1235 mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
1236 mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
1237 mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
1238 mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
1239 #endif
1240
1241 vs->vs_readers = 0;
1242 vs->vs_writers = 0;
1243
1244 vs->vs_errors = 0;
1245
1246 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1247 vs->vs_size = ((atop(round_page(size)) - 1) >> vs->vs_clshift) + 1;
1248 vs->vs_async_pending = 0;
1249
1250 /*
1251 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1252 * depending on the size of the memory object.
1253 */
1254 if (INDIRECT_CLMAP(vs->vs_size)) {
1255 vs->vs_imap = (struct vs_map **)
1256 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1257 vs->vs_indirect = TRUE;
1258 } else {
1259 vs->vs_dmap = (struct vs_map *)
1260 kalloc(CLMAP_SIZE(vs->vs_size));
1261 vs->vs_indirect = FALSE;
1262 }
1263 vs->vs_xfer_pending = FALSE;
1264 DEBUG(DEBUG_VS_INTERNAL,
1265 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1266
1267 /*
1268 * Check to see that we got the space.
1269 */
1270 if (!vs->vs_dmap) {
1271 kfree((vm_offset_t)vs, sizeof *vs);
1272 return VSTRUCT_NULL;
1273 }
1274
1275 /*
1276 * Zero the indirect pointers, or clear the direct pointers.
1277 */
1278 if (vs->vs_indirect)
1279 memset(vs->vs_imap, 0,
1280 INDIRECT_CLMAP_SIZE(vs->vs_size));
1281 else
1282 for (i = 0; i < vs->vs_size; i++)
1283 VSM_CLR(vs->vs_dmap[i]);
1284
1285 VS_MAP_LOCK_INIT(vs);
1286
1287 bs_commit(vs->vs_size);
1288
1289 return vs;
1290 }
1291
1292 paging_segment_t ps_select_segment(int, int *); /* forward */
1293
1294 paging_segment_t
1295 ps_select_segment(
1296 int shift,
1297 int *psindex)
1298 {
1299 paging_segment_t ps;
1300 int i;
1301 int j;
1302
1303 /*
1304 * Optimize case where there's only one segment.
1305 * paging_segment_max will index the one and only segment.
1306 */
1307
1308 PSL_LOCK();
1309 if (paging_segment_count == 1) {
1310 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1311 ipc_port_t trigger = IP_NULL;
1312
1313 ps = paging_segments[paging_segment_max];
1314 *psindex = paging_segment_max;
1315 PS_LOCK(ps);
1316 if (ps->ps_going_away) {
1317 /* this segment is being turned off */
1318 lps = PAGING_SEGMENT_NULL;
1319 } else {
1320 ASSERT(ps->ps_clshift >= shift);
1321 if (ps->ps_clcount) {
1322 ps->ps_clcount--;
1323 dp_pages_free -= 1 << ps->ps_clshift;
1324 if(min_pages_trigger_port &&
1325 (dp_pages_free < minimum_pages_remaining)) {
1326 trigger = min_pages_trigger_port;
1327 min_pages_trigger_port = NULL;
1328 bs_low = TRUE;
1329 }
1330 lps = ps;
1331 } else
1332 lps = PAGING_SEGMENT_NULL;
1333 }
1334 PS_UNLOCK(ps);
1335 PSL_UNLOCK();
1336
1337 if (trigger != IP_NULL) {
1338 default_pager_space_alert(trigger, HI_WAT_ALERT);
1339 ipc_port_release_send(trigger);
1340 }
1341 return lps;
1342 }
1343
1344 if (paging_segment_count == 0) {
1345 PSL_UNLOCK();
1346 return PAGING_SEGMENT_NULL;
1347 }
1348
1349 for (i = BS_MAXPRI;
1350 i >= BS_MINPRI; i--) {
1351 int start_index;
1352
1353 if ((ps_select_array[i] == BS_NOPRI) ||
1354 (ps_select_array[i] == BS_FULLPRI))
1355 continue;
1356 start_index = ps_select_array[i];
1357
1358 if(!(paging_segments[start_index])) {
1359 j = start_index+1;
1360 physical_transfer_cluster_count = 0;
1361 }
1362 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1363 (((paging_segments[start_index])->ps_clshift)
1364 + vm_page_shift))) {
1365 physical_transfer_cluster_count = 0;
1366 j = start_index + 1;
1367 } else {
1368 physical_transfer_cluster_count+=1;
1369 j = start_index;
1370 if(start_index == 0)
1371 start_index = paging_segment_max;
1372 else
1373 start_index = start_index - 1;
1374 }
1375
1376 while (1) {
1377 if (j > paging_segment_max)
1378 j = 0;
1379 if ((ps = paging_segments[j]) &&
1380 (ps->ps_bs->bs_priority == i)) {
1381 /*
1382 * Force the ps cluster size to be
1383 * >= that of the vstruct.
1384 */
1385 PS_LOCK(ps);
1386 if (ps->ps_going_away) {
1387 /* this segment is being turned off */
1388 } else if ((ps->ps_clcount) &&
1389 (ps->ps_clshift >= shift)) {
1390 ipc_port_t trigger = IP_NULL;
1391
1392 ps->ps_clcount--;
1393 dp_pages_free -= 1 << ps->ps_clshift;
1394 if(min_pages_trigger_port &&
1395 (dp_pages_free <
1396 minimum_pages_remaining)) {
1397 trigger = min_pages_trigger_port;
1398 min_pages_trigger_port = NULL;
1399 }
1400 PS_UNLOCK(ps);
1401 /*
1402 * found one, quit looking.
1403 */
1404 ps_select_array[i] = j;
1405 PSL_UNLOCK();
1406
1407 if (trigger != IP_NULL) {
1408 default_pager_space_alert(
1409 trigger,
1410 HI_WAT_ALERT);
1411 ipc_port_release_send(trigger);
1412 }
1413 *psindex = j;
1414 return ps;
1415 }
1416 PS_UNLOCK(ps);
1417 }
1418 if (j == start_index) {
1419 /*
1420 * none at this priority -- mark it full
1421 */
1422 ps_select_array[i] = BS_FULLPRI;
1423 break;
1424 }
1425 j++;
1426 }
1427 }
1428 PSL_UNLOCK();
1429 return PAGING_SEGMENT_NULL;
1430 }
1431
1432 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1433
1434 vm_offset_t
1435 ps_allocate_cluster(
1436 vstruct_t vs,
1437 int *psindex,
1438 paging_segment_t use_ps)
1439 {
1440 int byte_num;
1441 int bit_num = 0;
1442 paging_segment_t ps;
1443 vm_offset_t cluster;
1444 ipc_port_t trigger = IP_NULL;
1445
1446 /*
1447 * Find best paging segment.
1448 * ps_select_segment will decrement cluster count on ps.
1449 * Must pass cluster shift to find the most appropriate segment.
1450 */
1451 /* NOTE: The addition of paging segment delete capability threatened
1452 * to seriously complicate the treatment of paging segments in this
1453 * module and the ones that call it (notably ps_clmap), because of the
1454 * difficulty in assuring that the paging segment would continue to
1455 * exist between being unlocked and locked. This was
1456 * avoided because all calls to this module are based in either
1457 * dp_memory_object calls which rely on the vs lock, or by
1458 * the transfer function which is part of the segment delete path.
1459 * The transfer function which is part of paging segment delete is
1460 * protected from multiple callers by the backing store lock.
1461 * The paging segment delete function treats mappings to a paging
1462 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1463 * while data is transferred to the remaining segments. This is in
1464 * line with the view that incomplete or in-transition mappings between
1465 * data, a vstruct, and backing store are protected by the vs lock.
1466 * This and the ordering of the paging segment "going_away" bit setting
1467 * protects us.
1468 */
1469 if (use_ps != PAGING_SEGMENT_NULL) {
1470 ps = use_ps;
1471 PSL_LOCK();
1472 PS_LOCK(ps);
1473 ps->ps_clcount--;
1474 dp_pages_free -= 1 << ps->ps_clshift;
1475 if(min_pages_trigger_port &&
1476 (dp_pages_free < minimum_pages_remaining)) {
1477 trigger = min_pages_trigger_port;
1478 min_pages_trigger_port = NULL;
1479 }
1480 PSL_UNLOCK();
1481 PS_UNLOCK(ps);
1482 if (trigger != IP_NULL) {
1483 default_pager_space_alert(trigger, HI_WAT_ALERT);
1484 ipc_port_release_send(trigger);
1485 }
1486
1487 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1488 PAGING_SEGMENT_NULL) {
1489 #if 0
1490 bs_no_paging_space(TRUE);
1491 #endif
1492 #if 0
1493 if (verbose)
1494 #endif
1495 dprintf(("no space in available paging segments; "
1496 "swapon suggested\n"));
1497 /* the count got off maybe, reset to zero */
1498 PSL_LOCK();
1499 dp_pages_free = 0;
1500 if(min_pages_trigger_port) {
1501 trigger = min_pages_trigger_port;
1502 min_pages_trigger_port = NULL;
1503 bs_low = TRUE;
1504 }
1505 PSL_UNLOCK();
1506 if (trigger != IP_NULL) {
1507 default_pager_space_alert(trigger, HI_WAT_ALERT);
1508 ipc_port_release_send(trigger);
1509 }
1510 return (vm_offset_t) -1;
1511 }
1512 ASSERT(ps->ps_clcount != 0);
1513
1514 /*
1515 * Look for an available cluster. At the end of the loop,
1516 * byte_num is the byte offset and bit_num is the bit offset of the
1517 * first zero bit in the paging segment bitmap.
1518 */
1519 PS_LOCK(ps);
1520 byte_num = ps->ps_hint;
1521 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1522 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1523 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1524 if (isclr((ps->ps_bmap + byte_num), bit_num))
1525 break;
1526 }
1527 ASSERT(bit_num != NBBY);
1528 break;
1529 }
1530 }
1531 ps->ps_hint = byte_num;
1532 cluster = (byte_num*NBBY) + bit_num;
1533
1534 /* Space was reserved, so this must be true */
1535 ASSERT(cluster < ps->ps_ncls);
1536
1537 setbit(ps->ps_bmap, cluster);
1538 PS_UNLOCK(ps);
1539
1540 return cluster;
1541 }
1542
1543 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1544
1545 void
1546 ps_deallocate_cluster(
1547 paging_segment_t ps,
1548 vm_offset_t cluster)
1549 {
1550 ipc_port_t trigger = IP_NULL;
1551
1552 if (cluster >= (vm_offset_t) ps->ps_ncls)
1553 panic("ps_deallocate_cluster: Invalid cluster number");
1554
1555 /*
1556 * Lock the paging segment, clear the cluster's bitmap and increment the
1557 * number of free cluster.
1558 */
1559 PSL_LOCK();
1560 PS_LOCK(ps);
1561 clrbit(ps->ps_bmap, cluster);
1562 ++ps->ps_clcount;
1563 dp_pages_free += 1 << ps->ps_clshift;
1564 if(max_pages_trigger_port
1565 && (backing_store_release_trigger_disable == 0)
1566 && (dp_pages_free > maximum_pages_free)) {
1567 trigger = max_pages_trigger_port;
1568 max_pages_trigger_port = NULL;
1569 }
1570 PSL_UNLOCK();
1571
1572 /*
1573 * Move the hint down to the freed cluster if it is
1574 * less than the current hint.
1575 */
1576 if ((cluster/NBBY) < ps->ps_hint) {
1577 ps->ps_hint = (cluster/NBBY);
1578 }
1579
1580 PS_UNLOCK(ps);
1581
1582 /*
1583 * If we're freeing space on a full priority, reset the array.
1584 */
1585 PSL_LOCK();
1586 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1587 ps_select_array[ps->ps_bs->bs_priority] = 0;
1588 PSL_UNLOCK();
1589
1590 if (trigger != IP_NULL) {
1591 VSL_LOCK();
1592 if(backing_store_release_trigger_disable != 0) {
1593 assert_wait((event_t)
1594 &backing_store_release_trigger_disable,
1595 THREAD_UNINT);
1596 VSL_UNLOCK();
1597 thread_block((void (*)(void)) 0);
1598 } else {
1599 VSL_UNLOCK();
1600 }
1601 default_pager_space_alert(trigger, LO_WAT_ALERT);
1602 ipc_port_release_send(trigger);
1603 }
1604
1605 return;
1606 }
1607
1608 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1609
1610 void
1611 ps_dealloc_vsmap(
1612 struct vs_map *vsmap,
1613 vm_size_t size)
1614 {
1615 int i;
1616 for (i = 0; i < size; i++)
1617 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1618 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1619 VSM_CLOFF(vsmap[i]));
1620 }
1621
1622 void
1623 ps_vstruct_dealloc(
1624 vstruct_t vs)
1625 {
1626 int i;
1627 spl_t s;
1628
1629 VS_MAP_LOCK(vs);
1630
1631 /*
1632 * If this is an indirect structure, then we walk through the valid
1633 * (non-zero) indirect pointers and deallocate the clusters
1634 * associated with each used map entry (via ps_dealloc_vsmap).
1635 * When all of the clusters in an indirect block have been
1636 * freed, we deallocate the block. When all of the indirect
1637 * blocks have been deallocated we deallocate the memory
1638 * holding the indirect pointers.
1639 */
1640 if (vs->vs_indirect) {
1641 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1642 if (vs->vs_imap[i] != NULL) {
1643 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1644 kfree((vm_offset_t)vs->vs_imap[i],
1645 CLMAP_THRESHOLD);
1646 }
1647 }
1648 kfree((vm_offset_t)vs->vs_imap,
1649 INDIRECT_CLMAP_SIZE(vs->vs_size));
1650 } else {
1651 /*
1652 * Direct map. Free used clusters, then memory.
1653 */
1654 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1655 kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1656 }
1657 VS_MAP_UNLOCK(vs);
1658
1659 bs_commit(- vs->vs_size);
1660
1661 zfree(vstruct_zone, (vm_offset_t)vs);
1662 }
1663
1664 int ps_map_extend(vstruct_t, int); /* forward */
1665
1666 int ps_map_extend(
1667 vstruct_t vs,
1668 int new_size)
1669 {
1670 struct vs_map **new_imap;
1671 struct vs_map *new_dmap = NULL;
1672 int newdsize;
1673 int i;
1674 void *old_map = NULL;
1675 int old_map_size = 0;
1676
1677 if (vs->vs_size >= new_size) {
1678 /*
1679 * Someone has already done the work.
1680 */
1681 return 0;
1682 }
1683
1684 /*
1685 * If the new size extends into the indirect range, then we have one
1686 * of two cases: we are going from indirect to indirect, or we are
1687 * going from direct to indirect. If we are going from indirect to
1688 * indirect, then it is possible that the new size will fit in the old
1689 * indirect map. If this is the case, then just reset the size of the
1690 * vstruct map and we are done. If the new size will not
1691 * fit into the old indirect map, then we have to allocate a new
1692 * indirect map and copy the old map pointers into this new map.
1693 *
1694 * If we are going from direct to indirect, then we have to allocate a
1695 * new indirect map and copy the old direct pages into the first
1696 * indirect page of the new map.
1697 * NOTE: allocating memory here is dangerous, as we're in the
1698 * pageout path.
1699 */
1700 if (INDIRECT_CLMAP(new_size)) {
1701 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1702
1703 /*
1704 * Get a new indirect map and zero it.
1705 */
1706 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1707 if (vs->vs_indirect &&
1708 (new_map_size == old_map_size)) {
1709 bs_commit(new_size - vs->vs_size);
1710 vs->vs_size = new_size;
1711 return 0;
1712 }
1713
1714 new_imap = (struct vs_map **)kalloc(new_map_size);
1715 if (new_imap == NULL) {
1716 return -1;
1717 }
1718 memset(new_imap, 0, new_map_size);
1719
1720 if (vs->vs_indirect) {
1721 /* Copy old entries into new map */
1722 memcpy(new_imap, vs->vs_imap, old_map_size);
1723 /* Arrange to free the old map */
1724 old_map = (void *) vs->vs_imap;
1725 newdsize = 0;
1726 } else { /* Old map was a direct map */
1727 /* Allocate an indirect page */
1728 if ((new_imap[0] = (struct vs_map *)
1729 kalloc(CLMAP_THRESHOLD)) == NULL) {
1730 kfree((vm_offset_t)new_imap, new_map_size);
1731 return -1;
1732 }
1733 new_dmap = new_imap[0];
1734 newdsize = CLMAP_ENTRIES;
1735 }
1736 } else {
1737 new_imap = NULL;
1738 newdsize = new_size;
1739 /*
1740 * If the new map is a direct map, then the old map must
1741 * also have been a direct map. All we have to do is
1742 * to allocate a new direct map, copy the old entries
1743 * into it and free the old map.
1744 */
1745 if ((new_dmap = (struct vs_map *)
1746 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1747 return -1;
1748 }
1749 }
1750 if (newdsize) {
1751
1752 /* Free the old map */
1753 old_map = (void *) vs->vs_dmap;
1754 old_map_size = CLMAP_SIZE(vs->vs_size);
1755
1756 /* Copy info from the old map into the new map */
1757 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1758
1759 /* Initialize the rest of the new map */
1760 for (i = vs->vs_size; i < newdsize; i++)
1761 VSM_CLR(new_dmap[i]);
1762 }
1763 if (new_imap) {
1764 vs->vs_imap = new_imap;
1765 vs->vs_indirect = TRUE;
1766 } else
1767 vs->vs_dmap = new_dmap;
1768 bs_commit(new_size - vs->vs_size);
1769 vs->vs_size = new_size;
1770 if (old_map)
1771 kfree((vm_offset_t)old_map, old_map_size);
1772 return 0;
1773 }
1774
1775 vm_offset_t
1776 ps_clmap(
1777 vstruct_t vs,
1778 vm_offset_t offset,
1779 struct clmap *clmap,
1780 int flag,
1781 vm_size_t size,
1782 int error)
1783 {
1784 vm_offset_t cluster; /* The cluster of offset. */
1785 vm_offset_t newcl; /* The new cluster allocated. */
1786 vm_offset_t newoff;
1787 int i;
1788 struct vs_map *vsmap;
1789
1790 VS_MAP_LOCK(vs);
1791
1792 ASSERT(vs->vs_dmap);
1793 cluster = atop(offset) >> vs->vs_clshift;
1794
1795 /*
1796 * Initialize cluster error value
1797 */
1798 clmap->cl_error = 0;
1799
1800 /*
1801 * If the object has grown, extend the page map.
1802 */
1803 if (cluster >= vs->vs_size) {
1804 if (flag == CL_FIND) {
1805 /* Do not allocate if just doing a lookup */
1806 VS_MAP_UNLOCK(vs);
1807 return (vm_offset_t) -1;
1808 }
1809 if (ps_map_extend(vs, cluster + 1)) {
1810 VS_MAP_UNLOCK(vs);
1811 return (vm_offset_t) -1;
1812 }
1813 }
1814
1815 /*
1816 * Look for the desired cluster. If the map is indirect, then we
1817 * have a two level lookup. First find the indirect block, then
1818 * find the actual cluster. If the indirect block has not yet
1819 * been allocated, then do so. If the cluster has not yet been
1820 * allocated, then do so.
1821 *
1822 * If any of the allocations fail, then return an error.
1823 * Don't allocate if just doing a lookup.
1824 */
1825 if (vs->vs_indirect) {
1826 long ind_block = cluster/CLMAP_ENTRIES;
1827
1828 /* Is the indirect block allocated? */
1829 vsmap = vs->vs_imap[ind_block];
1830 if (vsmap == NULL) {
1831 if (flag == CL_FIND) {
1832 VS_MAP_UNLOCK(vs);
1833 return (vm_offset_t) -1;
1834 }
1835
1836 /* Allocate the indirect block */
1837 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1838 if (vsmap == NULL) {
1839 VS_MAP_UNLOCK(vs);
1840 return (vm_offset_t) -1;
1841 }
1842 /* Initialize the cluster offsets */
1843 for (i = 0; i < CLMAP_ENTRIES; i++)
1844 VSM_CLR(vsmap[i]);
1845 vs->vs_imap[ind_block] = vsmap;
1846 }
1847 } else
1848 vsmap = vs->vs_dmap;
1849
1850 ASSERT(vsmap);
1851 vsmap += cluster%CLMAP_ENTRIES;
1852
1853 /*
1854 * At this point, vsmap points to the struct vs_map desired.
1855 *
1856 * Look in the map for the cluster, if there was an error on a
1857 * previous write, flag it and return. If it is not yet
1858 * allocated, then allocate it, if we're writing; if we're
1859 * doing a lookup and the cluster's not allocated, return error.
1860 */
1861 if (VSM_ISERR(*vsmap)) {
1862 clmap->cl_error = VSM_GETERR(*vsmap);
1863 VS_MAP_UNLOCK(vs);
1864 return (vm_offset_t) -1;
1865 } else if (VSM_ISCLR(*vsmap)) {
1866 int psindex;
1867
1868 if (flag == CL_FIND) {
1869 /*
1870 * If there's an error and the entry is clear, then
1871 * we've run out of swap space. Record the error
1872 * here and return.
1873 */
1874 if (error) {
1875 VSM_SETERR(*vsmap, error);
1876 }
1877 VS_MAP_UNLOCK(vs);
1878 return (vm_offset_t) -1;
1879 } else {
1880 /*
1881 * Attempt to allocate a cluster from the paging segment
1882 */
1883 newcl = ps_allocate_cluster(vs, &psindex,
1884 PAGING_SEGMENT_NULL);
1885 if (newcl == -1) {
1886 VS_MAP_UNLOCK(vs);
1887 return (vm_offset_t) -1;
1888 }
1889 VSM_CLR(*vsmap);
1890 VSM_SETCLOFF(*vsmap, newcl);
1891 VSM_SETPS(*vsmap, psindex);
1892 }
1893 } else
1894 newcl = VSM_CLOFF(*vsmap);
1895
1896 /*
1897 * Fill in pertinent fields of the clmap
1898 */
1899 clmap->cl_ps = VSM_PS(*vsmap);
1900 clmap->cl_numpages = VSCLSIZE(vs);
1901 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1902
1903 /*
1904 * Byte offset in paging segment is byte offset to cluster plus
1905 * byte offset within cluster. It looks ugly, but should be
1906 * relatively quick.
1907 */
1908 ASSERT(trunc_page(offset) == offset);
1909 newcl = ptoa(newcl) << vs->vs_clshift;
1910 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1911 if (flag == CL_ALLOC) {
1912 /*
1913 * set bits in the allocation bitmap according to which
1914 * pages were requested. size is in bytes.
1915 */
1916 i = atop(newoff);
1917 while ((size > 0) && (i < VSCLSIZE(vs))) {
1918 VSM_SETALLOC(*vsmap, i);
1919 i++;
1920 size -= vm_page_size;
1921 }
1922 }
1923 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1924 if (newoff) {
1925 /*
1926 * Offset is not cluster aligned, so number of pages
1927 * and bitmaps must be adjusted
1928 */
1929 clmap->cl_numpages -= atop(newoff);
1930 CLMAP_SHIFT(clmap, vs);
1931 CLMAP_SHIFTALLOC(clmap, vs);
1932 }
1933
1934 /*
1935 *
1936 * The setting of valid bits and handling of write errors
1937 * must be done here, while we hold the lock on the map.
1938 * It logically should be done in ps_vs_write_complete().
1939 * The size and error information has been passed from
1940 * ps_vs_write_complete(). If the size parameter is non-zero,
1941 * then there is work to be done. If error is also non-zero,
1942 * then the error number is recorded in the cluster and the
1943 * entire cluster is in error.
1944 */
1945 if (size && flag == CL_FIND) {
1946 vm_offset_t off = (vm_offset_t) 0;
1947
1948 if (!error) {
1949 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1950 i++) {
1951 VSM_SETPG(*vsmap, i);
1952 size -= vm_page_size;
1953 }
1954 ASSERT(i <= VSCLSIZE(vs));
1955 } else {
1956 BS_STAT(clmap->cl_ps->ps_bs,
1957 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1958 atop(size));
1959 off = VSM_CLOFF(*vsmap);
1960 VSM_SETERR(*vsmap, error);
1961 }
1962 /*
1963 * Deallocate cluster if error, and no valid pages
1964 * already present.
1965 */
1966 if (off != (vm_offset_t) 0)
1967 ps_deallocate_cluster(clmap->cl_ps, off);
1968 VS_MAP_UNLOCK(vs);
1969 return (vm_offset_t) 0;
1970 } else
1971 VS_MAP_UNLOCK(vs);
1972
1973 DEBUG(DEBUG_VS_INTERNAL,
1974 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1975 newcl+newoff, (int) vs, (int) vsmap, flag));
1976 DEBUG(DEBUG_VS_INTERNAL,
1977 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1978 (int) clmap->cl_ps, clmap->cl_numpages,
1979 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1980
1981 return (newcl + newoff);
1982 }
1983
1984 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1985
1986 void
1987 ps_clunmap(
1988 vstruct_t vs,
1989 vm_offset_t offset,
1990 vm_size_t length)
1991 {
1992 vm_offset_t cluster; /* The cluster number of offset */
1993 struct vs_map *vsmap;
1994
1995 VS_MAP_LOCK(vs);
1996
1997 /*
1998 * Loop through all clusters in this range, freeing paging segment
1999 * clusters and map entries as encountered.
2000 */
2001 while (length > 0) {
2002 vm_offset_t newoff;
2003 int i;
2004
2005 cluster = atop(offset) >> vs->vs_clshift;
2006 if (vs->vs_indirect) /* indirect map */
2007 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2008 else
2009 vsmap = vs->vs_dmap;
2010 if (vsmap == NULL) {
2011 VS_MAP_UNLOCK(vs);
2012 return;
2013 }
2014 vsmap += cluster%CLMAP_ENTRIES;
2015 if (VSM_ISCLR(*vsmap)) {
2016 length -= vm_page_size;
2017 offset += vm_page_size;
2018 continue;
2019 }
2020 /*
2021 * We've got a valid mapping. Clear it and deallocate
2022 * paging segment cluster pages.
2023 * Optimize for entire cluster cleraing.
2024 */
2025 if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
2026 /*
2027 * Not cluster aligned.
2028 */
2029 ASSERT(trunc_page(newoff) == newoff);
2030 i = atop(newoff);
2031 } else
2032 i = 0;
2033 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2034 VSM_CLRPG(*vsmap, i);
2035 VSM_CLRALLOC(*vsmap, i);
2036 length -= vm_page_size;
2037 offset += vm_page_size;
2038 i++;
2039 }
2040
2041 /*
2042 * If map entry is empty, clear and deallocate cluster.
2043 */
2044 if (!VSM_ALLOC(*vsmap)) {
2045 ps_deallocate_cluster(VSM_PS(*vsmap),
2046 VSM_CLOFF(*vsmap));
2047 VSM_CLR(*vsmap);
2048 }
2049 }
2050
2051 VS_MAP_UNLOCK(vs);
2052 }
2053
2054 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2055
2056 void
2057 ps_vs_write_complete(
2058 vstruct_t vs,
2059 vm_offset_t offset,
2060 vm_size_t size,
2061 int error)
2062 {
2063 struct clmap clmap;
2064
2065 /*
2066 * Get the struct vsmap for this cluster.
2067 * Use READ, even though it was written, because the
2068 * cluster MUST be present, unless there was an error
2069 * in the original ps_clmap (e.g. no space), in which
2070 * case, nothing happens.
2071 *
2072 * Must pass enough information to ps_clmap to allow it
2073 * to set the vs_map structure bitmap under lock.
2074 */
2075 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2076 }
2077
2078 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2079
2080 void
2081 vs_cl_write_complete(
2082 vstruct_t vs,
2083 paging_segment_t ps,
2084 vm_offset_t offset,
2085 vm_offset_t addr,
2086 vm_size_t size,
2087 boolean_t async,
2088 int error)
2089 {
2090 kern_return_t kr;
2091
2092 if (error) {
2093 /*
2094 * For internal objects, the error is recorded on a
2095 * per-cluster basis by ps_clmap() which is called
2096 * by ps_vs_write_complete() below.
2097 */
2098 dprintf(("write failed error = 0x%x\n", error));
2099 /* add upl_abort code here */
2100 } else
2101 GSTAT(global_stats.gs_pages_out += atop(size));
2102 /*
2103 * Notify the vstruct mapping code, so it can do its accounting.
2104 */
2105 ps_vs_write_complete(vs, offset, size, error);
2106
2107 if (async) {
2108 VS_LOCK(vs);
2109 ASSERT(vs->vs_async_pending > 0);
2110 vs->vs_async_pending -= size;
2111 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2112 vs->vs_waiting_async = FALSE;
2113 VS_UNLOCK(vs);
2114 /* mutex_unlock(&vs->vs_waiting_async); */
2115 thread_wakeup(&vs->vs_async_pending);
2116 } else {
2117 VS_UNLOCK(vs);
2118 }
2119 }
2120 }
2121
2122 #ifdef DEVICE_PAGING
2123 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2124
2125 kern_return_t
2126 device_write_reply(
2127 MACH_PORT_FACE reply_port,
2128 kern_return_t device_code,
2129 io_buf_len_t bytes_written)
2130 {
2131 struct vs_async *vsa;
2132
2133 vsa = (struct vs_async *)
2134 ((struct vstruct_alias *)(reply_port->alias))->vs;
2135
2136 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2137 device_code = KERN_FAILURE;
2138 }
2139
2140 vsa->vsa_error = device_code;
2141
2142
2143 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2144 if(vsa->vsa_flags & VSA_TRANSFER) {
2145 /* revisit when async disk segments redone */
2146 if(vsa->vsa_error) {
2147 /* need to consider error condition. re-write data or */
2148 /* throw it away here. */
2149 vm_offset_t ioaddr;
2150 if(vm_map_copyout(kernel_map, &ioaddr,
2151 (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
2152 panic("vs_cluster_write: unable to copy source list\n");
2153 vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
2154 }
2155 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2156 vsa->vsa_size, vsa->vsa_error);
2157 } else {
2158 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2159 vsa->vsa_addr, vsa->vsa_size, TRUE,
2160 vsa->vsa_error);
2161 }
2162 VS_FREE_ASYNC(vsa);
2163
2164 return KERN_SUCCESS;
2165 }
2166
2167 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2168 kern_return_t
2169 device_write_reply_inband(
2170 MACH_PORT_FACE reply_port,
2171 kern_return_t return_code,
2172 io_buf_len_t bytes_written)
2173 {
2174 panic("device_write_reply_inband: illegal");
2175 return KERN_SUCCESS;
2176 }
2177
2178 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2179 kern_return_t
2180 device_read_reply(
2181 MACH_PORT_FACE reply_port,
2182 kern_return_t return_code,
2183 io_buf_ptr_t data,
2184 mach_msg_type_number_t dataCnt)
2185 {
2186 struct vs_async *vsa;
2187 vsa = (struct vs_async *)
2188 ((struct vstruct_alias *)(reply_port->alias))->vs;
2189 vsa->vsa_addr = (vm_offset_t)data;
2190 vsa->vsa_size = (vm_size_t)dataCnt;
2191 vsa->vsa_error = return_code;
2192 thread_wakeup(&vsa->vsa_lock);
2193 return KERN_SUCCESS;
2194 }
2195
2196 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2197 kern_return_t
2198 device_read_reply_inband(
2199 MACH_PORT_FACE reply_port,
2200 kern_return_t return_code,
2201 io_buf_ptr_inband_t data,
2202 mach_msg_type_number_t dataCnt)
2203 {
2204 panic("device_read_reply_inband: illegal");
2205 return KERN_SUCCESS;
2206 }
2207
2208 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2209 kern_return_t
2210 device_read_reply_overwrite(
2211 MACH_PORT_FACE reply_port,
2212 kern_return_t return_code,
2213 io_buf_len_t bytes_read)
2214 {
2215 panic("device_read_reply_overwrite: illegal\n");
2216 return KERN_SUCCESS;
2217 }
2218
2219 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2220 kern_return_t
2221 device_open_reply(
2222 MACH_PORT_FACE reply_port,
2223 kern_return_t return_code,
2224 MACH_PORT_FACE device_port)
2225 {
2226 panic("device_open_reply: illegal\n");
2227 return KERN_SUCCESS;
2228 }
2229
2230 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2231
2232 kern_return_t
2233 ps_read_device(
2234 paging_segment_t ps,
2235 vm_offset_t offset,
2236 vm_offset_t *bufferp,
2237 unsigned int size,
2238 unsigned int *residualp,
2239 int flags)
2240 {
2241 kern_return_t kr;
2242 recnum_t dev_offset;
2243 unsigned int bytes_wanted;
2244 unsigned int bytes_read;
2245 unsigned int total_read;
2246 vm_offset_t dev_buffer;
2247 vm_offset_t buf_ptr;
2248 unsigned int records_read;
2249 struct vs_async *vsa;
2250 mutex_t vs_waiting_read_reply;
2251
2252 device_t device;
2253 vm_map_copy_t device_data = NULL;
2254 default_pager_thread_t *dpt = NULL;
2255
2256 device = dev_port_lookup(ps->ps_device);
2257 clustered_reads[atop(size)]++;
2258
2259 dev_offset = (ps->ps_offset +
2260 (offset >> (vm_page_shift - ps->ps_record_shift)));
2261 bytes_wanted = size;
2262 total_read = 0;
2263 *bufferp = (vm_offset_t)NULL;
2264
2265 do {
2266 vsa = VS_ALLOC_ASYNC();
2267 if (vsa) {
2268 vsa->vsa_vs = NULL;
2269 vsa->vsa_addr = 0;
2270 vsa->vsa_offset = 0;
2271 vsa->vsa_size = 0;
2272 vsa->vsa_ps = NULL;
2273 }
2274 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2275 ip_lock(vsa->reply_port);
2276 vsa->reply_port->ip_sorights++;
2277 ip_reference(vsa->reply_port);
2278 ip_unlock(vsa->reply_port);
2279 kr = ds_device_read_common(device,
2280 vsa->reply_port,
2281 (mach_msg_type_name_t)
2282 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2283 (dev_mode_t) 0,
2284 dev_offset,
2285 bytes_wanted,
2286 (IO_READ | IO_CALL),
2287 (io_buf_ptr_t *) &dev_buffer,
2288 (mach_msg_type_number_t *) &bytes_read);
2289 if(kr == MIG_NO_REPLY) {
2290 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2291 thread_block((void(*)(void))0);
2292
2293 dev_buffer = vsa->vsa_addr;
2294 bytes_read = (unsigned int)vsa->vsa_size;
2295 kr = vsa->vsa_error;
2296 }
2297 VS_FREE_ASYNC(vsa);
2298 if (kr != KERN_SUCCESS || bytes_read == 0) {
2299 break;
2300 }
2301 total_read += bytes_read;
2302
2303 /*
2304 * If we got the entire range, use the returned dev_buffer.
2305 */
2306 if (bytes_read == size) {
2307 *bufferp = (vm_offset_t)dev_buffer;
2308 break;
2309 }
2310
2311 #if 1
2312 dprintf(("read only %d bytes out of %d\n",
2313 bytes_read, bytes_wanted));
2314 #endif
2315 if(dpt == NULL) {
2316 dpt = get_read_buffer();
2317 buf_ptr = dpt->dpt_buffer;
2318 *bufferp = (vm_offset_t)buf_ptr;
2319 }
2320 /*
2321 * Otherwise, copy the data into the provided buffer (*bufferp)
2322 * and append the rest of the range as it comes in.
2323 */
2324 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2325 buf_ptr += bytes_read;
2326 bytes_wanted -= bytes_read;
2327 records_read = (bytes_read >>
2328 (vm_page_shift - ps->ps_record_shift));
2329 dev_offset += records_read;
2330 DEBUG(DEBUG_VS_INTERNAL,
2331 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2332 dev_buffer, bytes_read));
2333 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2334 != KERN_SUCCESS)
2335 Panic("dealloc buf");
2336 } while (bytes_wanted);
2337
2338 *residualp = size - total_read;
2339 if((dev_buffer != *bufferp) && (total_read != 0)) {
2340 vm_offset_t temp_buffer;
2341 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2342 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2343 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2344 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2345 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2346 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2347 (vm_map_copy_t *)&device_data, FALSE))
2348 panic("ps_read_device: cannot copyin locally provided buffer\n");
2349 }
2350 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2351 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2352 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2353 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2354 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2355 (vm_map_copy_t *)&device_data, FALSE))
2356 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2357 }
2358 else {
2359 device_data = NULL;
2360 }
2361 *bufferp = (vm_offset_t)device_data;
2362
2363 if(dpt != NULL) {
2364 /* Free the receive buffer */
2365 dpt->checked_out = 0;
2366 thread_wakeup(&dpt_array);
2367 }
2368 return KERN_SUCCESS;
2369 }
2370
2371 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2372
2373 kern_return_t
2374 ps_write_device(
2375 paging_segment_t ps,
2376 vm_offset_t offset,
2377 vm_offset_t addr,
2378 unsigned int size,
2379 struct vs_async *vsa)
2380 {
2381 recnum_t dev_offset;
2382 io_buf_len_t bytes_to_write, bytes_written;
2383 recnum_t records_written;
2384 kern_return_t kr;
2385 MACH_PORT_FACE reply_port;
2386
2387
2388
2389 clustered_writes[atop(size)]++;
2390
2391 dev_offset = (ps->ps_offset +
2392 (offset >> (vm_page_shift - ps->ps_record_shift)));
2393 bytes_to_write = size;
2394
2395 if (vsa) {
2396 /*
2397 * Asynchronous write.
2398 */
2399 reply_port = vsa->reply_port;
2400 ip_lock(reply_port);
2401 reply_port->ip_sorights++;
2402 ip_reference(reply_port);
2403 ip_unlock(reply_port);
2404 {
2405 device_t device;
2406 device = dev_port_lookup(ps->ps_device);
2407
2408 vsa->vsa_addr = addr;
2409 kr=ds_device_write_common(device,
2410 reply_port,
2411 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2412 (dev_mode_t) 0,
2413 dev_offset,
2414 (io_buf_ptr_t) addr,
2415 size,
2416 (IO_WRITE | IO_CALL),
2417 &bytes_written);
2418 }
2419 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2420 if (verbose)
2421 dprintf(("%s0x%x, addr=0x%x,"
2422 "size=0x%x,offset=0x%x\n",
2423 "device_write_request returned ",
2424 kr, addr, size, offset));
2425 BS_STAT(ps->ps_bs,
2426 ps->ps_bs->bs_pages_out_fail += atop(size));
2427 /* do the completion notification to free resources */
2428 device_write_reply(reply_port, kr, 0);
2429 return PAGER_ERROR;
2430 }
2431 } else do {
2432 /*
2433 * Synchronous write.
2434 */
2435 {
2436 device_t device;
2437 device = dev_port_lookup(ps->ps_device);
2438 kr=ds_device_write_common(device,
2439 IP_NULL, 0,
2440 (dev_mode_t) 0,
2441 dev_offset,
2442 (io_buf_ptr_t) addr,
2443 size,
2444 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2445 &bytes_written);
2446 }
2447 if (kr != KERN_SUCCESS) {
2448 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2449 "device_write returned ",
2450 kr, addr, size, offset));
2451 BS_STAT(ps->ps_bs,
2452 ps->ps_bs->bs_pages_out_fail += atop(size));
2453 return PAGER_ERROR;
2454 }
2455 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2456 Panic("fragmented write");
2457 records_written = (bytes_written >>
2458 (vm_page_shift - ps->ps_record_shift));
2459 dev_offset += records_written;
2460 #if 1
2461 if (bytes_written != bytes_to_write) {
2462 dprintf(("wrote only %d bytes out of %d\n",
2463 bytes_written, bytes_to_write));
2464 }
2465 #endif
2466 bytes_to_write -= bytes_written;
2467 addr += bytes_written;
2468 } while (bytes_to_write > 0);
2469
2470 return PAGER_SUCCESS;
2471 }
2472
2473
2474 #else /* !DEVICE_PAGING */
2475
2476 kern_return_t
2477 ps_read_device(
2478 paging_segment_t ps,
2479 vm_offset_t offset,
2480 vm_offset_t *bufferp,
2481 unsigned int size,
2482 unsigned int *residualp,
2483 int flags)
2484 {
2485 panic("ps_read_device not supported");
2486 }
2487
2488 ps_write_device(
2489 paging_segment_t ps,
2490 vm_offset_t offset,
2491 vm_offset_t addr,
2492 unsigned int size,
2493 struct vs_async *vsa)
2494 {
2495 panic("ps_write_device not supported");
2496 }
2497
2498 #endif /* DEVICE_PAGING */
2499 void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
2500
2501 void
2502 pvs_object_data_provided(
2503 vstruct_t vs,
2504 upl_t upl,
2505 vm_offset_t offset,
2506 vm_size_t size)
2507 {
2508
2509 DEBUG(DEBUG_VS_INTERNAL,
2510 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2511 upl, offset, size));
2512
2513 ASSERT(size > 0);
2514 GSTAT(global_stats.gs_pages_in += atop(size));
2515
2516
2517 #if USE_PRECIOUS
2518 ps_clunmap(vs, offset, size);
2519 #endif /* USE_PRECIOUS */
2520
2521 }
2522
2523 kern_return_t
2524 pvs_cluster_read(
2525 vstruct_t vs,
2526 vm_offset_t vs_offset,
2527 vm_size_t cnt)
2528 {
2529 upl_t upl;
2530 kern_return_t error = KERN_SUCCESS;
2531 int size;
2532 unsigned int residual;
2533 unsigned int request_flags;
2534 int seg_index;
2535 int pages_in_cl;
2536 int cl_size;
2537 int cl_mask;
2538 int cl_index;
2539 int xfer_size;
2540 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2541 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2542 struct clmap clmap;
2543
2544 pages_in_cl = 1 << vs->vs_clshift;
2545 cl_size = pages_in_cl * vm_page_size;
2546 cl_mask = cl_size - 1;
2547
2548 /*
2549 * This loop will be executed multiple times until the entire
2550 * request has been satisfied... if the request spans cluster
2551 * boundaries, the clusters will be checked for logical continunity,
2552 * if contiguous the I/O request will span multiple clusters, otherwise
2553 * it will be broken up into the minimal set of I/O's
2554 *
2555 * If there are holes in a request (either unallocated pages in a paging
2556 * segment or an unallocated paging segment), we stop
2557 * reading at the hole, inform the VM of any data read, inform
2558 * the VM of an unavailable range, then loop again, hoping to
2559 * find valid pages later in the requested range. This continues until
2560 * the entire range has been examined, and read, if present.
2561 */
2562
2563 #if USE_PRECIOUS
2564 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS;
2565 #else
2566 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE ;
2567 #endif
2568 while (cnt && (error == KERN_SUCCESS)) {
2569 int ps_info_valid;
2570 int page_list_count;
2571
2572 if (cnt > VM_SUPER_CLUSTER)
2573 size = VM_SUPER_CLUSTER;
2574 else
2575 size = cnt;
2576 cnt -= size;
2577
2578 ps_info_valid = 0;
2579 seg_index = 0;
2580
2581 while (size > 0 && error == KERN_SUCCESS) {
2582 int abort_size;
2583 int failed_size;
2584 int beg_pseg;
2585 int beg_indx;
2586 vm_offset_t cur_offset;
2587
2588
2589 if ( !ps_info_valid) {
2590 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2591 psp[seg_index] = CLMAP_PS(clmap);
2592 ps_info_valid = 1;
2593 }
2594 /*
2595 * skip over unallocated physical segments
2596 */
2597 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2598 abort_size = cl_size - (vs_offset & cl_mask);
2599 abort_size = MIN(abort_size, size);
2600
2601 page_list_count = 0;
2602 memory_object_super_upl_request(
2603 vs->vs_control,
2604 (memory_object_offset_t)vs_offset,
2605 abort_size, abort_size,
2606 &upl, NULL, &page_list_count,
2607 request_flags);
2608
2609 if (clmap.cl_error) {
2610 upl_abort(upl, UPL_ABORT_ERROR);
2611 } else {
2612 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2613 }
2614 upl_deallocate(upl);
2615
2616 size -= abort_size;
2617 vs_offset += abort_size;
2618
2619 seg_index++;
2620 ps_info_valid = 0;
2621 continue;
2622 }
2623 cl_index = (vs_offset & cl_mask) / vm_page_size;
2624
2625 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2626 /*
2627 * skip over unallocated pages
2628 */
2629 if (CLMAP_ISSET(clmap, cl_index))
2630 break;
2631 abort_size += vm_page_size;
2632 }
2633 if (abort_size) {
2634 /*
2635 * Let VM system know about holes in clusters.
2636 */
2637 GSTAT(global_stats.gs_pages_unavail += atop(abort_size));
2638
2639 page_list_count = 0;
2640 memory_object_super_upl_request(
2641 vs->vs_control,
2642 (memory_object_offset_t)vs_offset,
2643 abort_size, abort_size,
2644 &upl, NULL, &page_list_count,
2645 request_flags);
2646
2647 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2648 upl_deallocate(upl);
2649
2650 size -= abort_size;
2651 vs_offset += abort_size;
2652
2653 if (cl_index == pages_in_cl) {
2654 /*
2655 * if we're at the end of this physical cluster
2656 * then bump to the next one and continue looking
2657 */
2658 seg_index++;
2659 ps_info_valid = 0;
2660 continue;
2661 }
2662 if (size == 0)
2663 break;
2664 }
2665 /*
2666 * remember the starting point of the first allocated page
2667 * for the I/O we're about to issue
2668 */
2669 beg_pseg = seg_index;
2670 beg_indx = cl_index;
2671 cur_offset = vs_offset;
2672
2673 /*
2674 * calculate the size of the I/O that we can do...
2675 * this may span multiple physical segments if
2676 * they are contiguous
2677 */
2678 for (xfer_size = 0; xfer_size < size; ) {
2679
2680 while (cl_index < pages_in_cl && xfer_size < size) {
2681 /*
2682 * accumulate allocated pages within a physical segment
2683 */
2684 if (CLMAP_ISSET(clmap, cl_index)) {
2685 xfer_size += vm_page_size;
2686 cur_offset += vm_page_size;
2687 cl_index++;
2688
2689 BS_STAT(psp[seg_index]->ps_bs,
2690 psp[seg_index]->ps_bs->bs_pages_in++);
2691 } else
2692 break;
2693 }
2694 if (cl_index < pages_in_cl || xfer_size >= size) {
2695 /*
2696 * we've hit an unallocated page or the
2697 * end of this request... go fire the I/O
2698 */
2699 break;
2700 }
2701 /*
2702 * we've hit the end of the current physical segment
2703 * and there's more to do, so try moving to the next one
2704 */
2705 seg_index++;
2706
2707 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2708 psp[seg_index] = CLMAP_PS(clmap);
2709 ps_info_valid = 1;
2710
2711 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2712 /*
2713 * if the physical segment we're about to step into
2714 * is not contiguous to the one we're currently
2715 * in, or it's in a different paging file, or
2716 * it hasn't been allocated....
2717 * we stop here and generate the I/O
2718 */
2719 break;
2720 }
2721 /*
2722 * start with first page of the next physical segment
2723 */
2724 cl_index = 0;
2725 }
2726 if (xfer_size) {
2727 /*
2728 * we have a contiguous range of allocated pages
2729 * to read from
2730 */
2731 page_list_count = 0;
2732 memory_object_super_upl_request(vs->vs_control,
2733 (memory_object_offset_t)vs_offset,
2734 xfer_size, xfer_size,
2735 &upl, NULL, &page_list_count,
2736 request_flags | UPL_SET_INTERNAL);
2737
2738 error = ps_read_file(psp[beg_pseg], upl, (vm_offset_t) 0,
2739 ps_offset[beg_pseg] + (beg_indx * vm_page_size), xfer_size, &residual, 0);
2740 } else
2741 continue;
2742
2743 failed_size = 0;
2744
2745 /*
2746 * Adjust counts and send response to VM. Optimize for the
2747 * common case, i.e. no error and/or partial data.
2748 * If there was an error, then we need to error the entire
2749 * range, even if some data was successfully read.
2750 * If there was a partial read we may supply some
2751 * data and may error some as well. In all cases the
2752 * VM must receive some notification for every page in the
2753 * range.
2754 */
2755 if ((error == KERN_SUCCESS) && (residual == 0)) {
2756 /*
2757 * Got everything we asked for, supply the data to
2758 * the VM. Note that as a side effect of supplying
2759 * the data, the buffer holding the supplied data is
2760 * deallocated from the pager's address space.
2761 */
2762 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
2763 } else {
2764 failed_size = xfer_size;
2765
2766 if (error == KERN_SUCCESS) {
2767 if (residual == xfer_size) {
2768 /*
2769 * If a read operation returns no error
2770 * and no data moved, we turn it into
2771 * an error, assuming we're reading at
2772 * or beyong EOF.
2773 * Fall through and error the entire
2774 * range.
2775 */
2776 error = KERN_FAILURE;
2777 } else {
2778 /*
2779 * Otherwise, we have partial read. If
2780 * the part read is a integral number
2781 * of pages supply it. Otherwise round
2782 * it up to a page boundary, zero fill
2783 * the unread part, and supply it.
2784 * Fall through and error the remainder
2785 * of the range, if any.
2786 */
2787 int fill, lsize;
2788
2789 fill = residual & ~vm_page_size;
2790 lsize = (xfer_size - residual) + fill;
2791 pvs_object_data_provided(vs, upl, vs_offset, lsize);
2792
2793 if (lsize < xfer_size) {
2794 failed_size = xfer_size - lsize;
2795 error = KERN_FAILURE;
2796 }
2797 }
2798 }
2799 }
2800 /*
2801 * If there was an error in any part of the range, tell
2802 * the VM. Note that error is explicitly checked again since
2803 * it can be modified above.
2804 */
2805 if (error != KERN_SUCCESS) {
2806 BS_STAT(psp[beg_pseg]->ps_bs,
2807 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop(failed_size));
2808 }
2809 size -= xfer_size;
2810 vs_offset += xfer_size;
2811 }
2812
2813 } /* END while (cnt && (error == 0)) */
2814 return error;
2815 }
2816
2817 int vs_do_async_write = 1;
2818
2819 kern_return_t
2820 vs_cluster_write(
2821 vstruct_t vs,
2822 upl_t internal_upl,
2823 vm_offset_t offset,
2824 vm_size_t cnt,
2825 boolean_t dp_internal,
2826 int flags)
2827 {
2828 vm_offset_t size;
2829 vm_offset_t transfer_size;
2830 int error = 0;
2831 struct clmap clmap;
2832
2833 vm_offset_t actual_offset; /* Offset within paging segment */
2834 paging_segment_t ps;
2835 vm_offset_t subx_size;
2836 vm_offset_t mobj_base_addr;
2837 vm_offset_t mobj_target_addr;
2838 int mobj_size;
2839
2840 struct vs_async *vsa;
2841 vm_map_copy_t copy;
2842
2843 upl_t upl;
2844 upl_page_info_t *pl;
2845 int page_index;
2846 int list_size;
2847 int cl_size;
2848
2849 if (!dp_internal) {
2850 int page_list_count;
2851 int request_flags;
2852 int super_size;
2853 int first_dirty;
2854 int num_dirty;
2855 int num_of_pages;
2856 int seg_index;
2857 int pages_in_cl;
2858 int must_abort;
2859 vm_offset_t upl_offset;
2860 vm_offset_t seg_offset;
2861 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2862 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2863
2864
2865 pages_in_cl = 1 << vs->vs_clshift;
2866 cl_size = pages_in_cl * vm_page_size;
2867
2868 if (bs_low) {
2869 super_size = cl_size;
2870
2871 request_flags = UPL_NOBLOCK |
2872 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2873 UPL_NO_SYNC | UPL_SET_INTERNAL;
2874 } else {
2875 super_size = VM_SUPER_CLUSTER;
2876
2877 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2878 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2879 UPL_NO_SYNC | UPL_SET_INTERNAL;
2880 }
2881
2882 page_list_count = 0;
2883 memory_object_super_upl_request(vs->vs_control,
2884 (memory_object_offset_t)offset,
2885 cnt, super_size,
2886 &upl, NULL, &page_list_count,
2887 request_flags | UPL_PAGEOUT);
2888
2889 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2890
2891 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
2892
2893 ps_offset[seg_index] = ps_clmap(vs, upl->offset + (seg_index * cl_size),
2894 &clmap, CL_ALLOC,
2895 transfer_size < cl_size ?
2896 transfer_size : cl_size, 0);
2897
2898 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2899 upl_abort(upl, 0);
2900 upl_deallocate(upl);
2901
2902 return KERN_FAILURE;
2903
2904 }
2905 psp[seg_index] = CLMAP_PS(clmap);
2906
2907 if (transfer_size > cl_size) {
2908 transfer_size -= cl_size;
2909 seg_index++;
2910 } else
2911 transfer_size = 0;
2912 }
2913 for (page_index = 0, num_of_pages = upl->size / vm_page_size; page_index < num_of_pages; ) {
2914 /*
2915 * skip over non-dirty pages
2916 */
2917 for ( ; page_index < num_of_pages; page_index++) {
2918 if (UPL_DIRTY_PAGE(pl, page_index) || UPL_PRECIOUS_PAGE(pl, page_index))
2919 /*
2920 * this is a page we need to write
2921 * go see if we can buddy it up with others
2922 * that are contiguous to it
2923 */
2924 break;
2925 /*
2926 * if the page is not-dirty, but present we need to commit it...
2927 * this is an unusual case since we only asked for dirty pages
2928 */
2929 if (UPL_PAGE_PRESENT(pl, page_index)) {
2930 boolean_t empty = FALSE;
2931 upl_commit_range(upl,
2932 page_index * vm_page_size,
2933 vm_page_size,
2934 UPL_COMMIT_NOTIFY_EMPTY,
2935 pl,
2936 MAX_UPL_TRANSFER,
2937 &empty);
2938 if (empty)
2939 upl_deallocate(upl);
2940 }
2941 }
2942 if (page_index == num_of_pages)
2943 /*
2944 * no more pages to look at, we're out of here
2945 */
2946 break;
2947
2948 /*
2949 * gather up contiguous dirty pages... we have at least 1
2950 * otherwise we would have bailed above
2951 * make sure that each physical segment that we step
2952 * into is contiguous to the one we're currently in
2953 * if it's not, we have to stop and write what we have
2954 */
2955 for (first_dirty = page_index; page_index < num_of_pages; ) {
2956 if ( !UPL_DIRTY_PAGE(pl, page_index) && !UPL_PRECIOUS_PAGE(pl, page_index))
2957 break;
2958 page_index++;
2959 /*
2960 * if we just looked at the last page in the UPL
2961 * we don't need to check for physical segment
2962 * continuity
2963 */
2964 if (page_index < num_of_pages) {
2965 int cur_seg;
2966 int nxt_seg;
2967
2968 cur_seg = (page_index - 1) / pages_in_cl;
2969 nxt_seg = page_index / pages_in_cl;
2970
2971 if (cur_seg != nxt_seg) {
2972 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
2973 /*
2974 * if the segment we're about to step into
2975 * is not contiguous to the one we're currently
2976 * in, or it's in a different paging file....
2977 * we stop here and generate the I/O
2978 */
2979 break;
2980 }
2981 }
2982 }
2983 num_dirty = page_index - first_dirty;
2984 must_abort = 1;
2985
2986 if (num_dirty) {
2987 upl_offset = first_dirty * vm_page_size;
2988 seg_index = first_dirty / pages_in_cl;
2989 seg_offset = upl_offset - (seg_index * cl_size);
2990 transfer_size = num_dirty * vm_page_size;
2991
2992 error = ps_write_file(psp[seg_index], upl, upl_offset,
2993 ps_offset[seg_index] + seg_offset, transfer_size, flags);
2994
2995 if (error == 0) {
2996 while (transfer_size) {
2997 int seg_size;
2998
2999 if ((seg_size = cl_size - (upl_offset % cl_size)) > transfer_size)
3000 seg_size = transfer_size;
3001
3002 ps_vs_write_complete(vs, upl->offset + upl_offset, seg_size, error);
3003
3004 transfer_size -= seg_size;
3005 upl_offset += seg_size;
3006 }
3007 must_abort = 0;
3008 }
3009 }
3010 if (must_abort) {
3011 boolean_t empty = FALSE;
3012 upl_abort_range(upl,
3013 first_dirty * vm_page_size,
3014 num_dirty * vm_page_size,
3015 UPL_ABORT_NOTIFY_EMPTY,
3016 &empty);
3017 if (empty)
3018 upl_deallocate(upl);
3019 }
3020 }
3021
3022 } else {
3023 assert(cnt <= (vm_page_size << vs->vs_clshift));
3024 list_size = cnt;
3025
3026 page_index = 0;
3027 /* The caller provides a mapped_data which is derived */
3028 /* from a temporary object. The targeted pages are */
3029 /* guaranteed to be set at offset 0 in the mapped_data */
3030 /* The actual offset however must still be derived */
3031 /* from the offset in the vs in question */
3032 mobj_base_addr = offset;
3033 mobj_target_addr = mobj_base_addr;
3034
3035 for (transfer_size = list_size; transfer_size != 0;) {
3036 actual_offset = ps_clmap(vs, mobj_target_addr,
3037 &clmap, CL_ALLOC,
3038 transfer_size < cl_size ?
3039 transfer_size : cl_size, 0);
3040 if(actual_offset == (vm_offset_t) -1) {
3041 error = 1;
3042 break;
3043 }
3044 cnt = MIN(transfer_size,
3045 CLMAP_NPGS(clmap) * vm_page_size);
3046 ps = CLMAP_PS(clmap);
3047 /* Assume that the caller has given us contiguous */
3048 /* pages */
3049 if(cnt) {
3050 error = ps_write_file(ps, internal_upl,
3051 0, actual_offset,
3052 cnt, flags);
3053 if (error)
3054 break;
3055 ps_vs_write_complete(vs, mobj_target_addr,
3056 cnt, error);
3057 }
3058 if (error)
3059 break;
3060 actual_offset += cnt;
3061 mobj_target_addr += cnt;
3062 transfer_size -= cnt;
3063 cnt = 0;
3064
3065 if (error)
3066 break;
3067 }
3068 }
3069 if(error)
3070 return KERN_FAILURE;
3071 else
3072 return KERN_SUCCESS;
3073 }
3074
3075 vm_size_t
3076 ps_vstruct_allocated_size(
3077 vstruct_t vs)
3078 {
3079 int num_pages;
3080 struct vs_map *vsmap;
3081 int i, j, k;
3082
3083 num_pages = 0;
3084 if (vs->vs_indirect) {
3085 /* loop on indirect maps */
3086 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3087 vsmap = vs->vs_imap[i];
3088 if (vsmap == NULL)
3089 continue;
3090 /* loop on clusters in this indirect map */
3091 for (j = 0; j < CLMAP_ENTRIES; j++) {
3092 if (VSM_ISCLR(vsmap[j]) ||
3093 VSM_ISERR(vsmap[j]))
3094 continue;
3095 /* loop on pages in this cluster */
3096 for (k = 0; k < VSCLSIZE(vs); k++) {
3097 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3098 num_pages++;
3099 }
3100 }
3101 }
3102 } else {
3103 vsmap = vs->vs_dmap;
3104 if (vsmap == NULL)
3105 return 0;
3106 /* loop on clusters in the direct map */
3107 for (j = 0; j < CLMAP_ENTRIES; j++) {
3108 if (VSM_ISCLR(vsmap[j]) ||
3109 VSM_ISERR(vsmap[j]))
3110 continue;
3111 /* loop on pages in this cluster */
3112 for (k = 0; k < VSCLSIZE(vs); k++) {
3113 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3114 num_pages++;
3115 }
3116 }
3117 }
3118
3119 return ptoa(num_pages);
3120 }
3121
3122 size_t
3123 ps_vstruct_allocated_pages(
3124 vstruct_t vs,
3125 default_pager_page_t *pages,
3126 size_t pages_size)
3127 {
3128 int num_pages;
3129 struct vs_map *vsmap;
3130 vm_offset_t offset;
3131 int i, j, k;
3132
3133 num_pages = 0;
3134 offset = 0;
3135 if (vs->vs_indirect) {
3136 /* loop on indirect maps */
3137 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3138 vsmap = vs->vs_imap[i];
3139 if (vsmap == NULL) {
3140 offset += (vm_page_size * CLMAP_ENTRIES *
3141 VSCLSIZE(vs));
3142 continue;
3143 }
3144 /* loop on clusters in this indirect map */
3145 for (j = 0; j < CLMAP_ENTRIES; j++) {
3146 if (VSM_ISCLR(vsmap[j]) ||
3147 VSM_ISERR(vsmap[j])) {
3148 offset += vm_page_size * VSCLSIZE(vs);
3149 continue;
3150 }
3151 /* loop on pages in this cluster */
3152 for (k = 0; k < VSCLSIZE(vs); k++) {
3153 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3154 num_pages++;
3155 if (num_pages < pages_size)
3156 pages++->dpp_offset =
3157 offset;
3158 }
3159 offset += vm_page_size;
3160 }
3161 }
3162 }
3163 } else {
3164 vsmap = vs->vs_dmap;
3165 if (vsmap == NULL)
3166 return 0;
3167 /* loop on clusters in the direct map */
3168 for (j = 0; j < CLMAP_ENTRIES; j++) {
3169 if (VSM_ISCLR(vsmap[j]) ||
3170 VSM_ISERR(vsmap[j])) {
3171 offset += vm_page_size * VSCLSIZE(vs);
3172 continue;
3173 }
3174 /* loop on pages in this cluster */
3175 for (k = 0; k < VSCLSIZE(vs); k++) {
3176 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3177 num_pages++;
3178 if (num_pages < pages_size)
3179 pages++->dpp_offset = offset;
3180 }
3181 offset += vm_page_size;
3182 }
3183 }
3184 }
3185
3186 return num_pages;
3187 }
3188
3189
3190 kern_return_t
3191 ps_vstruct_transfer_from_segment(
3192 vstruct_t vs,
3193 paging_segment_t segment,
3194 #ifndef ubc_sync_working
3195 vm_object_t transfer_object)
3196 #else
3197 upl_t upl)
3198 #endif
3199 {
3200 struct vs_map *vsmap;
3201 struct vs_map old_vsmap;
3202 struct vs_map new_vsmap;
3203 int i, j, k;
3204
3205 VS_LOCK(vs); /* block all work on this vstruct */
3206 /* can't allow the normal multiple write */
3207 /* semantic because writes may conflict */
3208 vs->vs_xfer_pending = TRUE;
3209 vs_wait_for_sync_writers(vs);
3210 vs_start_write(vs);
3211 vs_wait_for_readers(vs);
3212 /* we will unlock the vs to allow other writes while transferring */
3213 /* and will be guaranteed of the persistance of the vs struct */
3214 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3215 /* vs_async_pending */
3216 /* OK we now have guaranteed no other parties are accessing this */
3217 /* vs. Now that we are also supporting simple lock versions of */
3218 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3219 /* our purpose in holding it before was the multiple write case */
3220 /* we now use the boolean xfer_pending to do that. We can use */
3221 /* a boolean instead of a count because we have guaranteed single */
3222 /* file access to this code in its caller */
3223 VS_UNLOCK(vs);
3224 vs_changed:
3225 if (vs->vs_indirect) {
3226 int vsmap_size;
3227 int clmap_off;
3228 /* loop on indirect maps */
3229 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3230 vsmap = vs->vs_imap[i];
3231 if (vsmap == NULL)
3232 continue;
3233 /* loop on clusters in this indirect map */
3234 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3235 VSCLSIZE(vs) * i);
3236 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3237 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3238 else
3239 vsmap_size = CLMAP_ENTRIES;
3240 for (j = 0; j < vsmap_size; j++) {
3241 if (VSM_ISCLR(vsmap[j]) ||
3242 VSM_ISERR(vsmap[j]) ||
3243 (VSM_PS(vsmap[j]) != segment))
3244 continue;
3245 if(vs_cluster_transfer(vs,
3246 (vm_page_size * (j << vs->vs_clshift))
3247 + clmap_off,
3248 vm_page_size << vs->vs_clshift,
3249 #ifndef ubc_sync_working
3250 transfer_object)
3251 #else
3252 upl)
3253 #endif
3254 != KERN_SUCCESS) {
3255 VS_LOCK(vs);
3256 vs->vs_xfer_pending = FALSE;
3257 VS_UNLOCK(vs);
3258 vs_finish_write(vs);
3259 return KERN_FAILURE;
3260 }
3261 /* allow other readers/writers during transfer*/
3262 VS_LOCK(vs);
3263 vs->vs_xfer_pending = FALSE;
3264 VS_UNLOCK(vs);
3265 vs_finish_write(vs);
3266 VS_LOCK(vs);
3267 vs->vs_xfer_pending = TRUE;
3268 vs_wait_for_sync_writers(vs);
3269 vs_start_write(vs);
3270 vs_wait_for_readers(vs);
3271 VS_UNLOCK(vs);
3272 if (!(vs->vs_indirect)) {
3273 goto vs_changed;
3274 }
3275 }
3276 }
3277 } else {
3278 vsmap = vs->vs_dmap;
3279 if (vsmap == NULL) {
3280 VS_LOCK(vs);
3281 vs->vs_xfer_pending = FALSE;
3282 VS_UNLOCK(vs);
3283 vs_finish_write(vs);
3284 return KERN_SUCCESS;
3285 }
3286 /* loop on clusters in the direct map */
3287 for (j = 0; j < vs->vs_size; j++) {
3288 if (VSM_ISCLR(vsmap[j]) ||
3289 VSM_ISERR(vsmap[j]) ||
3290 (VSM_PS(vsmap[j]) != segment))
3291 continue;
3292 if(vs_cluster_transfer(vs,
3293 vm_page_size * (j << vs->vs_clshift),
3294 vm_page_size << vs->vs_clshift,
3295 #ifndef ubc_sync_working
3296 transfer_object) != KERN_SUCCESS) {
3297 #else
3298 upl) != KERN_SUCCESS) {
3299 #endif
3300 VS_LOCK(vs);
3301 vs->vs_xfer_pending = FALSE;
3302 VS_UNLOCK(vs);
3303 vs_finish_write(vs);
3304 return KERN_FAILURE;
3305 }
3306 /* allow other readers/writers during transfer*/
3307 VS_LOCK(vs);
3308 vs->vs_xfer_pending = FALSE;
3309 VS_UNLOCK(vs);
3310 vs_finish_write(vs);
3311 VS_LOCK(vs);
3312 vs->vs_xfer_pending = TRUE;
3313 VS_UNLOCK(vs);
3314 vs_wait_for_sync_writers(vs);
3315 vs_start_write(vs);
3316 vs_wait_for_readers(vs);
3317 if (vs->vs_indirect) {
3318 goto vs_changed;
3319 }
3320 }
3321 }
3322
3323 VS_LOCK(vs);
3324 vs->vs_xfer_pending = FALSE;
3325 VS_UNLOCK(vs);
3326 vs_finish_write(vs);
3327 return KERN_SUCCESS;
3328 }
3329
3330
3331
3332 vs_map_t
3333 vs_get_map_entry(
3334 vstruct_t vs,
3335 vm_offset_t offset)
3336 {
3337 struct vs_map *vsmap;
3338 vm_offset_t cluster;
3339
3340 cluster = atop(offset) >> vs->vs_clshift;
3341 if (vs->vs_indirect) {
3342 long ind_block = cluster/CLMAP_ENTRIES;
3343
3344 /* Is the indirect block allocated? */
3345 vsmap = vs->vs_imap[ind_block];
3346 if(vsmap == (vs_map_t) NULL)
3347 return vsmap;
3348 } else
3349 vsmap = vs->vs_dmap;
3350 vsmap += cluster%CLMAP_ENTRIES;
3351 return vsmap;
3352 }
3353
3354 kern_return_t
3355 vs_cluster_transfer(
3356 vstruct_t vs,
3357 vm_offset_t offset,
3358 vm_size_t cnt,
3359 #ifndef ubc_sync_working
3360 vm_object_t transfer_object)
3361 #else
3362 upl_t upl)
3363 #endif
3364 {
3365 vm_offset_t actual_offset;
3366 paging_segment_t ps;
3367 struct clmap clmap;
3368 kern_return_t error = KERN_SUCCESS;
3369 int size, size_wanted, i;
3370 unsigned int residual;
3371 int unavail_size;
3372 default_pager_thread_t *dpt;
3373 boolean_t dealloc;
3374 struct vs_map *vsmap_ptr;
3375 struct vs_map read_vsmap;
3376 struct vs_map original_read_vsmap;
3377 struct vs_map write_vsmap;
3378 upl_t sync_upl;
3379 #ifndef ubc_sync_working
3380 upl_t upl;
3381 #endif
3382
3383 vm_offset_t ioaddr;
3384
3385 /* vs_cluster_transfer reads in the pages of a cluster and
3386 * then writes these pages back to new backing store. The
3387 * segment the pages are being read from is assumed to have
3388 * been taken off-line and is no longer considered for new
3389 * space requests.
3390 */
3391
3392 /*
3393 * This loop will be executed once per cluster referenced.
3394 * Typically this means once, since it's unlikely that the
3395 * VM system will ask for anything spanning cluster boundaries.
3396 *
3397 * If there are holes in a cluster (in a paging segment), we stop
3398 * reading at the hole, then loop again, hoping to
3399 * find valid pages later in the cluster. This continues until
3400 * the entire range has been examined, and read, if present. The
3401 * pages are written as they are read. If a failure occurs after
3402 * some pages are written the unmap call at the bottom of the loop
3403 * recovers the backing store and the old backing store remains
3404 * in effect.
3405 */
3406
3407 VSM_CLR(write_vsmap);
3408 VSM_CLR(original_read_vsmap);
3409 /* grab the actual object's pages to sync with I/O */
3410 while (cnt && (error == KERN_SUCCESS)) {
3411 vsmap_ptr = vs_get_map_entry(vs, offset);
3412 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3413
3414 if (actual_offset == (vm_offset_t) -1) {
3415
3416 /*
3417 * Nothing left to write in this cluster at least
3418 * set write cluster information for any previous
3419 * write, clear for next cluster, if there is one
3420 */
3421 unsigned int local_size, clmask, clsize;
3422
3423 clsize = vm_page_size << vs->vs_clshift;
3424 clmask = clsize - 1;
3425 local_size = clsize - (offset & clmask);
3426 ASSERT(local_size);
3427 local_size = MIN(local_size, cnt);
3428
3429 /* This cluster has no data in it beyond what may */
3430 /* have been found on a previous iteration through */
3431 /* the loop "write_vsmap" */
3432 *vsmap_ptr = write_vsmap;
3433 VSM_CLR(write_vsmap);
3434 VSM_CLR(original_read_vsmap);
3435
3436 cnt -= local_size;
3437 offset += local_size;
3438 continue;
3439 }
3440
3441 /*
3442 * Count up contiguous available or unavailable
3443 * pages.
3444 */
3445 ps = CLMAP_PS(clmap);
3446 ASSERT(ps);
3447 size = 0;
3448 unavail_size = 0;
3449 for (i = 0;
3450 (size < cnt) && (unavail_size < cnt) &&
3451 (i < CLMAP_NPGS(clmap)); i++) {
3452 if (CLMAP_ISSET(clmap, i)) {
3453 if (unavail_size != 0)
3454 break;
3455 size += vm_page_size;
3456 BS_STAT(ps->ps_bs,
3457 ps->ps_bs->bs_pages_in++);
3458 } else {
3459 if (size != 0)
3460 break;
3461 unavail_size += vm_page_size;
3462 }
3463 }
3464
3465 if (size == 0) {
3466 ASSERT(unavail_size);
3467 cnt -= unavail_size;
3468 offset += unavail_size;
3469 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3470 == 0) {
3471 /* There is no more to transfer in this
3472 cluster
3473 */
3474 *vsmap_ptr = write_vsmap;
3475 VSM_CLR(write_vsmap);
3476 VSM_CLR(original_read_vsmap);
3477 }
3478 continue;
3479 }
3480
3481 if(VSM_ISCLR(original_read_vsmap))
3482 original_read_vsmap = *vsmap_ptr;
3483
3484 if(ps->ps_segtype == PS_PARTITION) {
3485 /*
3486 NEED TO BE WITH SYNC & NO COMMIT
3487 error = ps_read_device(ps, actual_offset, &buffer,
3488 size, &residual, flags);
3489 */
3490 } else {
3491 #ifndef ubc_sync_working
3492 int page_list_count = 0;
3493
3494 error = vm_object_upl_request(transfer_object,
3495 (vm_object_offset_t) (actual_offset & ((vm_page_size << vs->vs_clshift) - 1)),
3496 size, &upl, NULL, &page_list_count,
3497 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
3498 | UPL_SET_INTERNAL);
3499 if (error == KERN_SUCCESS) {
3500 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
3501 size, &residual, 0);
3502 if(error)
3503 upl_commit(upl, NULL);
3504 upl_deallocate(upl);
3505 }
3506
3507 #else
3508 /* NEED TO BE WITH SYNC & NO COMMIT & NO RDAHEAD*/
3509 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
3510 size, &residual,
3511 (UPL_IOSYNC | UPL_NOCOMMIT | UPL_NORDAHEAD));
3512 #endif
3513 }
3514
3515 read_vsmap = *vsmap_ptr;
3516
3517
3518 /*
3519 * Adjust counts and put data in new BS. Optimize for the
3520 * common case, i.e. no error and/or partial data.
3521 * If there was an error, then we need to error the entire
3522 * range, even if some data was successfully read.
3523 *
3524 */
3525 if ((error == KERN_SUCCESS) && (residual == 0)) {
3526 int page_list_count = 0;
3527
3528 /*
3529 * Got everything we asked for, supply the data to
3530 * the new BS. Note that as a side effect of supplying
3531 * the data, the buffer holding the supplied data is
3532 * deallocated from the pager's address space unless
3533 * the write is unsuccessful.
3534 */
3535
3536 /* note buffer will be cleaned up in all cases by */
3537 /* internal_cluster_write or if an error on write */
3538 /* the vm_map_copy_page_discard call */
3539 *vsmap_ptr = write_vsmap;
3540
3541 #ifndef ubc_sync_working
3542 error = vm_object_upl_request(transfer_object,
3543 (vm_object_offset_t)
3544 (actual_offset & ((vm_page_size << vs->vs_clshift) - 1)),
3545 size, &upl, NULL, &page_list_count,
3546 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL);
3547 if(vs_cluster_write(vs, upl, offset,
3548 size, TRUE, 0) != KERN_SUCCESS) {
3549 upl_commit(upl, NULL);
3550 upl_deallocate(upl);
3551 #else
3552 if(vs_cluster_write(vs, upl, offset,
3553 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3554 #endif
3555 error = KERN_FAILURE;
3556 if(!(VSM_ISCLR(*vsmap_ptr))) {
3557 /* unmap the new backing store object */
3558 ps_clunmap(vs, offset, size);
3559 }
3560 /* original vsmap */
3561 *vsmap_ptr = original_read_vsmap;
3562 VSM_CLR(write_vsmap);
3563 } else {
3564 if((offset + size) &
3565 ((vm_page_size << vs->vs_clshift)
3566 - 1)) {
3567 /* There is more to transfer in this
3568 cluster
3569 */
3570 write_vsmap = *vsmap_ptr;
3571 *vsmap_ptr = read_vsmap;
3572 } else {
3573 /* discard the old backing object */
3574 write_vsmap = *vsmap_ptr;
3575 *vsmap_ptr = read_vsmap;
3576 ps_clunmap(vs, offset, size);
3577 *vsmap_ptr = write_vsmap;
3578 VSM_CLR(write_vsmap);
3579 VSM_CLR(original_read_vsmap);
3580 }
3581 }
3582 } else {
3583 size_wanted = size;
3584 if (error == KERN_SUCCESS) {
3585 if (residual == size) {
3586 /*
3587 * If a read operation returns no error
3588 * and no data moved, we turn it into
3589 * an error, assuming we're reading at
3590 * or beyond EOF.
3591 * Fall through and error the entire
3592 * range.
3593 */
3594 error = KERN_FAILURE;
3595 *vsmap_ptr = write_vsmap;
3596 if(!(VSM_ISCLR(*vsmap_ptr))) {
3597 /* unmap the new backing store object */
3598 ps_clunmap(vs, offset, size);
3599 }
3600 *vsmap_ptr = original_read_vsmap;
3601 VSM_CLR(write_vsmap);
3602 continue;
3603 } else {
3604 /*
3605 * Otherwise, we have partial read.
3606 * This is also considered an error
3607 * for the purposes of cluster transfer
3608 */
3609 error = KERN_FAILURE;
3610 *vsmap_ptr = write_vsmap;
3611 if(!(VSM_ISCLR(*vsmap_ptr))) {
3612 /* unmap the new backing store object */
3613 ps_clunmap(vs, offset, size);
3614 }
3615 *vsmap_ptr = original_read_vsmap;
3616 VSM_CLR(write_vsmap);
3617 continue;
3618 }
3619 }
3620
3621 }
3622 cnt -= size;
3623 offset += size;
3624
3625 } /* END while (cnt && (error == 0)) */
3626 if(!VSM_ISCLR(write_vsmap))
3627 *vsmap_ptr = write_vsmap;
3628
3629 return error;
3630 }
3631
3632 kern_return_t
3633 default_pager_add_file(MACH_PORT_FACE backing_store,
3634 int *vp,
3635 int record_size,
3636 long size)
3637 {
3638 backing_store_t bs;
3639 paging_segment_t ps;
3640 int i;
3641 int error;
3642
3643 if ((bs = backing_store_lookup(backing_store))
3644 == BACKING_STORE_NULL)
3645 return KERN_INVALID_ARGUMENT;
3646
3647 PSL_LOCK();
3648 for (i = 0; i <= paging_segment_max; i++) {
3649 ps = paging_segments[i];
3650 if (ps == PAGING_SEGMENT_NULL)
3651 continue;
3652 if (ps->ps_segtype != PS_FILE)
3653 continue;
3654
3655 /*
3656 * Check for overlap on same device.
3657 */
3658 if (ps->ps_vnode == (struct vnode *)vp) {
3659 PSL_UNLOCK();
3660 BS_UNLOCK(bs);
3661 return KERN_INVALID_ARGUMENT;
3662 }
3663 }
3664 PSL_UNLOCK();
3665
3666 /*
3667 * Set up the paging segment
3668 */
3669 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3670 if (ps == PAGING_SEGMENT_NULL) {
3671 BS_UNLOCK(bs);
3672 return KERN_RESOURCE_SHORTAGE;
3673 }
3674
3675 ps->ps_segtype = PS_FILE;
3676 ps->ps_vnode = (struct vnode *)vp;
3677 ps->ps_offset = 0;
3678 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3679 ps->ps_recnum = size;
3680 ps->ps_pgnum = size >> ps->ps_record_shift;
3681
3682 ps->ps_pgcount = ps->ps_pgnum;
3683 ps->ps_clshift = local_log2(bs->bs_clsize);
3684 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3685 ps->ps_hint = 0;
3686
3687 PS_LOCK_INIT(ps);
3688 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3689 if (!ps->ps_bmap) {
3690 kfree((vm_offset_t)ps, sizeof *ps);
3691 BS_UNLOCK(bs);
3692 return KERN_RESOURCE_SHORTAGE;
3693 }
3694 for (i = 0; i < ps->ps_ncls; i++) {
3695 clrbit(ps->ps_bmap, i);
3696 }
3697
3698 ps->ps_going_away = FALSE;
3699 ps->ps_bs = bs;
3700
3701 if ((error = ps_enter(ps)) != 0) {
3702 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3703 kfree((vm_offset_t)ps, sizeof *ps);
3704 BS_UNLOCK(bs);
3705 return KERN_RESOURCE_SHORTAGE;
3706 }
3707
3708 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3709 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3710 PSL_LOCK();
3711 dp_pages_free += ps->ps_pgcount;
3712 PSL_UNLOCK();
3713
3714 BS_UNLOCK(bs);
3715
3716 bs_more_space(ps->ps_clcount);
3717
3718 DEBUG(DEBUG_BS_INTERNAL,
3719 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3720 device, offset, size, record_size,
3721 ps->ps_record_shift, ps->ps_pgnum));
3722
3723 return KERN_SUCCESS;
3724 }
3725
3726
3727
3728 kern_return_t
3729 ps_read_file(
3730 paging_segment_t ps,
3731 upl_t upl,
3732 vm_offset_t upl_offset,
3733 vm_offset_t offset,
3734 unsigned int size,
3735 unsigned int *residualp,
3736 int flags)
3737 {
3738 vm_object_offset_t f_offset;
3739 int error = 0;
3740 int result;
3741
3742
3743 clustered_reads[atop(size)]++;
3744
3745 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3746
3747 /* for transfer case we need to pass uploffset and flags */
3748 error = vnode_pagein(ps->ps_vnode,
3749 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3750
3751 /* The vnode_pagein semantic is somewhat at odds with the existing */
3752 /* device_read semantic. Partial reads are not experienced at this */
3753 /* level. It is up to the bit map code and cluster read code to */
3754 /* check that requested data locations are actually backed, and the */
3755 /* pagein code to either read all of the requested data or return an */
3756 /* error. */
3757
3758 if (error)
3759 result = KERN_FAILURE;
3760 else {
3761 *residualp = 0;
3762 result = KERN_SUCCESS;
3763 }
3764 return result;
3765 }
3766
3767 kern_return_t
3768 ps_write_file(
3769 paging_segment_t ps,
3770 upl_t upl,
3771 vm_offset_t upl_offset,
3772 vm_offset_t offset,
3773 unsigned int size,
3774 int flags)
3775 {
3776 vm_object_offset_t f_offset;
3777 kern_return_t result;
3778
3779 int error = 0;
3780
3781 clustered_writes[atop(size)]++;
3782 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3783
3784 if (vnode_pageout(ps->ps_vnode,
3785 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3786 result = KERN_FAILURE;
3787 else
3788 result = KERN_SUCCESS;
3789
3790 return result;
3791 }
3792
3793 kern_return_t
3794 default_pager_triggers(MACH_PORT_FACE default_pager,
3795 int hi_wat,
3796 int lo_wat,
3797 int flags,
3798 MACH_PORT_FACE trigger_port)
3799 {
3800 MACH_PORT_FACE release;
3801 kern_return_t kr;
3802
3803 PSL_LOCK();
3804 if (flags == HI_WAT_ALERT) {
3805 release = min_pages_trigger_port;
3806 min_pages_trigger_port = trigger_port;
3807 minimum_pages_remaining = hi_wat/vm_page_size;
3808 bs_low = FALSE;
3809 kr = KERN_SUCCESS;
3810 } else if (flags == LO_WAT_ALERT) {
3811 release = max_pages_trigger_port;
3812 max_pages_trigger_port = trigger_port;
3813 maximum_pages_free = lo_wat/vm_page_size;
3814 kr = KERN_SUCCESS;
3815 } else {
3816 release = trigger_port;
3817 kr = KERN_INVALID_ARGUMENT;
3818 }
3819 PSL_UNLOCK();
3820
3821 if (IP_VALID(release))
3822 ipc_port_release_send(release);
3823
3824 return kr;
3825 }