1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 * @OSF_COPYRIGHT@
25 */
26 /*
27 * Mach Operating System
28 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
29 * All Rights Reserved.
30 *
31 * Permission to use, copy, modify and distribute this software and its
32 * documentation is hereby granted, provided that both the copyright
33 * notice and this permission notice appear in all copies of the
34 * software, derivative works or modified versions, and any portions
35 * thereof, and that both notices appear in supporting documentation.
36 *
37 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
38 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
39 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
40 *
41 * Carnegie Mellon requests users of this software to return to
42 *
43 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
44 * School of Computer Science
45 * Carnegie Mellon University
46 * Pittsburgh PA 15213-3890
47 *
48 * any improvements or extensions that they make and grant Carnegie Mellon
49 * the rights to redistribute these changes.
50 */
51
52 /*
53 * Default Pager.
54 * Paging File Management.
55 */
56
57 #include <mach/host_priv.h>
58 #include <mach/memory_object_control.h>
59 #include <mach/memory_object_server.h>
60 #include <mach/upl.h>
61 #include <default_pager/default_pager_internal.h>
62 #include <default_pager/default_pager_alerts.h>
63 #include <default_pager/default_pager_object_server.h>
64
65 #include <ipc/ipc_types.h>
66 #include <ipc/ipc_port.h>
67 #include <ipc/ipc_space.h>
68
69 #include <kern/kern_types.h>
70 #include <kern/host.h>
71 #include <kern/queue.h>
72 #include <kern/counters.h>
73 #include <kern/sched_prim.h>
74
75 #include <vm/vm_kern.h>
76 #include <vm/vm_pageout.h>
77 #include <vm/vm_map.h>
78 #include <vm/vm_object.h>
79 #include <vm/vm_protos.h>
80
81 /* LP64todo - need large internal object support */
82
83 /*
84 * ALLOC_STRIDE... the maximum number of bytes allocated from
85 * a swap file before moving on to the next swap file... if
86 * all swap files reside on a single disk, this value should
87 * be very large (this is the default assumption)... if the
88  * swap files are spread across multiple disks, then this value
89 * should be small (128 * 1024)...
90 *
91 * This should be determined dynamically in the future
92 */
93
94 #define ALLOC_STRIDE (1024 * 1024 * 1024)
95 int physical_transfer_cluster_count = 0;
96
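/*
 * Illustrative sketch (not part of the original file): how many clusters
 * fit in one ALLOC_STRIDE for a given cluster shift.  This is the bound
 * ps_select_segment() applies to physical_transfer_cluster_count before
 * rotating to the next paging segment.
 */
#if 0
static unsigned int
example_clusters_per_stride(int clshift)
{
	/* e.g. 4KB pages (vm_page_shift 12) and clshift 2: 1GB >> 14 = 65536 clusters */
	return ALLOC_STRIDE >> (clshift + vm_page_shift);
}
#endif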
97 #define VM_SUPER_CLUSTER 0x40000
98 #define VM_SUPER_PAGES 64
99
100 /*
101  * The cluster shift is in pages: 0 means 1 page/cluster, 1 means
102  * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
103 */
104 #define VSTRUCT_DEF_CLSHIFT 2
105 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
106 int default_pager_clsize = 0;
107
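/*
 * Illustrative sketch (not part of the original file): with the default
 * cluster shift of 2, each cluster covers 1 << 2 = 4 pages, i.e.
 * 4 * vm_page_size bytes (16KB with 4KB pages).
 */
#if 0
static unsigned int
example_cluster_geometry(void)
{
	unsigned int pages_per_cluster = 1 << vstruct_def_clshift;	/* 4 */

	return pages_per_cluster * vm_page_size;	/* bytes per cluster */
}
#endif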
108 /* statistics */
109 unsigned int clustered_writes[VM_SUPER_PAGES+1];
110 unsigned int clustered_reads[VM_SUPER_PAGES+1];
111
112 /*
113 * Globals used for asynchronous paging operations:
114 * vs_async_list: head of list of to-be-completed I/O ops
115 * async_num_queued: number of pages completed, but not yet
116 * processed by async thread.
117 * async_requests_out: number of pages of requests not completed.
118 */
119
120 #if 0
121 struct vs_async *vs_async_list;
122 int async_num_queued;
123 int async_requests_out;
124 #endif
125
126
127 #define VS_ASYNC_REUSE 1
128 struct vs_async *vs_async_free_list;
129
130 mutex_t default_pager_async_lock; /* Protects globals above */
131
132
133 int vs_alloc_async_failed = 0; /* statistics */
134 int vs_alloc_async_count = 0; /* statistics */
135 struct vs_async *vs_alloc_async(void); /* forward */
136 void vs_free_async(struct vs_async *vsa); /* forward */
137
138
139 #define VS_ALLOC_ASYNC() vs_alloc_async()
140 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
141
142 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
143 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
144 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, 0)
145 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
146 /*
147 * Paging Space Hysteresis triggers and the target notification port
148 *
149 */
150
151 unsigned int minimum_pages_remaining = 0;
152 unsigned int maximum_pages_free = 0;
153 ipc_port_t min_pages_trigger_port = NULL;
154 ipc_port_t max_pages_trigger_port = NULL;
155
156 boolean_t bs_low = FALSE;
157 int backing_store_release_trigger_disable = 0;
158
159
160 /* Have we decided if swap needs to be encrypted yet ? */
161 boolean_t dp_encryption_inited = FALSE;
162 /* Should we encrypt swap ? */
163 boolean_t dp_encryption = FALSE;
164
165
166 /*
167 * Object sizes are rounded up to the next power of 2,
168 * unless they are bigger than a given maximum size.
169 */
170 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
171
172 /*
173 * List of all backing store and segments.
174 */
175 struct backing_store_list_head backing_store_list;
176 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
177 mutex_t paging_segments_lock;
178 int paging_segment_max = 0;
179 int paging_segment_count = 0;
180 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
181
182
183 /*
184 * Total pages free in system
185  * This differs from clusters committed/available, which is a measure
186  * of the over-commitment of paging segments to backing store, an idea
187  * that is likely to be deprecated.
188 */
189 unsigned int dp_pages_free = 0;
190 unsigned int cluster_transfer_minimum = 100;
191
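/*
 * Illustrative sketch (not part of the original file): the high-water
 * check the allocation paths below perform against the hysteresis
 * trigger globals above (see ps_select_segment and ps_allocate_cluster).
 * The caller sends a HI_WAT_ALERT on the returned port and releases it.
 */
#if 0
static ipc_port_t
example_check_low_space_trigger(void)
{
	ipc_port_t trigger = IP_NULL;

	if (min_pages_trigger_port &&
	    (dp_pages_free < minimum_pages_remaining)) {
		trigger = min_pages_trigger_port;
		min_pages_trigger_port = NULL;
		bs_low = TRUE;
	}
	return trigger;
}
#endif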
192 /* forward declarations */
193 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int); /* forward */
194 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
195 default_pager_thread_t *get_read_buffer( void );
196 kern_return_t ps_vstruct_transfer_from_segment(
197 vstruct_t vs,
198 paging_segment_t segment,
199 upl_t upl);
200 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
201 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
202 kern_return_t vs_cluster_transfer(
203 vstruct_t vs,
204 upl_offset_t offset,
205 upl_size_t cnt,
206 upl_t upl);
207 vs_map_t vs_get_map_entry(
208 vstruct_t vs,
209 vm_offset_t offset);
210
211
212 default_pager_thread_t *
213 get_read_buffer( void )
214 {
215 int i;
216
217 DPT_LOCK(dpt_lock);
218 while(TRUE) {
219 for (i=0; i<default_pager_internal_count; i++) {
220 if(dpt_array[i]->checked_out == FALSE) {
221 dpt_array[i]->checked_out = TRUE;
222 DPT_UNLOCK(dpt_lock);
223 return dpt_array[i];
224 }
225 }
226 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
227 }
228 }
229
230 void
231 bs_initialize(void)
232 {
233 int i;
234
235 /*
236 * List of all backing store.
237 */
238 BSL_LOCK_INIT();
239 queue_init(&backing_store_list.bsl_queue);
240 PSL_LOCK_INIT();
241
242 VS_ASYNC_LOCK_INIT();
243 #if VS_ASYNC_REUSE
244 vs_async_free_list = NULL;
245 #endif /* VS_ASYNC_REUSE */
246
247 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
248 clustered_writes[i] = 0;
249 clustered_reads[i] = 0;
250 }
251
252 }
253
254 /*
255  * When things do not quite work out...
256 */
257 void bs_no_paging_space(boolean_t); /* forward */
258
259 void
260 bs_no_paging_space(
261 boolean_t out_of_memory)
262 {
263
264 if (out_of_memory)
265 dprintf(("*** OUT OF MEMORY ***\n"));
266 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
267 }
268
269 void bs_more_space(int); /* forward */
270 void bs_commit(int); /* forward */
271
272 boolean_t user_warned = FALSE;
273 unsigned int clusters_committed = 0;
274 unsigned int clusters_available = 0;
275 unsigned int clusters_committed_peak = 0;
276
277 void
278 bs_more_space(
279 int nclusters)
280 {
281 BSL_LOCK();
282 /*
283 * Account for new paging space.
284 */
285 clusters_available += nclusters;
286
287 if (clusters_available >= clusters_committed) {
288 if (verbose && user_warned) {
289 printf("%s%s - %d excess clusters now.\n",
290 my_name,
291 "paging space is OK now",
292 clusters_available - clusters_committed);
293 user_warned = FALSE;
294 clusters_committed_peak = 0;
295 }
296 } else {
297 if (verbose && user_warned) {
298 printf("%s%s - still short of %d clusters.\n",
299 my_name,
300 "WARNING: paging space over-committed",
301 clusters_committed - clusters_available);
302 clusters_committed_peak -= nclusters;
303 }
304 }
305 BSL_UNLOCK();
306
307 return;
308 }
309
310 void
311 bs_commit(
312 int nclusters)
313 {
314 BSL_LOCK();
315 clusters_committed += nclusters;
316 if (clusters_committed > clusters_available) {
317 if (verbose && !user_warned) {
318 user_warned = TRUE;
319 printf("%s%s - short of %d clusters.\n",
320 my_name,
321 "WARNING: paging space over-committed",
322 clusters_committed - clusters_available);
323 }
324 if (clusters_committed > clusters_committed_peak) {
325 clusters_committed_peak = clusters_committed;
326 }
327 } else {
328 if (verbose && user_warned) {
329 printf("%s%s - was short of up to %d clusters.\n",
330 my_name,
331 "paging space is OK now",
332 clusters_committed_peak - clusters_available);
333 user_warned = FALSE;
334 clusters_committed_peak = 0;
335 }
336 }
337 BSL_UNLOCK();
338
339 return;
340 }
341
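/*
 * Worked example (not part of the original file) of the accounting above:
 * with clusters_available = 100, committing three 40-cluster objects via
 * bs_commit() brings clusters_committed to 120 and triggers the
 * "over-committed ... short of 20 clusters" warning; a later
 * bs_more_space(30) raises clusters_available to 130 and reports
 * "paging space is OK now" with 10 excess clusters.
 */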
342 int default_pager_info_verbose = 1;
343
344 void
345 bs_global_info(
346 vm_size_t *totalp,
347 vm_size_t *freep)
348 {
349 vm_size_t pages_total, pages_free;
350 paging_segment_t ps;
351 int i;
352
353 PSL_LOCK();
354 pages_total = pages_free = 0;
355 for (i = 0; i <= paging_segment_max; i++) {
356 ps = paging_segments[i];
357 if (ps == PAGING_SEGMENT_NULL)
358 continue;
359
360 /*
361 * no need to lock: by the time this data
362 * gets back to any remote requestor it
363  * will be obsolete anyway
364 */
365 pages_total += ps->ps_pgnum;
366 pages_free += ps->ps_clcount << ps->ps_clshift;
367 DP_DEBUG(DEBUG_BS_INTERNAL,
368 ("segment #%d: %d total, %d free\n",
369 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
370 }
371 *totalp = pages_total;
372 *freep = pages_free;
373 if (verbose && user_warned && default_pager_info_verbose) {
374 if (clusters_available < clusters_committed) {
375 printf("%s %d clusters committed, %d available.\n",
376 my_name,
377 clusters_committed,
378 clusters_available);
379 }
380 }
381 PSL_UNLOCK();
382 }
383
384 backing_store_t backing_store_alloc(void); /* forward */
385
386 backing_store_t
387 backing_store_alloc(void)
388 {
389 backing_store_t bs;
390
391 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
392 if (bs == BACKING_STORE_NULL)
393 panic("backing_store_alloc: no memory");
394
395 BS_LOCK_INIT(bs);
396 bs->bs_port = MACH_PORT_NULL;
397 bs->bs_priority = 0;
398 bs->bs_clsize = 0;
399 bs->bs_pages_total = 0;
400 bs->bs_pages_in = 0;
401 bs->bs_pages_in_fail = 0;
402 bs->bs_pages_out = 0;
403 bs->bs_pages_out_fail = 0;
404
405 return bs;
406 }
407
408 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
409
410 /* In both the component-space and external versions of this pager, */
411 /* backing_store_lookup will be called from tasks in the application space. */
412 backing_store_t
413 backing_store_lookup(
414 MACH_PORT_FACE port)
415 {
416 backing_store_t bs;
417
418 /*
419 	port is currently backed with a vs structure in the alias field.
420 	We could create an ISBS alias and a port_is_bs call, but frankly
421 	I see no reason for the test; the bs->port == port check below
422 	will work properly on junk entries.
423
424 if ((port == MACH_PORT_NULL) || port_is_vs(port))
425 */
426 if ((port == MACH_PORT_NULL))
427 return BACKING_STORE_NULL;
428
429 BSL_LOCK();
430 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
431 bs_links) {
432 BS_LOCK(bs);
433 if (bs->bs_port == port) {
434 BSL_UNLOCK();
435 /* Success, return it locked. */
436 return bs;
437 }
438 BS_UNLOCK(bs);
439 }
440 BSL_UNLOCK();
441 return BACKING_STORE_NULL;
442 }
443
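/*
 * Illustrative sketch (not part of the original file): a successful
 * backing_store_lookup() returns the backing store locked, so callers
 * must BS_UNLOCK() it when done, as default_pager_backing_store_info()
 * and default_pager_backing_store_delete() do below.
 */
#if 0
static void
example_backing_store_lookup_usage(MACH_PORT_FACE port)
{
	backing_store_t bs;

	if ((bs = backing_store_lookup(port)) == BACKING_STORE_NULL)
		return;			/* not a known backing store port */
	/* ... read or update fields of *bs ... */
	BS_UNLOCK(bs);
}
#endif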
444 void backing_store_add(backing_store_t); /* forward */
445
446 void
447 backing_store_add(
448 __unused backing_store_t bs)
449 {
450 // MACH_PORT_FACE port = bs->bs_port;
451 // MACH_PORT_FACE pset = default_pager_default_set;
452 kern_return_t kr = KERN_SUCCESS;
453
454 if (kr != KERN_SUCCESS)
455 panic("backing_store_add: add to set");
456
457 }
458
459 /*
460 * Set up default page shift, but only if not already
461 * set and argument is within range.
462 */
463 boolean_t
464 bs_set_default_clsize(unsigned int npages)
465 {
466 switch(npages){
467 case 1:
468 case 2:
469 case 4:
470 case 8:
471 if (default_pager_clsize == 0) /* if not yet set */
472 vstruct_def_clshift = local_log2(npages);
473 return(TRUE);
474 }
475 return(FALSE);
476 }
477
478 int bs_get_global_clsize(int clsize); /* forward */
479
480 int
481 bs_get_global_clsize(
482 int clsize)
483 {
484 int i;
485 memory_object_default_t dmm;
486 kern_return_t kr;
487
488 /*
489 * Only allow setting of cluster size once. If called
490 * with no cluster size (default), we use the compiled-in default
491 * for the duration. The same cluster size is used for all
492 * paging segments.
493 */
494 if (default_pager_clsize == 0) {
495 /*
496 		 * Keep the cluster size as a bit shift: the arithmetic is
497 		 * quicker and it is easier to keep at a power of 2.
498 */
499 if (clsize != NO_CLSIZE) {
500 for (i = 0; (1 << i) < clsize; i++);
501 if (i > MAX_CLUSTER_SHIFT)
502 i = MAX_CLUSTER_SHIFT;
503 vstruct_def_clshift = i;
504 }
505 default_pager_clsize = (1 << vstruct_def_clshift);
506
507 /*
508 * Let the user know the new (and definitive) cluster size.
509 */
510 if (verbose)
511 printf("%scluster size = %d page%s\n",
512 my_name, default_pager_clsize,
513 (default_pager_clsize == 1) ? "" : "s");
514
515 /*
516 * Let the kernel know too, in case it hasn't used the
517 * default value provided in main() yet.
518 */
519 dmm = default_pager_object;
520 clsize = default_pager_clsize * vm_page_size; /* in bytes */
521 kr = host_default_memory_manager(host_priv_self(),
522 &dmm,
523 clsize);
524 memory_object_default_deallocate(dmm);
525
526 if (kr != KERN_SUCCESS) {
527 panic("bs_get_global_cl_size:host_default_memory_manager");
528 }
529 if (dmm != default_pager_object) {
530 panic("bs_get_global_cl_size:there is another default pager");
531 }
532 }
533 ASSERT(default_pager_clsize > 0 &&
534 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
535
536 return default_pager_clsize;
537 }
538
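/*
 * Illustrative sketch (not part of the original file) of the rounding
 * performed in bs_get_global_clsize() above: a requested cluster size of
 * 3 pages yields a shift of 2 (4 pages/cluster), as does a request of 4;
 * anything larger than 1 << MAX_CLUSTER_SHIFT is capped.
 */
#if 0
static int
example_round_clsize_to_shift(int clsize)
{
	int i;

	for (i = 0; (1 << i) < clsize; i++)
		continue;
	if (i > MAX_CLUSTER_SHIFT)
		i = MAX_CLUSTER_SHIFT;
	return i;			/* pages per cluster == 1 << i */
}
#endif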
539 kern_return_t
540 default_pager_backing_store_create(
541 memory_object_default_t pager,
542 int priority,
543 int clsize, /* in bytes */
544 MACH_PORT_FACE *backing_store)
545 {
546 backing_store_t bs;
547 MACH_PORT_FACE port;
548 // kern_return_t kr;
549 struct vstruct_alias *alias_struct;
550
551 if (pager != default_pager_object)
552 return KERN_INVALID_ARGUMENT;
553
554 bs = backing_store_alloc();
555 port = ipc_port_alloc_kernel();
556 ipc_port_make_send(port);
557 assert (port != IP_NULL);
558
559 DP_DEBUG(DEBUG_BS_EXTERNAL,
560 ("priority=%d clsize=%d bs_port=0x%x\n",
561 priority, clsize, (int) backing_store));
562
563 alias_struct = (struct vstruct_alias *)
564 kalloc(sizeof (struct vstruct_alias));
565 if(alias_struct != NULL) {
566 alias_struct->vs = (struct vstruct *)bs;
567 alias_struct->name = ISVS;
568 port->alias = (int) alias_struct;
569 }
570 else {
571 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
572 kfree(bs, sizeof (struct backing_store));
573 return KERN_RESOURCE_SHORTAGE;
574 }
575
576 bs->bs_port = port;
577 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
578 priority = BS_MAXPRI;
579 else if (priority == BS_NOPRI)
580 priority = BS_MAXPRI;
581 else
582 priority = BS_MINPRI;
583 bs->bs_priority = priority;
584
585 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
586
587 BSL_LOCK();
588 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
589 bs_links);
590 BSL_UNLOCK();
591
592 backing_store_add(bs);
593
594 *backing_store = port;
595 return KERN_SUCCESS;
596 }
597
598 kern_return_t
599 default_pager_backing_store_info(
600 MACH_PORT_FACE backing_store,
601 backing_store_flavor_t flavour,
602 backing_store_info_t info,
603 mach_msg_type_number_t *size)
604 {
605 backing_store_t bs;
606 backing_store_basic_info_t basic;
607 int i;
608 paging_segment_t ps;
609
610 if (flavour != BACKING_STORE_BASIC_INFO ||
611 *size < BACKING_STORE_BASIC_INFO_COUNT)
612 return KERN_INVALID_ARGUMENT;
613
614 basic = (backing_store_basic_info_t)info;
615 *size = BACKING_STORE_BASIC_INFO_COUNT;
616
617 VSTATS_LOCK(&global_stats.gs_lock);
618 basic->pageout_calls = global_stats.gs_pageout_calls;
619 basic->pagein_calls = global_stats.gs_pagein_calls;
620 basic->pages_in = global_stats.gs_pages_in;
621 basic->pages_out = global_stats.gs_pages_out;
622 basic->pages_unavail = global_stats.gs_pages_unavail;
623 basic->pages_init = global_stats.gs_pages_init;
624 basic->pages_init_writes= global_stats.gs_pages_init_writes;
625 VSTATS_UNLOCK(&global_stats.gs_lock);
626
627 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
628 return KERN_INVALID_ARGUMENT;
629
630 basic->bs_pages_total = bs->bs_pages_total;
631 PSL_LOCK();
632 bs->bs_pages_free = 0;
633 for (i = 0; i <= paging_segment_max; i++) {
634 ps = paging_segments[i];
635 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
636 PS_LOCK(ps);
637 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
638 PS_UNLOCK(ps);
639 }
640 }
641 PSL_UNLOCK();
642 basic->bs_pages_free = bs->bs_pages_free;
643 basic->bs_pages_in = bs->bs_pages_in;
644 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
645 basic->bs_pages_out = bs->bs_pages_out;
646 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
647
648 basic->bs_priority = bs->bs_priority;
649 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
650
651 BS_UNLOCK(bs);
652
653 return KERN_SUCCESS;
654 }
655
656 int ps_delete(paging_segment_t); /* forward */
657
658 int
659 ps_delete(
660 paging_segment_t ps)
661 {
662 vstruct_t vs;
663 kern_return_t error = KERN_SUCCESS;
664 int vs_count;
665
666 VSL_LOCK(); /* get the lock on the list of vs's */
667
668 	/* The locking relationship and sequence are fairly complicated. */
669 /* this code looks at a live list, locking and unlocking the list */
670 /* as it traverses it. It depends on the locking behavior of */
671 /* default_pager_no_senders. no_senders always locks the vstruct */
672 /* targeted for removal before locking the vstruct list. However */
673 /* it will remove that member of the list without locking its */
674 /* neighbors. We can be sure when we hold a lock on a vstruct */
675 /* it cannot be removed from the list but we must hold the list */
676 /* lock to be sure that its pointers to its neighbors are valid. */
677 /* Also, we can hold off destruction of a vstruct when the list */
678 /* lock and the vs locks are not being held by bumping the */
679 /* vs_async_pending count. */
680
681
682 while(backing_store_release_trigger_disable != 0) {
683 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
684 }
685
686 /* we will choose instead to hold a send right */
687 vs_count = vstruct_list.vsl_count;
688 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
689 if(vs == (vstruct_t)&vstruct_list) {
690 VSL_UNLOCK();
691 return KERN_SUCCESS;
692 }
693 VS_LOCK(vs);
694 vs_async_wait(vs); /* wait for any pending async writes */
695 if ((vs_count != 0) && (vs != NULL))
696 vs->vs_async_pending += 1; /* hold parties calling */
697 /* vs_async_wait */
698 VS_UNLOCK(vs);
699 VSL_UNLOCK();
700 while((vs_count != 0) && (vs != NULL)) {
701 /* We take the count of AMO's before beginning the */
702 		/* transfer of the target segment. */
703 /* We are guaranteed that the target segment cannot get */
704 /* more users. We also know that queue entries are */
705 /* made at the back of the list. If some of the entries */
706 /* we would check disappear while we are traversing the */
707 /* list then we will either check new entries which */
708 /* do not have any backing store in the target segment */
709 /* or re-check old entries. This might not be optimal */
710 /* but it will always be correct. The alternative is to */
711 /* take a snapshot of the list. */
712 vstruct_t next_vs;
713
714 if(dp_pages_free < cluster_transfer_minimum)
715 error = KERN_FAILURE;
716 else {
717 vm_object_t transfer_object;
718 int count;
719 upl_t upl;
720
721 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
722 count = 0;
723 error = vm_object_upl_request(transfer_object,
724 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
725 &upl, NULL, &count,
726 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
727 | UPL_SET_INTERNAL);
728 if(error == KERN_SUCCESS) {
729 error = ps_vstruct_transfer_from_segment(
730 vs, ps, upl);
731 upl_commit(upl, NULL, 0);
732 upl_deallocate(upl);
733 } else {
734 error = KERN_FAILURE;
735 }
736 vm_object_deallocate(transfer_object);
737 }
738 if(error) {
739 VS_LOCK(vs);
740 vs->vs_async_pending -= 1; /* release vs_async_wait */
741 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
742 vs->vs_waiting_async = FALSE;
743 VS_UNLOCK(vs);
744 thread_wakeup(&vs->vs_async_pending);
745 } else {
746 VS_UNLOCK(vs);
747 }
748 return KERN_FAILURE;
749 }
750
751 VSL_LOCK();
752
753 while(backing_store_release_trigger_disable != 0) {
754 VSL_SLEEP(&backing_store_release_trigger_disable,
755 THREAD_UNINT);
756 }
757
758 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
759 if((next_vs != (vstruct_t)&vstruct_list) &&
760 (vs != next_vs) && (vs_count != 1)) {
761 VS_LOCK(next_vs);
762 vs_async_wait(next_vs); /* wait for any */
763 /* pending async writes */
764 next_vs->vs_async_pending += 1; /* hold parties */
765 /* calling vs_async_wait */
766 VS_UNLOCK(next_vs);
767 }
768 VSL_UNLOCK();
769 VS_LOCK(vs);
770 vs->vs_async_pending -= 1;
771 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
772 vs->vs_waiting_async = FALSE;
773 VS_UNLOCK(vs);
774 thread_wakeup(&vs->vs_async_pending);
775 } else {
776 VS_UNLOCK(vs);
777 }
778 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
779 vs = NULL;
780 else
781 vs = next_vs;
782 vs_count--;
783 }
784 return KERN_SUCCESS;
785 }
786
787
788 kern_return_t
789 default_pager_backing_store_delete(
790 MACH_PORT_FACE backing_store)
791 {
792 backing_store_t bs;
793 int i;
794 paging_segment_t ps;
795 int error;
796 int interim_pages_removed = 0;
797 // kern_return_t kr;
798
799 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
800 return KERN_INVALID_ARGUMENT;
801
802 #if 0
803 /* not implemented */
804 BS_UNLOCK(bs);
805 return KERN_FAILURE;
806 #endif
807
808 restart:
809 PSL_LOCK();
810 error = KERN_SUCCESS;
811 for (i = 0; i <= paging_segment_max; i++) {
812 ps = paging_segments[i];
813 if (ps != PAGING_SEGMENT_NULL &&
814 ps->ps_bs == bs &&
815 ! ps->ps_going_away) {
816 PS_LOCK(ps);
817 /* disable access to this segment */
818 ps->ps_going_away = TRUE;
819 PS_UNLOCK(ps);
820 /*
821 * The "ps" segment is "off-line" now,
822 * we can try and delete it...
823 */
824 if(dp_pages_free < (cluster_transfer_minimum
825 + ps->ps_pgcount)) {
826 error = KERN_FAILURE;
827 PSL_UNLOCK();
828 }
829 else {
830 /* remove all pages associated with the */
831 /* segment from the list of free pages */
832 /* when transfer is through, all target */
833 /* segment pages will appear to be free */
834
835 dp_pages_free -= ps->ps_pgcount;
836 interim_pages_removed += ps->ps_pgcount;
837 PSL_UNLOCK();
838 error = ps_delete(ps);
839 }
840 if (error != KERN_SUCCESS) {
841 /*
842 * We couldn't delete the segment,
843 * probably because there's not enough
844 * virtual memory left.
845 * Re-enable all the segments.
846 */
847 PSL_LOCK();
848 break;
849 }
850 goto restart;
851 }
852 }
853
854 if (error != KERN_SUCCESS) {
855 for (i = 0; i <= paging_segment_max; i++) {
856 ps = paging_segments[i];
857 if (ps != PAGING_SEGMENT_NULL &&
858 ps->ps_bs == bs &&
859 ps->ps_going_away) {
860 PS_LOCK(ps);
861 /* re-enable access to this segment */
862 ps->ps_going_away = FALSE;
863 PS_UNLOCK(ps);
864 }
865 }
866 dp_pages_free += interim_pages_removed;
867 PSL_UNLOCK();
868 BS_UNLOCK(bs);
869 return error;
870 }
871
872 for (i = 0; i <= paging_segment_max; i++) {
873 ps = paging_segments[i];
874 if (ps != PAGING_SEGMENT_NULL &&
875 ps->ps_bs == bs) {
876 if(ps->ps_going_away) {
877 paging_segments[i] = PAGING_SEGMENT_NULL;
878 paging_segment_count--;
879 PS_LOCK(ps);
880 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
881 kfree(ps, sizeof *ps);
882 }
883 }
884 }
885
886 /* Scan the entire ps array separately to make certain we find the */
887 /* proper paging_segment_max */
888 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
889 if(paging_segments[i] != PAGING_SEGMENT_NULL)
890 paging_segment_max = i;
891 }
892
893 PSL_UNLOCK();
894
895 /*
896 * All the segments have been deleted.
897 * We can remove the backing store.
898 */
899
900 /*
901 * Disable lookups of this backing store.
902 */
903 if((void *)bs->bs_port->alias != NULL)
904 kfree((void *) bs->bs_port->alias,
905 sizeof (struct vstruct_alias));
906 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
907 bs->bs_port = MACH_PORT_NULL;
908 BS_UNLOCK(bs);
909
910 /*
911 * Remove backing store from backing_store list.
912 */
913 BSL_LOCK();
914 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
915 bs_links);
916 BSL_UNLOCK();
917
918 /*
919 * Free the backing store structure.
920 */
921 kfree(bs, sizeof *bs);
922
923 return KERN_SUCCESS;
924 }
925
926 int ps_enter(paging_segment_t); /* forward */
927
928 int
929 ps_enter(
930 paging_segment_t ps)
931 {
932 int i;
933
934 PSL_LOCK();
935
936 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
937 if (paging_segments[i] == PAGING_SEGMENT_NULL)
938 break;
939 }
940
941 if (i < MAX_NUM_PAGING_SEGMENTS) {
942 paging_segments[i] = ps;
943 if (i > paging_segment_max)
944 paging_segment_max = i;
945 paging_segment_count++;
946 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
947 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
948 ps_select_array[ps->ps_bs->bs_priority] = 0;
949 i = 0;
950 } else {
951 PSL_UNLOCK();
952 return KERN_RESOURCE_SHORTAGE;
953 }
954
955 PSL_UNLOCK();
956 return i;
957 }
958
959 #ifdef DEVICE_PAGING
960 kern_return_t
961 default_pager_add_segment(
962 MACH_PORT_FACE backing_store,
963 MACH_PORT_FACE device,
964 recnum_t offset,
965 recnum_t count,
966 int record_size)
967 {
968 backing_store_t bs;
969 paging_segment_t ps;
970 int i;
971 int error;
972
973 if ((bs = backing_store_lookup(backing_store))
974 == BACKING_STORE_NULL)
975 return KERN_INVALID_ARGUMENT;
976
977 PSL_LOCK();
978 for (i = 0; i <= paging_segment_max; i++) {
979 ps = paging_segments[i];
980 if (ps == PAGING_SEGMENT_NULL)
981 continue;
982
983 /*
984 * Check for overlap on same device.
985 */
986 if (!(ps->ps_device != device
987 || offset >= ps->ps_offset + ps->ps_recnum
988 || offset + count <= ps->ps_offset)) {
989 PSL_UNLOCK();
990 BS_UNLOCK(bs);
991 return KERN_INVALID_ARGUMENT;
992 }
993 }
994 PSL_UNLOCK();
995
996 /*
997 * Set up the paging segment
998 */
999 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1000 if (ps == PAGING_SEGMENT_NULL) {
1001 BS_UNLOCK(bs);
1002 return KERN_RESOURCE_SHORTAGE;
1003 }
1004
1005 ps->ps_segtype = PS_PARTITION;
1006 ps->ps_device = device;
1007 ps->ps_offset = offset;
1008 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1009 ps->ps_recnum = count;
1010 ps->ps_pgnum = count >> ps->ps_record_shift;
1011
1012 ps->ps_pgcount = ps->ps_pgnum;
1013 ps->ps_clshift = local_log2(bs->bs_clsize);
1014 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1015 ps->ps_hint = 0;
1016
1017 PS_LOCK_INIT(ps);
1018 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1019 if (!ps->ps_bmap) {
1020 kfree(ps, sizeof *ps);
1021 BS_UNLOCK(bs);
1022 return KERN_RESOURCE_SHORTAGE;
1023 }
1024 for (i = 0; i < ps->ps_ncls; i++) {
1025 clrbit(ps->ps_bmap, i);
1026 }
1027
1028 ps->ps_going_away = FALSE;
1029 ps->ps_bs = bs;
1030
1031 if ((error = ps_enter(ps)) != 0) {
1032 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1033 kfree(ps, sizeof *ps);
1034 BS_UNLOCK(bs);
1035 return KERN_RESOURCE_SHORTAGE;
1036 }
1037
1038 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1039 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1040 BS_UNLOCK(bs);
1041
1042 PSL_LOCK();
1043 dp_pages_free += ps->ps_pgcount;
1044 PSL_UNLOCK();
1045
1046 bs_more_space(ps->ps_clcount);
1047
1048 DP_DEBUG(DEBUG_BS_INTERNAL,
1049 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1050 device, offset, count, record_size,
1051 ps->ps_record_shift, ps->ps_pgnum));
1052
1053 return KERN_SUCCESS;
1054 }
1055
1056 boolean_t
1057 bs_add_device(
1058 char *dev_name,
1059 MACH_PORT_FACE master)
1060 {
1061 security_token_t null_security_token = {
1062 { 0, 0 }
1063 };
1064 MACH_PORT_FACE device;
1065 int info[DEV_GET_SIZE_COUNT];
1066 mach_msg_type_number_t info_count;
1067 MACH_PORT_FACE bs = MACH_PORT_NULL;
1068 unsigned int rec_size;
1069 recnum_t count;
1070 int clsize;
1071 MACH_PORT_FACE reply_port;
1072
1073 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1074 null_security_token, dev_name, &device))
1075 return FALSE;
1076
1077 info_count = DEV_GET_SIZE_COUNT;
1078 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1079 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1080 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1081 clsize = bs_get_global_clsize(0);
1082 if (!default_pager_backing_store_create(
1083 default_pager_object,
1084 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1085 (clsize * vm_page_size),
1086 &bs)) {
1087 if (!default_pager_add_segment(bs, device,
1088 0, count, rec_size)) {
1089 return TRUE;
1090 }
1091 ipc_port_release_receive(bs);
1092 }
1093 }
1094
1095 ipc_port_release_send(device);
1096 return FALSE;
1097 }
1098 #endif /* DEVICE_PAGING */
1099
1100 #if VS_ASYNC_REUSE
1101
1102 struct vs_async *
1103 vs_alloc_async(void)
1104 {
1105 struct vs_async *vsa;
1106 MACH_PORT_FACE reply_port;
1107 // kern_return_t kr;
1108
1109 VS_ASYNC_LOCK();
1110 if (vs_async_free_list == NULL) {
1111 VS_ASYNC_UNLOCK();
1112 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1113 if (vsa != NULL) {
1114 /*
1115 * Try allocating a reply port named after the
1116 * address of the vs_async structure.
1117 */
1118 struct vstruct_alias *alias_struct;
1119
1120 reply_port = ipc_port_alloc_kernel();
1121 alias_struct = (struct vstruct_alias *)
1122 kalloc(sizeof (struct vstruct_alias));
1123 if(alias_struct != NULL) {
1124 alias_struct->vs = (struct vstruct *)vsa;
1125 alias_struct->name = ISVS;
1126 reply_port->alias = (int) alias_struct;
1127 vsa->reply_port = reply_port;
1128 vs_alloc_async_count++;
1129 }
1130 else {
1131 vs_alloc_async_failed++;
1132 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1133 (reply_port));
1134 kfree(vsa, sizeof (struct vs_async));
1135 vsa = NULL;
1136 }
1137 }
1138 } else {
1139 vsa = vs_async_free_list;
1140 vs_async_free_list = vs_async_free_list->vsa_next;
1141 VS_ASYNC_UNLOCK();
1142 }
1143
1144 return vsa;
1145 }
1146
1147 void
1148 vs_free_async(
1149 struct vs_async *vsa)
1150 {
1151 VS_ASYNC_LOCK();
1152 vsa->vsa_next = vs_async_free_list;
1153 vs_async_free_list = vsa;
1154 VS_ASYNC_UNLOCK();
1155 }
1156
1157 #else /* VS_ASYNC_REUSE */
1158
1159 struct vs_async *
1160 vs_alloc_async(void)
1161 {
1162 struct vs_async *vsa;
1163 MACH_PORT_FACE reply_port;
1164 	kern_return_t kr;
	struct vstruct_alias *alias_struct;
1165
1166 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1167 if (vsa != NULL) {
1168 /*
1169 * Try allocating a reply port named after the
1170 * address of the vs_async structure.
1171 */
1172 reply_port = ipc_port_alloc_kernel();
1173 		alias_struct = (struct vstruct_alias *)
1174 			kalloc(sizeof (struct vstruct_alias));
1175 		if(alias_struct != NULL) {
1176 			alias_struct->vs = (struct vstruct *)vsa;
1177 			alias_struct->name = ISVS;
1178 			reply_port->alias = (int) alias_struct;
1179 vsa->reply_port = reply_port;
1180 vs_alloc_async_count++;
1181 }
1182 else {
1183 vs_alloc_async_failed++;
1184 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1185 (reply_port));
1186 kfree(vsa, sizeof (struct vs_async));
1187 vsa = NULL;
1188 }
1189 }
1190
1191 return vsa;
1192 }
1193
1194 void
1195 vs_free_async(
1196 struct vs_async *vsa)
1197 {
1198 MACH_PORT_FACE reply_port;
1199 kern_return_t kr;
1200
1201 reply_port = vsa->reply_port;
1202 	kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1203 kfree(vsa, sizeof (struct vs_async));
1204 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1205 #if 0
1206 VS_ASYNC_LOCK();
1207 vs_alloc_async_count--;
1208 VS_ASYNC_UNLOCK();
1209 #endif
1210 }
1211
1212 #endif /* VS_ASYNC_REUSE */
1213
1214 zone_t vstruct_zone;
1215
1216 vstruct_t
1217 ps_vstruct_create(
1218 vm_size_t size)
1219 {
1220 vstruct_t vs;
1221 unsigned int i;
1222
1223 vs = (vstruct_t) zalloc(vstruct_zone);
1224 if (vs == VSTRUCT_NULL) {
1225 return VSTRUCT_NULL;
1226 }
1227
1228 VS_LOCK_INIT(vs);
1229
1230 /*
1231 * The following fields will be provided later.
1232 */
1233 vs->vs_mem_obj = NULL;
1234 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1235 vs->vs_references = 1;
1236 vs->vs_seqno = 0;
1237
1238 #ifdef MACH_KERNEL
1239 vs->vs_waiting_seqno = FALSE;
1240 vs->vs_waiting_read = FALSE;
1241 vs->vs_waiting_write = FALSE;
1242 vs->vs_waiting_async = FALSE;
1243 #else
1244 mutex_init(&vs->vs_waiting_seqno, 0);
1245 mutex_init(&vs->vs_waiting_read, 0);
1246 mutex_init(&vs->vs_waiting_write, 0);
1247 mutex_init(&vs->vs_waiting_refs, 0);
1248 mutex_init(&vs->vs_waiting_async, 0);
1249 #endif
1250
1251 vs->vs_readers = 0;
1252 vs->vs_writers = 0;
1253
1254 vs->vs_errors = 0;
1255
1256 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1257 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1258 vs->vs_async_pending = 0;
1259
1260 /*
1261 	 * Allocate the cluster map, of either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1262 * depending on the size of the memory object.
1263 */
1264 if (INDIRECT_CLMAP(vs->vs_size)) {
1265 vs->vs_imap = (struct vs_map **)
1266 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1267 vs->vs_indirect = TRUE;
1268 } else {
1269 vs->vs_dmap = (struct vs_map *)
1270 kalloc(CLMAP_SIZE(vs->vs_size));
1271 vs->vs_indirect = FALSE;
1272 }
1273 vs->vs_xfer_pending = FALSE;
1274 DP_DEBUG(DEBUG_VS_INTERNAL,
1275 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1276
1277 /*
1278 * Check to see that we got the space.
1279 */
1280 if (!vs->vs_dmap) {
1281 kfree(vs, sizeof *vs);
1282 return VSTRUCT_NULL;
1283 }
1284
1285 /*
1286 * Zero the indirect pointers, or clear the direct pointers.
1287 */
1288 if (vs->vs_indirect)
1289 memset(vs->vs_imap, 0,
1290 INDIRECT_CLMAP_SIZE(vs->vs_size));
1291 else
1292 for (i = 0; i < vs->vs_size; i++)
1293 VSM_CLR(vs->vs_dmap[i]);
1294
1295 VS_MAP_LOCK_INIT(vs);
1296
1297 bs_commit(vs->vs_size);
1298
1299 return vs;
1300 }
1301
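/*
 * Illustrative sketch (not part of the original file): the vs_size
 * computation in ps_vstruct_create() above is the object size in
 * clusters, rounded up.  For a 72KB object with 4KB pages and a cluster
 * shift of 2 (16KB clusters): 18 pages, ((18 - 1) >> 2) + 1 = 5 clusters.
 */
#if 0
static vm_size_t
example_object_size_in_clusters(vm_size_t size, int clshift)
{
	return ((atop_32(round_page_32(size)) - 1) >> clshift) + 1;
}
#endif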
1302 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1303
1304 paging_segment_t
1305 ps_select_segment(
1306 unsigned int shift,
1307 int *psindex)
1308 {
1309 paging_segment_t ps;
1310 int i;
1311 int j;
1312
1313 /*
1314 * Optimize case where there's only one segment.
1315 * paging_segment_max will index the one and only segment.
1316 */
1317
1318 PSL_LOCK();
1319 if (paging_segment_count == 1) {
1320 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1321 ipc_port_t trigger = IP_NULL;
1322
1323 ps = paging_segments[paging_segment_max];
1324 *psindex = paging_segment_max;
1325 PS_LOCK(ps);
1326 if (ps->ps_going_away) {
1327 /* this segment is being turned off */
1328 lps = PAGING_SEGMENT_NULL;
1329 } else {
1330 ASSERT(ps->ps_clshift >= shift);
1331 if (ps->ps_clcount) {
1332 ps->ps_clcount--;
1333 dp_pages_free -= 1 << ps->ps_clshift;
1334 if(min_pages_trigger_port &&
1335 (dp_pages_free < minimum_pages_remaining)) {
1336 trigger = min_pages_trigger_port;
1337 min_pages_trigger_port = NULL;
1338 bs_low = TRUE;
1339 }
1340 lps = ps;
1341 } else
1342 lps = PAGING_SEGMENT_NULL;
1343 }
1344 PS_UNLOCK(ps);
1345 PSL_UNLOCK();
1346
1347 if (trigger != IP_NULL) {
1348 default_pager_space_alert(trigger, HI_WAT_ALERT);
1349 ipc_port_release_send(trigger);
1350 }
1351 return lps;
1352 }
1353
1354 if (paging_segment_count == 0) {
1355 PSL_UNLOCK();
1356 return PAGING_SEGMENT_NULL;
1357 }
1358
1359 for (i = BS_MAXPRI;
1360 i >= BS_MINPRI; i--) {
1361 int start_index;
1362
1363 if ((ps_select_array[i] == BS_NOPRI) ||
1364 (ps_select_array[i] == BS_FULLPRI))
1365 continue;
1366 start_index = ps_select_array[i];
1367
1368 if(!(paging_segments[start_index])) {
1369 j = start_index+1;
1370 physical_transfer_cluster_count = 0;
1371 }
1372 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1373 (((paging_segments[start_index])->ps_clshift)
1374 + vm_page_shift))) {
1375 physical_transfer_cluster_count = 0;
1376 j = start_index + 1;
1377 } else {
1378 physical_transfer_cluster_count+=1;
1379 j = start_index;
1380 if(start_index == 0)
1381 start_index = paging_segment_max;
1382 else
1383 start_index = start_index - 1;
1384 }
1385
1386 while (1) {
1387 if (j > paging_segment_max)
1388 j = 0;
1389 if ((ps = paging_segments[j]) &&
1390 (ps->ps_bs->bs_priority == i)) {
1391 /*
1392 * Force the ps cluster size to be
1393 * >= that of the vstruct.
1394 */
1395 PS_LOCK(ps);
1396 if (ps->ps_going_away) {
1397 /* this segment is being turned off */
1398 } else if ((ps->ps_clcount) &&
1399 (ps->ps_clshift >= shift)) {
1400 ipc_port_t trigger = IP_NULL;
1401
1402 ps->ps_clcount--;
1403 dp_pages_free -= 1 << ps->ps_clshift;
1404 if(min_pages_trigger_port &&
1405 (dp_pages_free <
1406 minimum_pages_remaining)) {
1407 trigger = min_pages_trigger_port;
1408 min_pages_trigger_port = NULL;
1409 }
1410 PS_UNLOCK(ps);
1411 /*
1412 * found one, quit looking.
1413 */
1414 ps_select_array[i] = j;
1415 PSL_UNLOCK();
1416
1417 if (trigger != IP_NULL) {
1418 default_pager_space_alert(
1419 trigger,
1420 HI_WAT_ALERT);
1421 ipc_port_release_send(trigger);
1422 }
1423 *psindex = j;
1424 return ps;
1425 }
1426 PS_UNLOCK(ps);
1427 }
1428 if (j == start_index) {
1429 /*
1430 * none at this priority -- mark it full
1431 */
1432 ps_select_array[i] = BS_FULLPRI;
1433 break;
1434 }
1435 j++;
1436 }
1437 }
1438 PSL_UNLOCK();
1439 return PAGING_SEGMENT_NULL;
1440 }
1441
1442 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1443
1444 vm_offset_t
1445 ps_allocate_cluster(
1446 vstruct_t vs,
1447 int *psindex,
1448 paging_segment_t use_ps)
1449 {
1450 unsigned int byte_num;
1451 int bit_num = 0;
1452 paging_segment_t ps;
1453 vm_offset_t cluster;
1454 ipc_port_t trigger = IP_NULL;
1455
1456 /*
1457 * Find best paging segment.
1458 * ps_select_segment will decrement cluster count on ps.
1459 * Must pass cluster shift to find the most appropriate segment.
1460 */
1461 /* NOTE: The addition of paging segment delete capability threatened
1462 * to seriously complicate the treatment of paging segments in this
1463 * module and the ones that call it (notably ps_clmap), because of the
1464 * difficulty in assuring that the paging segment would continue to
1465 * exist between being unlocked and locked. This was
1466 	 * avoided because all calls to this module are based either on
1467 	 * dp_memory_object calls, which rely on the vs lock, or on
1468 	 * the transfer function, which is part of the segment delete path.
1469 * The transfer function which is part of paging segment delete is
1470 * protected from multiple callers by the backing store lock.
1471 * The paging segment delete function treats mappings to a paging
1472 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1473 * while data is transferred to the remaining segments. This is in
1474 * line with the view that incomplete or in-transition mappings between
1475 * data, a vstruct, and backing store are protected by the vs lock.
1476 * This and the ordering of the paging segment "going_away" bit setting
1477 * protects us.
1478 */
1479 if (use_ps != PAGING_SEGMENT_NULL) {
1480 ps = use_ps;
1481 PSL_LOCK();
1482 PS_LOCK(ps);
1483
1484 ASSERT(ps->ps_clcount != 0);
1485
1486 ps->ps_clcount--;
1487 dp_pages_free -= 1 << ps->ps_clshift;
1488 if(min_pages_trigger_port &&
1489 (dp_pages_free < minimum_pages_remaining)) {
1490 trigger = min_pages_trigger_port;
1491 min_pages_trigger_port = NULL;
1492 }
1493 PSL_UNLOCK();
1494 PS_UNLOCK(ps);
1495 if (trigger != IP_NULL) {
1496 default_pager_space_alert(trigger, HI_WAT_ALERT);
1497 ipc_port_release_send(trigger);
1498 }
1499
1500 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1501 PAGING_SEGMENT_NULL) {
1502 static uint32_t lastnotify = 0;
1503 uint32_t now, nanoseconds_dummy;
1504
1505 /*
1506 * Emit a notification of the low-paging resource condition
1507 * but don't issue it more than once every five seconds. This
1508 * prevents us from overflowing logs with thousands of
1509 * repetitions of the message.
1510 */
1511 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1512 if (now > lastnotify + 5) {
1513 dprintf(("no space in available paging segments\n"));
1514 lastnotify = now;
1515 }
1516
1517 	/* the count may have drifted; reset it to zero */
1518 PSL_LOCK();
1519 dp_pages_free = 0;
1520 if(min_pages_trigger_port) {
1521 trigger = min_pages_trigger_port;
1522 min_pages_trigger_port = NULL;
1523 bs_low = TRUE;
1524 }
1525 PSL_UNLOCK();
1526 if (trigger != IP_NULL) {
1527 default_pager_space_alert(trigger, HI_WAT_ALERT);
1528 ipc_port_release_send(trigger);
1529 }
1530 return (vm_offset_t) -1;
1531 }
1532
1533 /*
1534 * Look for an available cluster. At the end of the loop,
1535 * byte_num is the byte offset and bit_num is the bit offset of the
1536 * first zero bit in the paging segment bitmap.
1537 */
1538 PS_LOCK(ps);
1539 byte_num = ps->ps_hint;
1540 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1541 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1542 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1543 if (isclr((ps->ps_bmap + byte_num), bit_num))
1544 break;
1545 }
1546 ASSERT(bit_num != NBBY);
1547 break;
1548 }
1549 }
1550 ps->ps_hint = byte_num;
1551 cluster = (byte_num*NBBY) + bit_num;
1552
1553 /* Space was reserved, so this must be true */
1554 ASSERT(cluster < ps->ps_ncls);
1555
1556 setbit(ps->ps_bmap, cluster);
1557 PS_UNLOCK(ps);
1558
1559 return cluster;
1560 }
1561
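/*
 * Worked example (not part of the original file) of the bitmap scan in
 * ps_allocate_cluster() above: if the first byte with a clear bit is
 * byte_num = 3 and the first clear bit within it is bit_num = 5, the
 * allocated cluster number is byte_num * NBBY + bit_num = 3 * 8 + 5 = 29.
 */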
1562 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1563
1564 void
1565 ps_deallocate_cluster(
1566 paging_segment_t ps,
1567 vm_offset_t cluster)
1568 {
1569
1570 if (cluster >= (vm_offset_t) ps->ps_ncls)
1571 panic("ps_deallocate_cluster: Invalid cluster number");
1572
1573 /*
1574 	 * Lock the paging segment, clear the cluster's bit in the bitmap and
1575 	 * increment the number of free clusters.
1576 */
1577 PSL_LOCK();
1578 PS_LOCK(ps);
1579 clrbit(ps->ps_bmap, cluster);
1580 ++ps->ps_clcount;
1581 dp_pages_free += 1 << ps->ps_clshift;
1582 PSL_UNLOCK();
1583
1584 /*
1585 * Move the hint down to the freed cluster if it is
1586 * less than the current hint.
1587 */
1588 if ((cluster/NBBY) < ps->ps_hint) {
1589 ps->ps_hint = (cluster/NBBY);
1590 }
1591
1592 PS_UNLOCK(ps);
1593
1594 /*
1595 	 * If we're freeing space at a priority marked full, reset its entry in the select array.
1596 */
1597 PSL_LOCK();
1598 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1599 ps_select_array[ps->ps_bs->bs_priority] = 0;
1600 PSL_UNLOCK();
1601
1602 return;
1603 }
1604
1605 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1606
1607 void
1608 ps_dealloc_vsmap(
1609 struct vs_map *vsmap,
1610 vm_size_t size)
1611 {
1612 unsigned int i;
1613 for (i = 0; i < size; i++)
1614 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1615 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1616 VSM_CLOFF(vsmap[i]));
1617 }
1618
1619 void
1620 ps_vstruct_dealloc(
1621 vstruct_t vs)
1622 {
1623 unsigned int i;
1624 // spl_t s;
1625
1626 VS_MAP_LOCK(vs);
1627
1628 /*
1629 * If this is an indirect structure, then we walk through the valid
1630 * (non-zero) indirect pointers and deallocate the clusters
1631 * associated with each used map entry (via ps_dealloc_vsmap).
1632 * When all of the clusters in an indirect block have been
1633 * freed, we deallocate the block. When all of the indirect
1634 * blocks have been deallocated we deallocate the memory
1635 * holding the indirect pointers.
1636 */
1637 if (vs->vs_indirect) {
1638 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1639 if (vs->vs_imap[i] != NULL) {
1640 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1641 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1642 }
1643 }
1644 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1645 } else {
1646 /*
1647 * Direct map. Free used clusters, then memory.
1648 */
1649 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1650 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1651 }
1652 VS_MAP_UNLOCK(vs);
1653
1654 bs_commit(- vs->vs_size);
1655
1656 zfree(vstruct_zone, vs);
1657 }
1658
1659 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1660
1661 int ps_map_extend(
1662 vstruct_t vs,
1663 unsigned int new_size)
1664 {
1665 struct vs_map **new_imap;
1666 struct vs_map *new_dmap = NULL;
1667 int newdsize;
1668 int i;
1669 void *old_map = NULL;
1670 int old_map_size = 0;
1671
1672 if (vs->vs_size >= new_size) {
1673 /*
1674 * Someone has already done the work.
1675 */
1676 return 0;
1677 }
1678
1679 /*
1680 * If the new size extends into the indirect range, then we have one
1681 * of two cases: we are going from indirect to indirect, or we are
1682 * going from direct to indirect. If we are going from indirect to
1683 * indirect, then it is possible that the new size will fit in the old
1684 * indirect map. If this is the case, then just reset the size of the
1685 * vstruct map and we are done. If the new size will not
1686 * fit into the old indirect map, then we have to allocate a new
1687 * indirect map and copy the old map pointers into this new map.
1688 *
1689 * If we are going from direct to indirect, then we have to allocate a
1690 * new indirect map and copy the old direct pages into the first
1691 * indirect page of the new map.
1692 * NOTE: allocating memory here is dangerous, as we're in the
1693 * pageout path.
1694 */
1695 if (INDIRECT_CLMAP(new_size)) {
1696 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1697
1698 /*
1699 * Get a new indirect map and zero it.
1700 */
1701 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1702 if (vs->vs_indirect &&
1703 (new_map_size == old_map_size)) {
1704 bs_commit(new_size - vs->vs_size);
1705 vs->vs_size = new_size;
1706 return 0;
1707 }
1708
1709 new_imap = (struct vs_map **)kalloc(new_map_size);
1710 if (new_imap == NULL) {
1711 return -1;
1712 }
1713 memset(new_imap, 0, new_map_size);
1714
1715 if (vs->vs_indirect) {
1716 /* Copy old entries into new map */
1717 memcpy(new_imap, vs->vs_imap, old_map_size);
1718 /* Arrange to free the old map */
1719 old_map = (void *) vs->vs_imap;
1720 newdsize = 0;
1721 } else { /* Old map was a direct map */
1722 /* Allocate an indirect page */
1723 if ((new_imap[0] = (struct vs_map *)
1724 kalloc(CLMAP_THRESHOLD)) == NULL) {
1725 kfree(new_imap, new_map_size);
1726 return -1;
1727 }
1728 new_dmap = new_imap[0];
1729 newdsize = CLMAP_ENTRIES;
1730 }
1731 } else {
1732 new_imap = NULL;
1733 newdsize = new_size;
1734 /*
1735 * If the new map is a direct map, then the old map must
1736 * also have been a direct map. All we have to do is
1737 * to allocate a new direct map, copy the old entries
1738 * into it and free the old map.
1739 */
1740 if ((new_dmap = (struct vs_map *)
1741 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1742 return -1;
1743 }
1744 }
1745 if (newdsize) {
1746
1747 /* Free the old map */
1748 old_map = (void *) vs->vs_dmap;
1749 old_map_size = CLMAP_SIZE(vs->vs_size);
1750
1751 /* Copy info from the old map into the new map */
1752 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1753
1754 /* Initialize the rest of the new map */
1755 for (i = vs->vs_size; i < newdsize; i++)
1756 VSM_CLR(new_dmap[i]);
1757 }
1758 if (new_imap) {
1759 vs->vs_imap = new_imap;
1760 vs->vs_indirect = TRUE;
1761 } else
1762 vs->vs_dmap = new_dmap;
1763 bs_commit(new_size - vs->vs_size);
1764 vs->vs_size = new_size;
1765 if (old_map)
1766 kfree(old_map, old_map_size);
1767 return 0;
1768 }
1769
1770 vm_offset_t
1771 ps_clmap(
1772 vstruct_t vs,
1773 vm_offset_t offset,
1774 struct clmap *clmap,
1775 int flag,
1776 vm_size_t size,
1777 int error)
1778 {
1779 vm_offset_t cluster; /* The cluster of offset. */
1780 vm_offset_t newcl; /* The new cluster allocated. */
1781 vm_offset_t newoff;
1782 unsigned int i;
1783 struct vs_map *vsmap;
1784
1785 VS_MAP_LOCK(vs);
1786
1787 ASSERT(vs->vs_dmap);
1788 cluster = atop_32(offset) >> vs->vs_clshift;
1789
1790 /*
1791 * Initialize cluster error value
1792 */
1793 clmap->cl_error = 0;
1794
1795 /*
1796 * If the object has grown, extend the page map.
1797 */
1798 if (cluster >= vs->vs_size) {
1799 if (flag == CL_FIND) {
1800 /* Do not allocate if just doing a lookup */
1801 VS_MAP_UNLOCK(vs);
1802 return (vm_offset_t) -1;
1803 }
1804 if (ps_map_extend(vs, cluster + 1)) {
1805 VS_MAP_UNLOCK(vs);
1806 return (vm_offset_t) -1;
1807 }
1808 }
1809
1810 /*
1811 * Look for the desired cluster. If the map is indirect, then we
1812 * have a two level lookup. First find the indirect block, then
1813 * find the actual cluster. If the indirect block has not yet
1814 * been allocated, then do so. If the cluster has not yet been
1815 * allocated, then do so.
1816 *
1817 * If any of the allocations fail, then return an error.
1818 * Don't allocate if just doing a lookup.
1819 */
1820 if (vs->vs_indirect) {
1821 long ind_block = cluster/CLMAP_ENTRIES;
1822
1823 /* Is the indirect block allocated? */
1824 vsmap = vs->vs_imap[ind_block];
1825 if (vsmap == NULL) {
1826 if (flag == CL_FIND) {
1827 VS_MAP_UNLOCK(vs);
1828 return (vm_offset_t) -1;
1829 }
1830
1831 /* Allocate the indirect block */
1832 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1833 if (vsmap == NULL) {
1834 VS_MAP_UNLOCK(vs);
1835 return (vm_offset_t) -1;
1836 }
1837 /* Initialize the cluster offsets */
1838 for (i = 0; i < CLMAP_ENTRIES; i++)
1839 VSM_CLR(vsmap[i]);
1840 vs->vs_imap[ind_block] = vsmap;
1841 }
1842 } else
1843 vsmap = vs->vs_dmap;
1844
1845 ASSERT(vsmap);
1846 vsmap += cluster%CLMAP_ENTRIES;
1847
1848 /*
1849 * At this point, vsmap points to the struct vs_map desired.
1850 *
1851 	 * Look in the map for the cluster; if there was an error on a
1852 	 * previous write, flag it and return.  If it is not yet
1853 	 * allocated, then allocate it if we're writing; if we're
1854 * doing a lookup and the cluster's not allocated, return error.
1855 */
1856 if (VSM_ISERR(*vsmap)) {
1857 clmap->cl_error = VSM_GETERR(*vsmap);
1858 VS_MAP_UNLOCK(vs);
1859 return (vm_offset_t) -1;
1860 } else if (VSM_ISCLR(*vsmap)) {
1861 int psindex;
1862
1863 if (flag == CL_FIND) {
1864 /*
1865 * If there's an error and the entry is clear, then
1866 * we've run out of swap space. Record the error
1867 * here and return.
1868 */
1869 if (error) {
1870 VSM_SETERR(*vsmap, error);
1871 }
1872 VS_MAP_UNLOCK(vs);
1873 return (vm_offset_t) -1;
1874 } else {
1875 /*
1876 * Attempt to allocate a cluster from the paging segment
1877 */
1878 newcl = ps_allocate_cluster(vs, &psindex,
1879 PAGING_SEGMENT_NULL);
1880 if (newcl == (vm_offset_t) -1) {
1881 VS_MAP_UNLOCK(vs);
1882 return (vm_offset_t) -1;
1883 }
1884 VSM_CLR(*vsmap);
1885 VSM_SETCLOFF(*vsmap, newcl);
1886 VSM_SETPS(*vsmap, psindex);
1887 }
1888 } else
1889 newcl = VSM_CLOFF(*vsmap);
1890
1891 /*
1892 * Fill in pertinent fields of the clmap
1893 */
1894 clmap->cl_ps = VSM_PS(*vsmap);
1895 clmap->cl_numpages = VSCLSIZE(vs);
1896 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1897
1898 /*
1899 * Byte offset in paging segment is byte offset to cluster plus
1900 * byte offset within cluster. It looks ugly, but should be
1901 * relatively quick.
1902 */
1903 ASSERT(trunc_page(offset) == offset);
1904 newcl = ptoa_32(newcl) << vs->vs_clshift;
1905 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1906 if (flag == CL_ALLOC) {
1907 /*
1908 * set bits in the allocation bitmap according to which
1909 * pages were requested. size is in bytes.
1910 */
1911 i = atop_32(newoff);
1912 while ((size > 0) && (i < VSCLSIZE(vs))) {
1913 VSM_SETALLOC(*vsmap, i);
1914 i++;
1915 size -= vm_page_size;
1916 }
1917 }
1918 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1919 if (newoff) {
1920 /*
1921 * Offset is not cluster aligned, so number of pages
1922 * and bitmaps must be adjusted
1923 */
1924 clmap->cl_numpages -= atop_32(newoff);
1925 CLMAP_SHIFT(clmap, vs);
1926 CLMAP_SHIFTALLOC(clmap, vs);
1927 }
1928
1929 /*
1930 *
1931 * The setting of valid bits and handling of write errors
1932 * must be done here, while we hold the lock on the map.
1933 * It logically should be done in ps_vs_write_complete().
1934 * The size and error information has been passed from
1935 * ps_vs_write_complete(). If the size parameter is non-zero,
1936 * then there is work to be done. If error is also non-zero,
1937 * then the error number is recorded in the cluster and the
1938 * entire cluster is in error.
1939 */
1940 if (size && flag == CL_FIND) {
1941 vm_offset_t off = (vm_offset_t) 0;
1942
1943 if (!error) {
1944 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1945 i++) {
1946 VSM_SETPG(*vsmap, i);
1947 size -= vm_page_size;
1948 }
1949 ASSERT(i <= VSCLSIZE(vs));
1950 } else {
1951 BS_STAT(clmap->cl_ps->ps_bs,
1952 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1953 atop_32(size));
1954 off = VSM_CLOFF(*vsmap);
1955 VSM_SETERR(*vsmap, error);
1956 }
1957 /*
1958 * Deallocate cluster if error, and no valid pages
1959 * already present.
1960 */
1961 if (off != (vm_offset_t) 0)
1962 ps_deallocate_cluster(clmap->cl_ps, off);
1963 VS_MAP_UNLOCK(vs);
1964 return (vm_offset_t) 0;
1965 } else
1966 VS_MAP_UNLOCK(vs);
1967
1968 DP_DEBUG(DEBUG_VS_INTERNAL,
1969 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1970 newcl+newoff, (int) vs, (int) vsmap, flag));
1971 DP_DEBUG(DEBUG_VS_INTERNAL,
1972 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1973 (int) clmap->cl_ps, clmap->cl_numpages,
1974 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1975
1976 return (newcl + newoff);
1977 }
1978
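/*
 * Illustrative sketch (not part of the original file) of the offset
 * arithmetic in ps_clmap() above, assuming 4KB pages (vm_page_shift 12)
 * and a cluster shift of 2: an object offset of 0x5000 falls in
 * cluster 1, at byte offset 0x1000 within that cluster.
 */
#if 0
static void
example_clmap_offsets(vstruct_t vs, vm_offset_t offset)
{
	vm_offset_t cluster = atop_32(offset) >> vs->vs_clshift;
	vm_offset_t newoff = offset &
	    ((1 << (vm_page_shift + vs->vs_clshift)) - 1);

	/* segment byte offset = (ptoa_32(segment cluster) << vs_clshift) + newoff */
	(void) cluster;
	(void) newoff;
}
#endif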
1979 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1980
1981 void
1982 ps_clunmap(
1983 vstruct_t vs,
1984 vm_offset_t offset,
1985 vm_size_t length)
1986 {
1987 vm_offset_t cluster; /* The cluster number of offset */
1988 struct vs_map *vsmap;
1989
1990 VS_MAP_LOCK(vs);
1991
1992 /*
1993 * Loop through all clusters in this range, freeing paging segment
1994 * clusters and map entries as encountered.
1995 */
1996 while (length > 0) {
1997 vm_offset_t newoff;
1998 unsigned int i;
1999
2000 cluster = atop_32(offset) >> vs->vs_clshift;
2001 if (vs->vs_indirect) /* indirect map */
2002 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2003 else
2004 vsmap = vs->vs_dmap;
2005 if (vsmap == NULL) {
2006 VS_MAP_UNLOCK(vs);
2007 return;
2008 }
2009 vsmap += cluster%CLMAP_ENTRIES;
2010 if (VSM_ISCLR(*vsmap)) {
2011 length -= vm_page_size;
2012 offset += vm_page_size;
2013 continue;
2014 }
2015 /*
2016 * We've got a valid mapping. Clear it and deallocate
2017 * paging segment cluster pages.
2018 		 * Optimize for clearing an entire cluster.
2019 */
2020 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2021 /*
2022 * Not cluster aligned.
2023 */
2024 ASSERT(trunc_page(newoff) == newoff);
2025 i = atop_32(newoff);
2026 } else
2027 i = 0;
2028 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2029 VSM_CLRPG(*vsmap, i);
2030 VSM_CLRALLOC(*vsmap, i);
2031 length -= vm_page_size;
2032 offset += vm_page_size;
2033 i++;
2034 }
2035
2036 /*
2037 * If map entry is empty, clear and deallocate cluster.
2038 */
2039 if (!VSM_ALLOC(*vsmap)) {
2040 ps_deallocate_cluster(VSM_PS(*vsmap),
2041 VSM_CLOFF(*vsmap));
2042 VSM_CLR(*vsmap);
2043 }
2044 }
2045
2046 VS_MAP_UNLOCK(vs);
2047 }
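/*
 * A minimal user-space sketch of the offset arithmetic used above and in
 * ps_clmap(): a byte offset is turned into a cluster index and a page index
 * within that cluster by shifting and masking.  The DEMO_* constants are
 * hypothetical stand-ins for vm_page_shift and vs->vs_clshift, not the
 * kernel's values.
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12u     /* 4 KB pages */
#define DEMO_CL_SHIFT   2u      /* 4 pages per cluster */

int
main(void)
{
        unsigned int offset = 0x00017000;               /* arbitrary byte offset */
        unsigned int page = offset >> DEMO_PAGE_SHIFT;  /* atop_32() equivalent */
        unsigned int cluster = page >> DEMO_CL_SHIFT;   /* which cluster */
        unsigned int newoff = offset &
            ((1u << (DEMO_PAGE_SHIFT + DEMO_CL_SHIFT)) - 1); /* byte offset in cluster */
        unsigned int page_in_cl = newoff >> DEMO_PAGE_SHIFT; /* bit index in the bitmap */

        printf("offset 0x%x -> cluster %u, page %u within cluster (byte 0x%x)\n",
            offset, cluster, page_in_cl, newoff);
        return 0;
}
#endif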
2048
2049 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2050
2051 void
2052 ps_vs_write_complete(
2053 vstruct_t vs,
2054 vm_offset_t offset,
2055 vm_size_t size,
2056 int error)
2057 {
2058 struct clmap clmap;
2059
2060 /*
2061 * Get the struct vsmap for this cluster.
2062 * Use READ, even though it was written, because the
2063 * cluster MUST be present, unless there was an error
2064 * in the original ps_clmap (e.g. no space), in which
2065 * case, nothing happens.
2066 *
2067 * Must pass enough information to ps_clmap to allow it
2068 * to set the vs_map structure bitmap under lock.
2069 */
2070 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2071 }
2072
2073 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2074
2075 void
2076 vs_cl_write_complete(
2077 vstruct_t vs,
2078 __unused paging_segment_t ps,
2079 vm_offset_t offset,
2080 __unused vm_offset_t addr,
2081 vm_size_t size,
2082 boolean_t async,
2083 int error)
2084 {
2085 // kern_return_t kr;
2086
2087 if (error) {
2088 /*
2089 * For internal objects, the error is recorded on a
2090 * per-cluster basis by ps_clmap() which is called
2091 * by ps_vs_write_complete() below.
2092 */
2093 dprintf(("write failed error = 0x%x\n", error));
2094 /* add upl_abort code here */
2095 } else
2096 GSTAT(global_stats.gs_pages_out += atop_32(size));
2097 /*
2098 * Notify the vstruct mapping code, so it can do its accounting.
2099 */
2100 ps_vs_write_complete(vs, offset, size, error);
2101
2102 if (async) {
2103 VS_LOCK(vs);
2104 ASSERT(vs->vs_async_pending > 0);
2105 vs->vs_async_pending -= size;
2106 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2107 vs->vs_waiting_async = FALSE;
2108 VS_UNLOCK(vs);
2109 /* mutex_unlock(&vs->vs_waiting_async); */
2110 thread_wakeup(&vs->vs_async_pending);
2111 } else {
2112 VS_UNLOCK(vs);
2113 }
2114 }
2115 }
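/*
 * The async completion path above drains vs_async_pending and wakes any
 * waiter when it reaches zero.  Below is a rough user-space analogue of
 * that counter-plus-wakeup pattern using pthreads; struct demo_pending and
 * the demo_* functions are hypothetical and only sketch the idea, not the
 * pager's locking.
 */
#if 0 /* illustration only -- never compiled */
#include <pthread.h>

struct demo_pending {
        pthread_mutex_t lock;
        pthread_cond_t  drained;
        unsigned long   pending;        /* outstanding async work */
};

static void
demo_complete(struct demo_pending *p, unsigned long amount)
{
        pthread_mutex_lock(&p->lock);
        p->pending -= amount;
        if (p->pending == 0)
                pthread_cond_broadcast(&p->drained);    /* thread_wakeup() analogue */
        pthread_mutex_unlock(&p->lock);
}

static void
demo_wait_for_drain(struct demo_pending *p)
{
        pthread_mutex_lock(&p->lock);
        while (p->pending != 0)
                pthread_cond_wait(&p->drained, &p->lock);
        pthread_mutex_unlock(&p->lock);
}
#endif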
2116
2117 #ifdef DEVICE_PAGING
2118 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2119
2120 kern_return_t
2121 device_write_reply(
2122 MACH_PORT_FACE reply_port,
2123 kern_return_t device_code,
2124 io_buf_len_t bytes_written)
2125 {
2126 struct vs_async *vsa;
2127
2128 vsa = (struct vs_async *)
2129 ((struct vstruct_alias *)(reply_port->alias))->vs;
2130
2131 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2132 device_code = KERN_FAILURE;
2133 }
2134
2135 vsa->vsa_error = device_code;
2136
2137
2138 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2139 if(vsa->vsa_flags & VSA_TRANSFER) {
2140 /* revisit when async disk segments redone */
2141 if(vsa->vsa_error) {
2142 /* need to consider error condition. re-write data or */
2143 /* throw it away here. */
2144 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2145 }
2146 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2147 vsa->vsa_size, vsa->vsa_error);
2148 } else {
2149 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2150 vsa->vsa_addr, vsa->vsa_size, TRUE,
2151 vsa->vsa_error);
2152 }
2153 VS_FREE_ASYNC(vsa);
2154
2155 return KERN_SUCCESS;
2156 }
2157
2158 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2159 kern_return_t
2160 device_write_reply_inband(
2161 MACH_PORT_FACE reply_port,
2162 kern_return_t return_code,
2163 io_buf_len_t bytes_written)
2164 {
2165 panic("device_write_reply_inband: illegal");
2166 return KERN_SUCCESS;
2167 }
2168
2169 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2170 kern_return_t
2171 device_read_reply(
2172 MACH_PORT_FACE reply_port,
2173 kern_return_t return_code,
2174 io_buf_ptr_t data,
2175 mach_msg_type_number_t dataCnt)
2176 {
2177 struct vs_async *vsa;
2178 vsa = (struct vs_async *)
2179 ((struct vstruct_alias *)(reply_port->alias))->vs;
2180 vsa->vsa_addr = (vm_offset_t)data;
2181 vsa->vsa_size = (vm_size_t)dataCnt;
2182 vsa->vsa_error = return_code;
2183 thread_wakeup(&vsa->vsa_lock);
2184 return KERN_SUCCESS;
2185 }
2186
2187 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2188 kern_return_t
2189 device_read_reply_inband(
2190 MACH_PORT_FACE reply_port,
2191 kern_return_t return_code,
2192 io_buf_ptr_inband_t data,
2193 mach_msg_type_number_t dataCnt)
2194 {
2195 panic("device_read_reply_inband: illegal");
2196 return KERN_SUCCESS;
2197 }
2198
2199 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2200 kern_return_t
2201 device_read_reply_overwrite(
2202 MACH_PORT_FACE reply_port,
2203 kern_return_t return_code,
2204 io_buf_len_t bytes_read)
2205 {
2206 panic("device_read_reply_overwrite: illegal\n");
2207 return KERN_SUCCESS;
2208 }
2209
2210 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2211 kern_return_t
2212 device_open_reply(
2213 MACH_PORT_FACE reply_port,
2214 kern_return_t return_code,
2215 MACH_PORT_FACE device_port)
2216 {
2217 panic("device_open_reply: illegal\n");
2218 return KERN_SUCCESS;
2219 }
2220
2221 kern_return_t
2222 ps_read_device(
2223 paging_segment_t ps,
2224 vm_offset_t offset,
2225 vm_offset_t *bufferp,
2226 unsigned int size,
2227 unsigned int *residualp,
2228 int flags)
2229 {
2230 kern_return_t kr;
2231 recnum_t dev_offset;
2232 unsigned int bytes_wanted;
2233 unsigned int bytes_read;
2234 unsigned int total_read;
2235 vm_offset_t dev_buffer;
2236 vm_offset_t buf_ptr;
2237 unsigned int records_read;
2238 struct vs_async *vsa;
2239 mutex_t vs_waiting_read_reply;
2240
2241 device_t device;
2242 vm_map_copy_t device_data = NULL;
2243 default_pager_thread_t *dpt = NULL;
2244
2245 device = dev_port_lookup(ps->ps_device);
2246 clustered_reads[atop_32(size)]++;
2247
2248 dev_offset = (ps->ps_offset +
2249 (offset >> (vm_page_shift - ps->ps_record_shift)));
2250 bytes_wanted = size;
2251 total_read = 0;
2252 *bufferp = (vm_offset_t)NULL;
2253
2254 do {
2255 vsa = VS_ALLOC_ASYNC();
2256 if (vsa) {
2257 vsa->vsa_vs = NULL;
2258 vsa->vsa_addr = 0;
2259 vsa->vsa_offset = 0;
2260 vsa->vsa_size = 0;
2261 vsa->vsa_ps = NULL;
2262 }
2263 mutex_init(&vsa->vsa_lock, 0);
2264 ip_lock(vsa->reply_port);
2265 vsa->reply_port->ip_sorights++;
2266 ip_reference(vsa->reply_port);
2267 ip_unlock(vsa->reply_port);
2268 kr = ds_device_read_common(device,
2269 vsa->reply_port,
2270 (mach_msg_type_name_t)
2271 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2272 (dev_mode_t) 0,
2273 dev_offset,
2274 bytes_wanted,
2275 (IO_READ | IO_CALL),
2276 (io_buf_ptr_t *) &dev_buffer,
2277 (mach_msg_type_number_t *) &bytes_read);
2278 if(kr == MIG_NO_REPLY) {
2279 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2280 thread_block(THREAD_CONTINUE_NULL);
2281
2282 dev_buffer = vsa->vsa_addr;
2283 bytes_read = (unsigned int)vsa->vsa_size;
2284 kr = vsa->vsa_error;
2285 }
2286 VS_FREE_ASYNC(vsa);
2287 if (kr != KERN_SUCCESS || bytes_read == 0) {
2288 break;
2289 }
2290 total_read += bytes_read;
2291
2292 /*
2293 * If we got the entire range, use the returned dev_buffer.
2294 */
2295 if (bytes_read == size) {
2296 *bufferp = (vm_offset_t)dev_buffer;
2297 break;
2298 }
2299
2300 #if 1
2301 dprintf(("read only %d bytes out of %d\n",
2302 bytes_read, bytes_wanted));
2303 #endif
2304 if(dpt == NULL) {
2305 dpt = get_read_buffer();
2306 buf_ptr = dpt->dpt_buffer;
2307 *bufferp = (vm_offset_t)buf_ptr;
2308 }
2309 /*
2310 * Otherwise, copy the data into the provided buffer (*bufferp)
2311 * and append the rest of the range as it comes in.
2312 */
2313 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2314 buf_ptr += bytes_read;
2315 bytes_wanted -= bytes_read;
2316 records_read = (bytes_read >>
2317 (vm_page_shift - ps->ps_record_shift));
2318 dev_offset += records_read;
2319 DP_DEBUG(DEBUG_VS_INTERNAL,
2320 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2321 dev_buffer, bytes_read));
2322 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2323 != KERN_SUCCESS)
2324 Panic("dealloc buf");
2325 } while (bytes_wanted);
2326
2327 *residualp = size - total_read;
2328 if((dev_buffer != *bufferp) && (total_read != 0)) {
2329 vm_offset_t temp_buffer;
2330 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2331 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2332 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2333 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2334 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2335 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2336 (vm_map_copy_t *)&device_data, FALSE))
2337 panic("ps_read_device: cannot copyin locally provided buffer\n");
2338 }
2339 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2340 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2341 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2342 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2343 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2344 (vm_map_copy_t *)&device_data, FALSE))
2345 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2346 }
2347 else {
2348 device_data = NULL;
2349 }
2350 *bufferp = (vm_offset_t)device_data;
2351
2352 if(dpt != NULL) {
2353 /* Free the receive buffer */
2354 dpt->checked_out = 0;
2355 thread_wakeup(&dpt_array);
2356 }
2357 return KERN_SUCCESS;
2358 }
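/*
 * ps_read_device() above copes with short reads by staging partial results
 * and re-issuing the request for the remainder at an advanced offset.  A
 * minimal user-space sketch of the same retry-and-append loop using POSIX
 * pread(); demo_read_fully() is a hypothetical helper, not kernel code.
 */
#if 0 /* illustration only -- never compiled */
#include <unistd.h>
#include <sys/types.h>

static ssize_t
demo_read_fully(int fd, void *buf, size_t size, off_t offset)
{
        size_t total = 0;

        while (total < size) {
                ssize_t n = pread(fd, (char *)buf + total, size - total,
                    offset + (off_t)total);
                if (n <= 0)             /* error or EOF: return what we have */
                        return (n < 0) ? n : (ssize_t)total;
                total += (size_t)n;     /* append and advance, like buf_ptr/dev_offset */
        }
        return (ssize_t)total;
}
#endif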
2359
2360 kern_return_t
2361 ps_write_device(
2362 paging_segment_t ps,
2363 vm_offset_t offset,
2364 vm_offset_t addr,
2365 unsigned int size,
2366 struct vs_async *vsa)
2367 {
2368 recnum_t dev_offset;
2369 io_buf_len_t bytes_to_write, bytes_written;
2370 recnum_t records_written;
2371 kern_return_t kr;
2372 MACH_PORT_FACE reply_port;
2373
2374
2375
2376 clustered_writes[atop_32(size)]++;
2377
2378 dev_offset = (ps->ps_offset +
2379 (offset >> (vm_page_shift - ps->ps_record_shift)));
2380 bytes_to_write = size;
2381
2382 if (vsa) {
2383 /*
2384 * Asynchronous write.
2385 */
2386 reply_port = vsa->reply_port;
2387 ip_lock(reply_port);
2388 reply_port->ip_sorights++;
2389 ip_reference(reply_port);
2390 ip_unlock(reply_port);
2391 {
2392 device_t device;
2393 device = dev_port_lookup(ps->ps_device);
2394
2395 vsa->vsa_addr = addr;
2396 kr=ds_device_write_common(device,
2397 reply_port,
2398 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2399 (dev_mode_t) 0,
2400 dev_offset,
2401 (io_buf_ptr_t) addr,
2402 size,
2403 (IO_WRITE | IO_CALL),
2404 &bytes_written);
2405 }
2406 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2407 if (verbose)
2408 dprintf(("%s0x%x, addr=0x%x,"
2409 "size=0x%x,offset=0x%x\n",
2410 "device_write_request returned ",
2411 kr, addr, size, offset));
2412 BS_STAT(ps->ps_bs,
2413 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2414 /* do the completion notification to free resources */
2415 device_write_reply(reply_port, kr, 0);
2416 return PAGER_ERROR;
2417 }
2418 } else do {
2419 /*
2420 * Synchronous write.
2421 */
2422 {
2423 device_t device;
2424 device = dev_port_lookup(ps->ps_device);
2425 kr=ds_device_write_common(device,
2426 IP_NULL, 0,
2427 (dev_mode_t) 0,
2428 dev_offset,
2429 (io_buf_ptr_t) addr,
2430 size,
2431 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2432 &bytes_written);
2433 }
2434 if (kr != KERN_SUCCESS) {
2435 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2436 "device_write returned ",
2437 kr, addr, size, offset));
2438 BS_STAT(ps->ps_bs,
2439 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2440 return PAGER_ERROR;
2441 }
2442 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2443 Panic("fragmented write");
2444 records_written = (bytes_written >>
2445 (vm_page_shift - ps->ps_record_shift));
2446 dev_offset += records_written;
2447 #if 1
2448 if (bytes_written != bytes_to_write) {
2449 dprintf(("wrote only %d bytes out of %d\n",
2450 bytes_written, bytes_to_write));
2451 }
2452 #endif
2453 bytes_to_write -= bytes_written;
2454 addr += bytes_written;
2455 } while (bytes_to_write > 0);
2456
2457 return PAGER_SUCCESS;
2458 }
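/*
 * Both device paths above convert byte offsets and byte counts into device
 * records with shifts: ps_record_shift is log2(page_size / record_size), so
 * "bytes >> (vm_page_shift - ps_record_shift)" is a division by the record
 * size.  A small sketch of that arithmetic with hypothetical numbers:
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

int
main(void)
{
        /* hypothetical geometry: 4 KB pages on a 512-byte-record device */
        unsigned int page_shift = 12;
        unsigned int record_size = 512;
        unsigned int record_shift = 0;

        while ((record_size << record_shift) < (1u << page_shift))
                record_shift++;                 /* log2(page/record) = 3 here */

        unsigned int byte_offset = 24576;       /* 6 pages into the segment */
        unsigned int bytes = 8192;              /* a 2-page transfer */

        /* byte -> record conversions used for dev_offset and records_written */
        unsigned int dev_record = byte_offset >> (page_shift - record_shift);
        unsigned int records = bytes >> (page_shift - record_shift);

        printf("byte offset %u -> record %u, %u bytes -> %u records\n",
            byte_offset, dev_record, bytes, records);
        return 0;
}
#endif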
2459
2460
2461 #else /* !DEVICE_PAGING */
2462
2463 kern_return_t
2464 ps_read_device(
2465 __unused paging_segment_t ps,
2466 __unused vm_offset_t offset,
2467 __unused vm_offset_t *bufferp,
2468 __unused unsigned int size,
2469 __unused unsigned int *residualp,
2470 __unused int flags)
2471 {
2472 panic("ps_read_device not supported");
2473 }
2474
2475 kern_return_t
2476 ps_write_device(
2477 __unused paging_segment_t ps,
2478 __unused vm_offset_t offset,
2479 __unused vm_offset_t addr,
2480 __unused unsigned int size,
2481 __unused struct vs_async *vsa)
2482 {
2483 panic("ps_write_device not supported");
2484 }
2485
2486 #endif /* DEVICE_PAGING */
2487 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2488
2489 void
2490 pvs_object_data_provided(
2491 __unused vstruct_t vs,
2492 __unused upl_t upl,
2493 __unused upl_offset_t offset,
2494 upl_size_t size)
2495 {
2496
2497 DP_DEBUG(DEBUG_VS_INTERNAL,
2498 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2499 upl, offset, size));
2500
2501 ASSERT(size > 0);
2502 GSTAT(global_stats.gs_pages_in += atop_32(size));
2503
2504
2505 #if USE_PRECIOUS
2506 ps_clunmap(vs, offset, size);
2507 #endif /* USE_PRECIOUS */
2508
2509 }
2510
2511 kern_return_t
2512 pvs_cluster_read(
2513 vstruct_t vs,
2514 vm_offset_t vs_offset,
2515 vm_size_t cnt)
2516 {
2517 upl_t upl;
2518 kern_return_t error = KERN_SUCCESS;
2519 int size;
2520 int residual;
2521 unsigned int request_flags;
2522 int seg_index;
2523 int pages_in_cl;
2524 int cl_size;
2525 int cl_mask;
2526 int cl_index;
2527 int xfer_size;
2528 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2529 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2530 struct clmap clmap;
2531
2532 pages_in_cl = 1 << vs->vs_clshift;
2533 cl_size = pages_in_cl * vm_page_size;
2534 cl_mask = cl_size - 1;
2535
2536 /*
2537 * This loop will be executed multiple times until the entire
2538 * request has been satisfied... if the request spans cluster
2539 * boundaries, the clusters will be checked for logical continuity,
2540 * if contiguous the I/O request will span multiple clusters, otherwise
2541 * it will be broken up into the minimal set of I/O's
2542 *
2543 * If there are holes in a request (either unallocated pages in a paging
2544 * segment or an unallocated paging segment), we stop
2545 * reading at the hole, inform the VM of any data read, inform
2546 * the VM of an unavailable range, then loop again, hoping to
2547 * find valid pages later in the requested range. This continues until
2548 * the entire range has been examined, and read, if present.
2549 */
2550
2551 #if USE_PRECIOUS
2552 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2553 #else
2554 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2555 #endif
2556
2557 assert(dp_encryption_inited);
2558 if (dp_encryption) {
2559 /*
2560 * ENCRYPTED SWAP:
2561 * request that the UPL be prepared for
2562 * decryption.
2563 */
2564 request_flags |= UPL_ENCRYPT;
2565 }
2566
2567 while (cnt && (error == KERN_SUCCESS)) {
2568 int ps_info_valid;
2569 int page_list_count;
2570
2571 if((vs_offset & cl_mask) &&
2572 (cnt > (VM_SUPER_CLUSTER -
2573 (vs_offset & cl_mask)))) {
2574 size = VM_SUPER_CLUSTER;
2575 size -= vs_offset & cl_mask;
2576 } else if (cnt > VM_SUPER_CLUSTER) {
2577 size = VM_SUPER_CLUSTER;
2578 } else {
2579 size = cnt;
2580 }
2581 cnt -= size;
2582
2583 ps_info_valid = 0;
2584 seg_index = 0;
2585
2586 while (size > 0 && error == KERN_SUCCESS) {
2587 int abort_size;
2588 int failed_size;
2589 int beg_pseg;
2590 int beg_indx;
2591 vm_offset_t cur_offset;
2592
2593
2594 if ( !ps_info_valid) {
2595 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2596 psp[seg_index] = CLMAP_PS(clmap);
2597 ps_info_valid = 1;
2598 }
2599 /*
2600 * skip over unallocated physical segments
2601 */
2602 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2603 abort_size = cl_size - (vs_offset & cl_mask);
2604 abort_size = MIN(abort_size, size);
2605
2606 page_list_count = 0;
2607 memory_object_super_upl_request(
2608 vs->vs_control,
2609 (memory_object_offset_t)vs_offset,
2610 abort_size, abort_size,
2611 &upl, NULL, &page_list_count,
2612 request_flags);
2613
2614 if (clmap.cl_error) {
2615 upl_abort(upl, UPL_ABORT_ERROR);
2616 } else {
2617 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2618 }
2619 upl_deallocate(upl);
2620
2621 size -= abort_size;
2622 vs_offset += abort_size;
2623
2624 seg_index++;
2625 ps_info_valid = 0;
2626 continue;
2627 }
2628 cl_index = (vs_offset & cl_mask) / vm_page_size;
2629
2630 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2631 /*
2632 * skip over unallocated pages
2633 */
2634 if (CLMAP_ISSET(clmap, cl_index))
2635 break;
2636 abort_size += vm_page_size;
2637 }
2638 if (abort_size) {
2639 /*
2640 * Let VM system know about holes in clusters.
2641 */
2642 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2643
2644 page_list_count = 0;
2645 memory_object_super_upl_request(
2646 vs->vs_control,
2647 (memory_object_offset_t)vs_offset,
2648 abort_size, abort_size,
2649 &upl, NULL, &page_list_count,
2650 request_flags);
2651
2652 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2653 upl_deallocate(upl);
2654
2655 size -= abort_size;
2656 vs_offset += abort_size;
2657
2658 if (cl_index == pages_in_cl) {
2659 /*
2660 * if we're at the end of this physical cluster
2661 * then bump to the next one and continue looking
2662 */
2663 seg_index++;
2664 ps_info_valid = 0;
2665 continue;
2666 }
2667 if (size == 0)
2668 break;
2669 }
2670 /*
2671 * remember the starting point of the first allocated page
2672 * for the I/O we're about to issue
2673 */
2674 beg_pseg = seg_index;
2675 beg_indx = cl_index;
2676 cur_offset = vs_offset;
2677
2678 /*
2679 * calculate the size of the I/O that we can do...
2680 * this may span multiple physical segments if
2681 * they are contiguous
2682 */
2683 for (xfer_size = 0; xfer_size < size; ) {
2684
2685 while (cl_index < pages_in_cl
2686 && xfer_size < size) {
2687 /*
2688 * accumulate allocated pages within
2689 * a physical segment
2690 */
2691 if (CLMAP_ISSET(clmap, cl_index)) {
2692 xfer_size += vm_page_size;
2693 cur_offset += vm_page_size;
2694 cl_index++;
2695
2696 BS_STAT(psp[seg_index]->ps_bs,
2697 psp[seg_index]->ps_bs->bs_pages_in++);
2698 } else
2699 break;
2700 }
2701 if (cl_index < pages_in_cl
2702 || xfer_size >= size) {
2703 /*
2704 * we've hit an unallocated page or
2705 * the end of this request... go fire
2706 * the I/O
2707 */
2708 break;
2709 }
2710 /*
2711 * we've hit the end of the current physical
2712 * segment and there's more to do, so try
2713 * moving to the next one
2714 */
2715 seg_index++;
2716
2717 ps_offset[seg_index] =
2718 ps_clmap(vs,
2719 cur_offset & ~cl_mask,
2720 &clmap, CL_FIND, 0, 0);
2721 psp[seg_index] = CLMAP_PS(clmap);
2722 ps_info_valid = 1;
2723
2724 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2725 /*
2726 * if the physical segment we're about
2727 * to step into is not contiguous to
2728 * the one we're currently in, or it's
2729 * in a different paging file, or
2730 * it hasn't been allocated....
2731 * we stop here and generate the I/O
2732 */
2733 break;
2734 }
2735 /*
2736 * start with first page of the next physical
2737 * segment
2738 */
2739 cl_index = 0;
2740 }
2741 if (xfer_size) {
2742 /*
2743 * we have a contiguous range of allocated pages
2744 * to read from
2745 */
2746 page_list_count = 0;
2747 memory_object_super_upl_request(vs->vs_control,
2748 (memory_object_offset_t)vs_offset,
2749 xfer_size, xfer_size,
2750 &upl, NULL, &page_list_count,
2751 request_flags | UPL_SET_INTERNAL);
2752
2753 error = ps_read_file(psp[beg_pseg],
2754 upl, (upl_offset_t) 0,
2755 ps_offset[beg_pseg] +
2756 (beg_indx * vm_page_size),
2757 xfer_size, &residual, 0);
2758 } else
2759 continue;
2760
2761 failed_size = 0;
2762
2763 /*
2764 * Adjust counts and send response to VM. Optimize
2765 * for the common case, i.e. no error and/or partial
2766 * data. If there was an error, then we need to error
2767 * the entire range, even if some data was successfully
2768 * read. If there was a partial read we may supply some
2769 * data and may error some as well. In all cases the
2770 * VM must receive some notification for every page
2771 * in the range.
2772 */
2773 if ((error == KERN_SUCCESS) && (residual == 0)) {
2774 /*
2775 * Got everything we asked for, supply the data
2776 * to the VM. Note that as a side effect of
2777 * supplying the data, the buffer holding the
2778 * supplied data is deallocated from the pager's
2779 * address space.
2780 */
2781 pvs_object_data_provided(
2782 vs, upl, vs_offset, xfer_size);
2783 } else {
2784 failed_size = xfer_size;
2785
2786 if (error == KERN_SUCCESS) {
2787 if (residual == xfer_size) {
2788 /*
2789 * If a read operation returns no error
2790 * and no data moved, we turn it into
2791 * an error, assuming we're reading at
2792 * or beyond EOF.
2793 * Fall through and error the entire
2794 * range.
2795 */
2796 error = KERN_FAILURE;
2797 } else {
2798 /*
2799 * Otherwise, we have a partial read. If
2800 * the part read is an integral number
2801 * of pages supply it. Otherwise round
2802 * it up to a page boundary, zero fill
2803 * the unread part, and supply it.
2804 * Fall through and error the remainder
2805 * of the range, if any.
2806 */
2807 int fill, lsize;
2808
2809 fill = residual
2810 & ~vm_page_size;
2811 lsize = (xfer_size - residual)
2812 + fill;
2813 pvs_object_data_provided(
2814 vs, upl,
2815 vs_offset, lsize);
2816
2817 if (lsize < xfer_size) {
2818 failed_size =
2819 xfer_size - lsize;
2820 error = KERN_FAILURE;
2821 }
2822 }
2823 }
2824 }
2825 /*
2826 * If there was an error in any part of the range, tell
2827 * the VM. Note that error is explicitly checked again
2828 * since it can be modified above.
2829 */
2830 if (error != KERN_SUCCESS) {
2831 BS_STAT(psp[beg_pseg]->ps_bs,
2832 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2833 += atop_32(failed_size));
2834 }
2835 size -= xfer_size;
2836 vs_offset += xfer_size;
2837 }
2838
2839 } /* END while (cnt && (error == 0)) */
2840 return error;
2841 }
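/*
 * The read loop above walks the per-cluster allocation bitmap, reporting
 * holes to the VM and issuing one I/O per maximal run of present pages.
 * A stripped-down sketch of that run/hole scan over a single bitmap; the
 * bitmap value and page count are made up for illustration.
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

int
main(void)
{
        unsigned int bmap = 0x0000039D; /* hypothetical: pages 0,2,3,4,7,8,9 present */
        unsigned int pages = 16;        /* pages per cluster in this example */
        unsigned int i = 0;

        while (i < pages) {
                unsigned int start = i;

                if (bmap & (1u << i)) {
                        while (i < pages && (bmap & (1u << i)))
                                i++;
                        printf("read  pages %u..%u in one I/O\n", start, i - 1);
                } else {
                        while (i < pages && !(bmap & (1u << i)))
                                i++;
                        printf("hole  pages %u..%u -> report unavailable\n",
                            start, i - 1);
                }
        }
        return 0;
}
#endif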
2842
2843 int vs_do_async_write = 1;
2844
2845 kern_return_t
2846 vs_cluster_write(
2847 vstruct_t vs,
2848 upl_t internal_upl,
2849 upl_offset_t offset,
2850 upl_size_t cnt,
2851 boolean_t dp_internal,
2852 int flags)
2853 {
2854 upl_size_t transfer_size;
2855 int error = 0;
2856 struct clmap clmap;
2857
2858 vm_offset_t actual_offset; /* Offset within paging segment */
2859 paging_segment_t ps;
2860 vm_offset_t mobj_base_addr;
2861 vm_offset_t mobj_target_addr;
2862
2863 upl_t upl;
2864 upl_page_info_t *pl;
2865 int page_index;
2866 int list_size;
2867 int pages_in_cl;
2868 unsigned int cl_size;
2869 int base_index;
2870 unsigned int seg_size;
2871
2872 pages_in_cl = 1 << vs->vs_clshift;
2873 cl_size = pages_in_cl * vm_page_size;
2874
2875 if (!dp_internal) {
2876 int page_list_count;
2877 int request_flags;
2878 unsigned int super_size;
2879 int first_dirty;
2880 int num_dirty;
2881 int num_of_pages;
2882 int seg_index;
2883 upl_offset_t upl_offset;
2884 vm_offset_t seg_offset;
2885 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2886 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2887
2888
2889 if (bs_low) {
2890 super_size = cl_size;
2891
2892 request_flags = UPL_NOBLOCK |
2893 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2894 UPL_NO_SYNC | UPL_SET_INTERNAL;
2895 } else {
2896 super_size = VM_SUPER_CLUSTER;
2897
2898 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2899 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2900 UPL_NO_SYNC | UPL_SET_INTERNAL;
2901 }
2902
2903 if (!dp_encryption_inited) {
2904 /*
2905 * ENCRYPTED SWAP:
2906 * Once we've started using swap, we
2907 * can't change our mind on whether
2908 * it needs to be encrypted or
2909 * not.
2910 */
2911 dp_encryption_inited = TRUE;
2912 }
2913 if (dp_encryption) {
2914 /*
2915 * ENCRYPTED SWAP:
2916 * request that the UPL be prepared for
2917 * encryption.
2918 */
2919 request_flags |= UPL_ENCRYPT;
2920 flags |= UPL_PAGING_ENCRYPTED;
2921 }
2922
2923 page_list_count = 0;
2924 memory_object_super_upl_request(vs->vs_control,
2925 (memory_object_offset_t)offset,
2926 cnt, super_size,
2927 &upl, NULL, &page_list_count,
2928 request_flags | UPL_FOR_PAGEOUT);
2929
2930 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2931
2932 seg_size = cl_size - (upl->offset % cl_size);
2933 upl_offset = upl->offset & ~(cl_size - 1);
2934
2935 for (seg_index = 0, transfer_size = upl->size;
2936 transfer_size > 0; ) {
2937 ps_offset[seg_index] =
2938 ps_clmap(vs,
2939 upl_offset,
2940 &clmap, CL_ALLOC,
2941 cl_size, 0);
2942
2943 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2944 upl_abort(upl, 0);
2945 upl_deallocate(upl);
2946
2947 return KERN_FAILURE;
2948
2949 }
2950 psp[seg_index] = CLMAP_PS(clmap);
2951
2952 if (transfer_size > seg_size) {
2953 transfer_size -= seg_size;
2954 upl_offset += cl_size;
2955 seg_size = cl_size;
2956 seg_index++;
2957 } else
2958 transfer_size = 0;
2959 }
2960 /*
2961 * Ignore any non-present pages at the end of the
2962 * UPL.
2963 */
2964 for (page_index = upl->size / vm_page_size; page_index > 0;)
2965 if (UPL_PAGE_PRESENT(pl, --page_index))
2966 break;
2967 num_of_pages = page_index + 1;
2968
2969 base_index = (upl->offset % cl_size) / PAGE_SIZE;
2970
2971 for (page_index = 0; page_index < num_of_pages; ) {
2972 /*
2973 * skip over non-dirty pages
2974 */
2975 for ( ; page_index < num_of_pages; page_index++) {
2976 if (UPL_DIRTY_PAGE(pl, page_index)
2977 || UPL_PRECIOUS_PAGE(pl, page_index))
2978 /*
2979 * this is a page we need to write
2980 * go see if we can buddy it up with
2981 * others that are contiguous to it
2982 */
2983 break;
2984 /*
2985 * if the page is not dirty, but present, we
2986 * need to commit it... This is an unusual
2987 * case since we only asked for dirty pages
2988 */
2989 if (UPL_PAGE_PRESENT(pl, page_index)) {
2990 boolean_t empty = FALSE;
2991 upl_commit_range(upl,
2992 page_index * vm_page_size,
2993 vm_page_size,
2994 UPL_COMMIT_NOTIFY_EMPTY,
2995 pl,
2996 page_list_count,
2997 &empty);
2998 if (empty) {
2999 assert(page_index ==
3000 num_of_pages - 1);
3001 upl_deallocate(upl);
3002 }
3003 }
3004 }
3005 if (page_index == num_of_pages)
3006 /*
3007 * no more pages to look at, we're out of here
3008 */
3009 break;
3010
3011 /*
3012 * gather up contiguous dirty pages... we have at
3013 * least 1, otherwise we would have bailed above
3014 * make sure that each physical segment that we step
3015 * into is contiguous to the one we're currently in
3016 * if it's not, we have to stop and write what we have
3017 */
3018 for (first_dirty = page_index;
3019 page_index < num_of_pages; ) {
3020 if ( !UPL_DIRTY_PAGE(pl, page_index)
3021 && !UPL_PRECIOUS_PAGE(pl, page_index))
3022 break;
3023 page_index++;
3024 /*
3025 * if we just looked at the last page in the UPL
3026 * we don't need to check for physical segment
3027 * continuity
3028 */
3029 if (page_index < num_of_pages) {
3030 int cur_seg;
3031 int nxt_seg;
3032
3033 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3034 nxt_seg = (base_index + page_index)/pages_in_cl;
3035
3036 if (cur_seg != nxt_seg) {
3037 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3038 /*
3039 * if the segment we're about
3040 * to step into is not
3041 * contiguous to the one we're
3042 * currently in, or it's in a
3043 * different paging file....
3044 * we stop here and generate
3045 * the I/O
3046 */
3047 break;
3048 }
3049 }
3050 }
3051 num_dirty = page_index - first_dirty;
3052
3053 if (num_dirty) {
3054 upl_offset = first_dirty * vm_page_size;
3055 transfer_size = num_dirty * vm_page_size;
3056
3057 while (transfer_size) {
3058
3059 if ((seg_size = cl_size -
3060 ((upl->offset + upl_offset) % cl_size))
3061 > transfer_size)
3062 seg_size = transfer_size;
3063
3064 ps_vs_write_complete(vs,
3065 upl->offset + upl_offset,
3066 seg_size, error);
3067
3068 transfer_size -= seg_size;
3069 upl_offset += seg_size;
3070 }
3071 upl_offset = first_dirty * vm_page_size;
3072 transfer_size = num_dirty * vm_page_size;
3073
3074 seg_index = (base_index + first_dirty) / pages_in_cl;
3075 seg_offset = (upl->offset + upl_offset) % cl_size;
3076
3077 error = ps_write_file(psp[seg_index],
3078 upl, upl_offset,
3079 ps_offset[seg_index]
3080 + seg_offset,
3081 transfer_size, flags);
3082 } else {
3083 boolean_t empty = FALSE;
3084 upl_abort_range(upl,
3085 first_dirty * vm_page_size,
3086 num_dirty * vm_page_size,
3087 UPL_ABORT_NOTIFY_EMPTY,
3088 &empty);
3089 if (empty) {
3090 assert(page_index == num_of_pages);
3091 upl_deallocate(upl);
3092 }
3093 }
3094 }
3095
3096 } else {
3097 assert(cnt <= (vm_page_size << vs->vs_clshift));
3098 list_size = cnt;
3099
3100 page_index = 0;
3101 /* The caller provides a mapped_data which is derived */
3102 /* from a temporary object. The targeted pages are */
3103 /* guaranteed to be set at offset 0 in the mapped_data */
3104 /* The actual offset however must still be derived */
3105 /* from the offset in the vs in question */
3106 mobj_base_addr = offset;
3107 mobj_target_addr = mobj_base_addr;
3108
3109 for (transfer_size = list_size; transfer_size != 0;) {
3110 actual_offset = ps_clmap(vs, mobj_target_addr,
3111 &clmap, CL_ALLOC,
3112 transfer_size < cl_size ?
3113 transfer_size : cl_size, 0);
3114 if(actual_offset == (vm_offset_t) -1) {
3115 error = 1;
3116 break;
3117 }
3118 cnt = MIN(transfer_size,
3119 CLMAP_NPGS(clmap) * vm_page_size);
3120 ps = CLMAP_PS(clmap);
3121 /* Assume that the caller has given us contiguous */
3122 /* pages */
3123 if(cnt) {
3124 ps_vs_write_complete(vs, mobj_target_addr,
3125 cnt, error);
3126 error = ps_write_file(ps, internal_upl,
3127 0, actual_offset,
3128 cnt, flags);
3129 if (error)
3130 break;
3131 }
3132 if (error)
3133 break;
3134 actual_offset += cnt;
3135 mobj_target_addr += cnt;
3136 transfer_size -= cnt;
3137 cnt = 0;
3138
3139 if (error)
3140 break;
3141 }
3142 }
3143 if(error)
3144 return KERN_FAILURE;
3145 else
3146 return KERN_SUCCESS;
3147 }
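/*
 * vs_cluster_write() above carves the outgoing range into cluster-sized
 * pieces: the first piece is whatever remains of the current cluster
 * (cl_size - (offset % cl_size)), and later pieces are whole clusters.
 * A small sketch of that chunking with hypothetical sizes:
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

int
main(void)
{
        /* hypothetical cluster geometry: 4 pages of 4 KB = 16 KB clusters */
        unsigned int cl_size = 4 * 4096;
        unsigned int offset = 0x6000;           /* start 8 KB into a cluster */
        unsigned int remaining = 40960;         /* 40 KB to push out */

        /* first chunk is whatever is left of the current cluster */
        unsigned int seg_size = cl_size - (offset % cl_size);

        while (remaining > 0) {
                if (seg_size > remaining)
                        seg_size = remaining;
                printf("chunk: offset 0x%x, size 0x%x\n", offset, seg_size);
                offset += seg_size;
                remaining -= seg_size;
                seg_size = cl_size;             /* later chunks are full clusters */
        }
        return 0;
}
#endif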
3148
3149 vm_size_t
3150 ps_vstruct_allocated_size(
3151 vstruct_t vs)
3152 {
3153 int num_pages;
3154 struct vs_map *vsmap;
3155 unsigned int i, j, k;
3156
3157 num_pages = 0;
3158 if (vs->vs_indirect) {
3159 /* loop on indirect maps */
3160 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3161 vsmap = vs->vs_imap[i];
3162 if (vsmap == NULL)
3163 continue;
3164 /* loop on clusters in this indirect map */
3165 for (j = 0; j < CLMAP_ENTRIES; j++) {
3166 if (VSM_ISCLR(vsmap[j]) ||
3167 VSM_ISERR(vsmap[j]))
3168 continue;
3169 /* loop on pages in this cluster */
3170 for (k = 0; k < VSCLSIZE(vs); k++) {
3171 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3172 num_pages++;
3173 }
3174 }
3175 }
3176 } else {
3177 vsmap = vs->vs_dmap;
3178 if (vsmap == NULL)
3179 return 0;
3180 /* loop on clusters in the direct map */
3181 for (j = 0; j < CLMAP_ENTRIES; j++) {
3182 if (VSM_ISCLR(vsmap[j]) ||
3183 VSM_ISERR(vsmap[j]))
3184 continue;
3185 /* loop on pages in this cluster */
3186 for (k = 0; k < VSCLSIZE(vs); k++) {
3187 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3188 num_pages++;
3189 }
3190 }
3191 }
3192
3193 return ptoa_32(num_pages);
3194 }
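/*
 * The accounting above is just a population count over the per-cluster
 * bitmaps, one bit per allocated page.  A compact sketch of the same
 * counting; demo_pages_in_bitmap() and the sample bitmaps are hypothetical.
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

static unsigned int
demo_pages_in_bitmap(unsigned int bmap, unsigned int pages_per_cluster)
{
        unsigned int k, n = 0;

        for (k = 0; k < pages_per_cluster; k++)
                if (bmap & (1u << k))
                        n++;
        return n;
}

int
main(void)
{
        /* hypothetical: 3 clusters of 8 pages each */
        unsigned int bmaps[3] = { 0xFF, 0x03, 0x00 };
        unsigned int total = 0, j;

        for (j = 0; j < 3; j++)
                total += demo_pages_in_bitmap(bmaps[j], 8);
        printf("%u pages allocated -> %u bytes\n", total, total * 4096);
        return 0;
}
#endif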
3195
3196 size_t
3197 ps_vstruct_allocated_pages(
3198 vstruct_t vs,
3199 default_pager_page_t *pages,
3200 size_t pages_size)
3201 {
3202 unsigned int num_pages;
3203 struct vs_map *vsmap;
3204 vm_offset_t offset;
3205 unsigned int i, j, k;
3206
3207 num_pages = 0;
3208 offset = 0;
3209 if (vs->vs_indirect) {
3210 /* loop on indirect maps */
3211 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3212 vsmap = vs->vs_imap[i];
3213 if (vsmap == NULL) {
3214 offset += (vm_page_size * CLMAP_ENTRIES *
3215 VSCLSIZE(vs));
3216 continue;
3217 }
3218 /* loop on clusters in this indirect map */
3219 for (j = 0; j < CLMAP_ENTRIES; j++) {
3220 if (VSM_ISCLR(vsmap[j]) ||
3221 VSM_ISERR(vsmap[j])) {
3222 offset += vm_page_size * VSCLSIZE(vs);
3223 continue;
3224 }
3225 /* loop on pages in this cluster */
3226 for (k = 0; k < VSCLSIZE(vs); k++) {
3227 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3228 num_pages++;
3229 if (num_pages < pages_size)
3230 pages++->dpp_offset =
3231 offset;
3232 }
3233 offset += vm_page_size;
3234 }
3235 }
3236 }
3237 } else {
3238 vsmap = vs->vs_dmap;
3239 if (vsmap == NULL)
3240 return 0;
3241 /* loop on clusters in the direct map */
3242 for (j = 0; j < CLMAP_ENTRIES; j++) {
3243 if (VSM_ISCLR(vsmap[j]) ||
3244 VSM_ISERR(vsmap[j])) {
3245 offset += vm_page_size * VSCLSIZE(vs);
3246 continue;
3247 }
3248 /* loop on pages in this cluster */
3249 for (k = 0; k < VSCLSIZE(vs); k++) {
3250 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3251 num_pages++;
3252 if (num_pages < pages_size)
3253 pages++->dpp_offset = offset;
3254 }
3255 offset += vm_page_size;
3256 }
3257 }
3258 }
3259
3260 return num_pages;
3261 }
3262
3263
3264 kern_return_t
3265 ps_vstruct_transfer_from_segment(
3266 vstruct_t vs,
3267 paging_segment_t segment,
3268 upl_t upl)
3269 {
3270 struct vs_map *vsmap;
3271 // struct vs_map old_vsmap;
3272 // struct vs_map new_vsmap;
3273 unsigned int i, j;
3274
3275 VS_LOCK(vs); /* block all work on this vstruct */
3276 /* can't allow the normal multiple write */
3277 /* semantic because writes may conflict */
3278 vs->vs_xfer_pending = TRUE;
3279 vs_wait_for_sync_writers(vs);
3280 vs_start_write(vs);
3281 vs_wait_for_readers(vs);
3282 /* we will unlock the vs to allow other writes while transferring */
3283 /* and will be guaranteed of the persistence of the vs struct */
3284 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3285 /* vs_async_pending */
3286 /* OK we now have guaranteed no other parties are accessing this */
3287 /* vs. Now that we are also supporting simple lock versions of */
3288 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3289 /* our purpose in holding it before was the multiple write case */
3290 /* we now use the boolean xfer_pending to do that. We can use */
3291 /* a boolean instead of a count because we have guaranteed single */
3292 /* file access to this code in its caller */
3293 VS_UNLOCK(vs);
3294 vs_changed:
3295 if (vs->vs_indirect) {
3296 unsigned int vsmap_size;
3297 int clmap_off;
3298 /* loop on indirect maps */
3299 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3300 vsmap = vs->vs_imap[i];
3301 if (vsmap == NULL)
3302 continue;
3303 /* loop on clusters in this indirect map */
3304 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3305 VSCLSIZE(vs) * i);
3306 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3307 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3308 else
3309 vsmap_size = CLMAP_ENTRIES;
3310 for (j = 0; j < vsmap_size; j++) {
3311 if (VSM_ISCLR(vsmap[j]) ||
3312 VSM_ISERR(vsmap[j]) ||
3313 (VSM_PS(vsmap[j]) != segment))
3314 continue;
3315 if(vs_cluster_transfer(vs,
3316 (vm_page_size * (j << vs->vs_clshift))
3317 + clmap_off,
3318 vm_page_size << vs->vs_clshift,
3319 upl)
3320 != KERN_SUCCESS) {
3321 VS_LOCK(vs);
3322 vs->vs_xfer_pending = FALSE;
3323 VS_UNLOCK(vs);
3324 vs_finish_write(vs);
3325 return KERN_FAILURE;
3326 }
3327 /* allow other readers/writers during transfer*/
3328 VS_LOCK(vs);
3329 vs->vs_xfer_pending = FALSE;
3330 VS_UNLOCK(vs);
3331 vs_finish_write(vs);
3332 VS_LOCK(vs);
3333 vs->vs_xfer_pending = TRUE;
3334 vs_wait_for_sync_writers(vs);
3335 vs_start_write(vs);
3336 vs_wait_for_readers(vs);
3337 VS_UNLOCK(vs);
3338 if (!(vs->vs_indirect)) {
3339 goto vs_changed;
3340 }
3341 }
3342 }
3343 } else {
3344 vsmap = vs->vs_dmap;
3345 if (vsmap == NULL) {
3346 VS_LOCK(vs);
3347 vs->vs_xfer_pending = FALSE;
3348 VS_UNLOCK(vs);
3349 vs_finish_write(vs);
3350 return KERN_SUCCESS;
3351 }
3352 /* loop on clusters in the direct map */
3353 for (j = 0; j < vs->vs_size; j++) {
3354 if (VSM_ISCLR(vsmap[j]) ||
3355 VSM_ISERR(vsmap[j]) ||
3356 (VSM_PS(vsmap[j]) != segment))
3357 continue;
3358 if(vs_cluster_transfer(vs,
3359 vm_page_size * (j << vs->vs_clshift),
3360 vm_page_size << vs->vs_clshift,
3361 upl) != KERN_SUCCESS) {
3362 VS_LOCK(vs);
3363 vs->vs_xfer_pending = FALSE;
3364 VS_UNLOCK(vs);
3365 vs_finish_write(vs);
3366 return KERN_FAILURE;
3367 }
3368 /* allow other readers/writers during transfer*/
3369 VS_LOCK(vs);
3370 vs->vs_xfer_pending = FALSE;
3371 VS_UNLOCK(vs);
3372 vs_finish_write(vs);
3373 VS_LOCK(vs);
3374 vs->vs_xfer_pending = TRUE;
3375 VS_UNLOCK(vs);
3376 vs_wait_for_sync_writers(vs);
3377 vs_start_write(vs);
3378 vs_wait_for_readers(vs);
3379 if (vs->vs_indirect) {
3380 goto vs_changed;
3381 }
3382 }
3383 }
3384
3385 VS_LOCK(vs);
3386 vs->vs_xfer_pending = FALSE;
3387 VS_UNLOCK(vs);
3388 vs_finish_write(vs);
3389 return KERN_SUCCESS;
3390 }
3391
3392
3393
3394 vs_map_t
3395 vs_get_map_entry(
3396 vstruct_t vs,
3397 vm_offset_t offset)
3398 {
3399 struct vs_map *vsmap;
3400 vm_offset_t cluster;
3401
3402 cluster = atop_32(offset) >> vs->vs_clshift;
3403 if (vs->vs_indirect) {
3404 long ind_block = cluster/CLMAP_ENTRIES;
3405
3406 /* Is the indirect block allocated? */
3407 vsmap = vs->vs_imap[ind_block];
3408 if(vsmap == (vs_map_t) NULL)
3409 return vsmap;
3410 } else
3411 vsmap = vs->vs_dmap;
3412 vsmap += cluster%CLMAP_ENTRIES;
3413 return vsmap;
3414 }
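/*
 * vs_get_map_entry() above is a two-level table lookup: the cluster number
 * selects an indirect block (which may be unallocated) and then an entry
 * within it.  A minimal sketch of that indexing; struct demo_entry and the
 * DEMO_* geometry are hypothetical, not the pager's types.
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>
#include <stddef.h>

#define DEMO_PAGE_SHIFT 12u
#define DEMO_CL_SHIFT   2u
#define DEMO_ENTRIES    128u    /* clusters per indirect block (CLMAP_ENTRIES role) */

struct demo_entry { unsigned int bits; };

static struct demo_entry *
demo_lookup(struct demo_entry **imap, unsigned long long offset)
{
        unsigned long long cluster =
            (offset >> DEMO_PAGE_SHIFT) >> DEMO_CL_SHIFT;
        struct demo_entry *block = imap[cluster / DEMO_ENTRIES];

        if (block == NULL)              /* indirect block never allocated */
                return NULL;
        return &block[cluster % DEMO_ENTRIES];
}

int
main(void)
{
        struct demo_entry block0[DEMO_ENTRIES] = { { 0 } };
        struct demo_entry *imap[2] = { block0, NULL };
        struct demo_entry *e = demo_lookup(imap, 0x5000ULL);

        printf("cluster entry %s\n", e ? "found" : "not mapped");
        return 0;
}
#endif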
3415
3416 kern_return_t
3417 vs_cluster_transfer(
3418 vstruct_t vs,
3419 vm_offset_t offset,
3420 vm_size_t cnt,
3421 upl_t upl)
3422 {
3423 vm_offset_t actual_offset;
3424 paging_segment_t ps;
3425 struct clmap clmap;
3426 kern_return_t error = KERN_SUCCESS;
3427 unsigned int size, size_wanted;
3428 int i;
3429 unsigned int residual;
3430 unsigned int unavail_size;
3431 // default_pager_thread_t *dpt;
3432 // boolean_t dealloc;
3433 struct vs_map *vsmap_ptr = NULL;
3434 struct vs_map read_vsmap;
3435 struct vs_map original_read_vsmap;
3436 struct vs_map write_vsmap;
3437 // upl_t sync_upl;
3438 // vm_offset_t ioaddr;
3439
3440 /* vs_cluster_transfer reads in the pages of a cluster and
3441 * then writes these pages back to new backing store. The
3442 * segment the pages are being read from is assumed to have
3443 * been taken off-line and is no longer considered for new
3444 * space requests.
3445 */
3446
3447 /*
3448 * This loop will be executed once per cluster referenced.
3449 * Typically this means once, since it's unlikely that the
3450 * VM system will ask for anything spanning cluster boundaries.
3451 *
3452 * If there are holes in a cluster (in a paging segment), we stop
3453 * reading at the hole, then loop again, hoping to
3454 * find valid pages later in the cluster. This continues until
3455 * the entire range has been examined, and read, if present. The
3456 * pages are written as they are read. If a failure occurs after
3457 * some pages are written, the unmap call at the bottom of the loop
3458 * recovers the backing store and the old backing store remains
3459 * in effect.
3460 */
3461
3462 VSM_CLR(write_vsmap);
3463 VSM_CLR(original_read_vsmap);
3464 /* grab the actual object's pages to sync with I/O */
3465 while (cnt && (error == KERN_SUCCESS)) {
3466 vsmap_ptr = vs_get_map_entry(vs, offset);
3467 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3468
3469 if (actual_offset == (vm_offset_t) -1) {
3470
3471 /*
3472 * Nothing left to write in this cluster at least
3473 * set write cluster information for any previous
3474 * write, clear for next cluster, if there is one
3475 */
3476 unsigned int local_size, clmask, clsize;
3477
3478 clsize = vm_page_size << vs->vs_clshift;
3479 clmask = clsize - 1;
3480 local_size = clsize - (offset & clmask);
3481 ASSERT(local_size);
3482 local_size = MIN(local_size, cnt);
3483
3484 /* This cluster has no data in it beyond what may */
3485 /* have been found on a previous iteration through */
3486 /* the loop "write_vsmap" */
3487 *vsmap_ptr = write_vsmap;
3488 VSM_CLR(write_vsmap);
3489 VSM_CLR(original_read_vsmap);
3490
3491 cnt -= local_size;
3492 offset += local_size;
3493 continue;
3494 }
3495
3496 /*
3497 * Count up contiguous available or unavailable
3498 * pages.
3499 */
3500 ps = CLMAP_PS(clmap);
3501 ASSERT(ps);
3502 size = 0;
3503 unavail_size = 0;
3504 for (i = 0;
3505 (size < cnt) && (unavail_size < cnt) &&
3506 (i < CLMAP_NPGS(clmap)); i++) {
3507 if (CLMAP_ISSET(clmap, i)) {
3508 if (unavail_size != 0)
3509 break;
3510 size += vm_page_size;
3511 BS_STAT(ps->ps_bs,
3512 ps->ps_bs->bs_pages_in++);
3513 } else {
3514 if (size != 0)
3515 break;
3516 unavail_size += vm_page_size;
3517 }
3518 }
3519
3520 if (size == 0) {
3521 ASSERT(unavail_size);
3522 cnt -= unavail_size;
3523 offset += unavail_size;
3524 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3525 == 0) {
3526 /* There is no more to transfer in this
3527 cluster
3528 */
3529 *vsmap_ptr = write_vsmap;
3530 VSM_CLR(write_vsmap);
3531 VSM_CLR(original_read_vsmap);
3532 }
3533 continue;
3534 }
3535
3536 if(VSM_ISCLR(original_read_vsmap))
3537 original_read_vsmap = *vsmap_ptr;
3538
3539 if(ps->ps_segtype == PS_PARTITION) {
3540 /*
3541 NEED TO ISSUE WITH SYNC & NO COMMIT
3542 error = ps_read_device(ps, actual_offset, &buffer,
3543 size, &residual, flags);
3544 */
3545 } else {
3546 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3547 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3548 size, &residual,
3549 (UPL_IOSYNC | UPL_NOCOMMIT));
3550 }
3551
3552 read_vsmap = *vsmap_ptr;
3553
3554
3555 /*
3556 * Adjust counts and put data in new BS. Optimize for the
3557 * common case, i.e. no error and/or partial data.
3558 * If there was an error, then we need to error the entire
3559 * range, even if some data was successfully read.
3560 *
3561 */
3562 if ((error == KERN_SUCCESS) && (residual == 0)) {
3563
3564 /*
3565 * Got everything we asked for, supply the data to
3566 * the new BS. Note that as a side effect of supplying
3567 * the data, the buffer holding the supplied data is
3568 * deallocated from the pager's address space unless
3569 * the write is unsuccessful.
3570 */
3571
3572 /* note buffer will be cleaned up in all cases by */
3573 /* internal_cluster_write or if an error on write */
3574 /* the vm_map_copy_page_discard call */
3575 *vsmap_ptr = write_vsmap;
3576
3577 if(vs_cluster_write(vs, upl, offset,
3578 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3579 error = KERN_FAILURE;
3580 if(!(VSM_ISCLR(*vsmap_ptr))) {
3581 /* unmap the new backing store object */
3582 ps_clunmap(vs, offset, size);
3583 }
3584 /* original vsmap */
3585 *vsmap_ptr = original_read_vsmap;
3586 VSM_CLR(write_vsmap);
3587 } else {
3588 if((offset + size) &
3589 ((vm_page_size << vs->vs_clshift)
3590 - 1)) {
3591 /* There is more to transfer in this
3592 cluster
3593 */
3594 write_vsmap = *vsmap_ptr;
3595 *vsmap_ptr = read_vsmap;
3596 } else {
3597 /* discard the old backing object */
3598 write_vsmap = *vsmap_ptr;
3599 *vsmap_ptr = read_vsmap;
3600 ps_clunmap(vs, offset, size);
3601 *vsmap_ptr = write_vsmap;
3602 VSM_CLR(write_vsmap);
3603 VSM_CLR(original_read_vsmap);
3604 }
3605 }
3606 } else {
3607 size_wanted = size;
3608 if (error == KERN_SUCCESS) {
3609 if (residual == size) {
3610 /*
3611 * If a read operation returns no error
3612 * and no data moved, we turn it into
3613 * an error, assuming we're reading at
3614 * or beyond EOF.
3615 * Fall through and error the entire
3616 * range.
3617 */
3618 error = KERN_FAILURE;
3619 *vsmap_ptr = write_vsmap;
3620 if(!(VSM_ISCLR(*vsmap_ptr))) {
3621 /* unmap the new backing store object */
3622 ps_clunmap(vs, offset, size);
3623 }
3624 *vsmap_ptr = original_read_vsmap;
3625 VSM_CLR(write_vsmap);
3626 continue;
3627 } else {
3628 /*
3629 * Otherwise, we have partial read.
3630 * This is also considered an error
3631 * for the purposes of cluster transfer
3632 */
3633 error = KERN_FAILURE;
3634 *vsmap_ptr = write_vsmap;
3635 if(!(VSM_ISCLR(*vsmap_ptr))) {
3636 /* unmap the new backing store object */
3637 ps_clunmap(vs, offset, size);
3638 }
3639 *vsmap_ptr = original_read_vsmap;
3640 VSM_CLR(write_vsmap);
3641 continue;
3642 }
3643 }
3644
3645 }
3646 cnt -= size;
3647 offset += size;
3648
3649 } /* END while (cnt && (error == 0)) */
3650 if(!VSM_ISCLR(write_vsmap))
3651 *vsmap_ptr = write_vsmap;
3652
3653 return error;
3654 }
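/*
 * The transfer loop above keeps the original map entry around so that a
 * failed write to the new backing store leaves the old cluster mapping in
 * effect.  A very small sketch of that save/attempt/restore shape;
 * struct demo_map and demo_migrate_cluster() are hypothetical.
 */
#if 0 /* illustration only -- never compiled */
#include <stdbool.h>

struct demo_map { unsigned int ps; unsigned int cloff; unsigned int bits; };

static bool
demo_migrate_cluster(struct demo_map *entry,
    bool (*write_new_copy)(const struct demo_map *))
{
        struct demo_map original = *entry;      /* original_read_vsmap role */

        if (!write_new_copy(entry)) {
                *entry = original;              /* failure: old store stays live */
                return false;
        }
        return true;                            /* success: old cluster can be freed */
}
#endif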
3655
3656 kern_return_t
3657 default_pager_add_file(
3658 MACH_PORT_FACE backing_store,
3659 vnode_ptr_t vp,
3660 int record_size,
3661 vm_size_t size)
3662 {
3663 backing_store_t bs;
3664 paging_segment_t ps;
3665 int i;
3666 unsigned int j;
3667 int error;
3668
3669 if ((bs = backing_store_lookup(backing_store))
3670 == BACKING_STORE_NULL)
3671 return KERN_INVALID_ARGUMENT;
3672
3673 PSL_LOCK();
3674 for (i = 0; i <= paging_segment_max; i++) {
3675 ps = paging_segments[i];
3676 if (ps == PAGING_SEGMENT_NULL)
3677 continue;
3678 if (ps->ps_segtype != PS_FILE)
3679 continue;
3680
3681 /*
3682 * Check for overlap on same device.
3683 */
3684 if (ps->ps_vnode == (struct vnode *)vp) {
3685 PSL_UNLOCK();
3686 BS_UNLOCK(bs);
3687 return KERN_INVALID_ARGUMENT;
3688 }
3689 }
3690 PSL_UNLOCK();
3691
3692 /*
3693 * Set up the paging segment
3694 */
3695 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3696 if (ps == PAGING_SEGMENT_NULL) {
3697 BS_UNLOCK(bs);
3698 return KERN_RESOURCE_SHORTAGE;
3699 }
3700
3701 ps->ps_segtype = PS_FILE;
3702 ps->ps_vnode = (struct vnode *)vp;
3703 ps->ps_offset = 0;
3704 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3705 ps->ps_recnum = size;
3706 ps->ps_pgnum = size >> ps->ps_record_shift;
3707
3708 ps->ps_pgcount = ps->ps_pgnum;
3709 ps->ps_clshift = local_log2(bs->bs_clsize);
3710 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3711 ps->ps_hint = 0;
3712
3713 PS_LOCK_INIT(ps);
3714 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3715 if (!ps->ps_bmap) {
3716 kfree(ps, sizeof *ps);
3717 BS_UNLOCK(bs);
3718 return KERN_RESOURCE_SHORTAGE;
3719 }
3720 for (j = 0; j < ps->ps_ncls; j++) {
3721 clrbit(ps->ps_bmap, j);
3722 }
3723
3724 ps->ps_going_away = FALSE;
3725 ps->ps_bs = bs;
3726
3727 if ((error = ps_enter(ps)) != 0) {
3728 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3729 kfree(ps, sizeof *ps);
3730 BS_UNLOCK(bs);
3731 return KERN_RESOURCE_SHORTAGE;
3732 }
3733
3734 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3735 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3736 PSL_LOCK();
3737 dp_pages_free += ps->ps_pgcount;
3738 PSL_UNLOCK();
3739
3740 BS_UNLOCK(bs);
3741
3742 bs_more_space(ps->ps_clcount);
3743
3744 DP_DEBUG(DEBUG_BS_INTERNAL,
3745 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3746 device, offset, size, record_size,
3747 ps->ps_record_shift, ps->ps_pgnum));
3748
3749 return KERN_SUCCESS;
3750 }
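/*
 * default_pager_add_file() above derives the paging-segment geometry from
 * the record size and the cluster size: a record shift, a page count, and
 * a cluster count.  A small sketch of the same derivation for a
 * hypothetical 1 GB swap file; demo_log2() plays the role of local_log2().
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

static unsigned int
demo_log2(unsigned int n)
{
        unsigned int s = 0;

        while ((1u << s) < n)
                s++;
        return s;
}

int
main(void)
{
        /* hypothetical: 1 GB of 512-byte records, 4 KB pages, 8-page clusters */
        unsigned int record_size = 512;
        unsigned int page_size = 4096;
        unsigned int recnum = (1u << 30) / record_size;

        unsigned int record_shift = demo_log2(page_size / record_size);
        unsigned int pgnum = recnum >> record_shift;    /* pages in the segment */
        unsigned int clshift = demo_log2(8);
        unsigned int clcount = pgnum >> clshift;        /* clusters in the segment */

        printf("records=%u shift=%u pages=%u clusters=%u\n",
            recnum, record_shift, pgnum, clcount);
        return 0;
}
#endif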
3751
3752
3753
3754 kern_return_t
3755 ps_read_file(
3756 paging_segment_t ps,
3757 upl_t upl,
3758 upl_offset_t upl_offset,
3759 vm_offset_t offset,
3760 upl_size_t size,
3761 unsigned int *residualp,
3762 int flags)
3763 {
3764 vm_object_offset_t f_offset;
3765 int error = 0;
3766 int result;
3767
3768 assert(dp_encryption_inited);
3769
3770 clustered_reads[atop_32(size)]++;
3771
3772 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3773
3774 /* for transfer case we need to pass uploffset and flags */
3775 error = vnode_pagein(ps->ps_vnode,
3776 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3777
3778 /* The vnode_pagein semantic is somewhat at odds with the existing */
3779 /* device_read semantic. Partial reads are not experienced at this */
3780 /* level. It is up to the bit map code and cluster read code to */
3781 /* check that requested data locations are actually backed, and the */
3782 /* pagein code to either read all of the requested data or return an */
3783 /* error. */
3784
3785 if (error)
3786 result = KERN_FAILURE;
3787 else {
3788 *residualp = 0;
3789 result = KERN_SUCCESS;
3790 }
3791 return result;
3792 }
3793
3794 kern_return_t
3795 ps_write_file(
3796 paging_segment_t ps,
3797 upl_t upl,
3798 upl_offset_t upl_offset,
3799 vm_offset_t offset,
3800 unsigned int size,
3801 int flags)
3802 {
3803 vm_object_offset_t f_offset;
3804 kern_return_t result;
3805
3806 assert(dp_encryption_inited);
3807
3808 clustered_writes[atop_32(size)]++;
3809 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3810
3811 if (flags & UPL_PAGING_ENCRYPTED) {
3812 /*
3813 * ENCRYPTED SWAP:
3814 * encrypt all the pages that we're going
3815 * to pageout.
3816 */
3817 upl_encrypt(upl, upl_offset, size);
3818 }
3819
3820 if (vnode_pageout(ps->ps_vnode,
3821 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3822 result = KERN_FAILURE;
3823 else
3824 result = KERN_SUCCESS;
3825
3826 return result;
3827 }
3828
3829 kern_return_t
3830 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
3831 int hi_wat,
3832 int lo_wat,
3833 int flags,
3834 MACH_PORT_FACE trigger_port)
3835 {
3836 MACH_PORT_FACE release;
3837 kern_return_t kr;
3838
3839 PSL_LOCK();
3840 if (flags == SWAP_ENCRYPT_ON) {
3841 /* ENCRYPTED SWAP: turn encryption on */
3842 release = trigger_port;
3843 if (!dp_encryption_inited) {
3844 dp_encryption_inited = TRUE;
3845 dp_encryption = TRUE;
3846 kr = KERN_SUCCESS;
3847 } else {
3848 kr = KERN_FAILURE;
3849 }
3850 } else if (flags == SWAP_ENCRYPT_OFF) {
3851 /* ENCRYPTED SWAP: turn encryption off */
3852 release = trigger_port;
3853 if (!dp_encryption_inited) {
3854 dp_encryption_inited = TRUE;
3855 dp_encryption = FALSE;
3856 kr = KERN_SUCCESS;
3857 } else {
3858 kr = KERN_FAILURE;
3859 }
3860 } else if (flags == HI_WAT_ALERT) {
3861 release = min_pages_trigger_port;
3862 min_pages_trigger_port = trigger_port;
3863 minimum_pages_remaining = hi_wat/vm_page_size;
3864 bs_low = FALSE;
3865 kr = KERN_SUCCESS;
3866 } else if (flags == LO_WAT_ALERT) {
3867 release = max_pages_trigger_port;
3868 max_pages_trigger_port = trigger_port;
3869 maximum_pages_free = lo_wat/vm_page_size;
3870 kr = KERN_SUCCESS;
3871 } else {
3872 release = trigger_port;
3873 kr = KERN_INVALID_ARGUMENT;
3874 }
3875 PSL_UNLOCK();
3876
3877 if (IP_VALID(release))
3878 ipc_port_release_send(release);
3879
3880 return kr;
3881 }
3882
3883 /*
3884 * Monitor the amount of available backing store vs. the amount of
3885 * required backing store, notify a listener (if present) when
3886 * backing store may safely be removed.
3887 *
3888 * We attempt to avoid the situation where backing store is
3889 * discarded en masse, as this can lead to thrashing as the
3890 * backing store is compacted.
3891 */
3892
3893 #define PF_INTERVAL 3 /* time between free level checks */
3894 #define PF_LATENCY 10 /* number of intervals before release */
3895
3896 static int dp_pages_free_low_count = 0;
3897 thread_call_t default_pager_backing_store_monitor_callout;
3898
3899 void
3900 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
3901 __unused thread_call_param_t p2)
3902 {
3903 // unsigned long long average;
3904 ipc_port_t trigger;
3905 uint64_t deadline;
3906
3907 /*
3908 * We determine whether it will be safe to release some
3909 * backing store by watching the free page level. If
3910 * it remains below the maximum_pages_free threshold for
3911 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3912 * then we deem it safe.
3913 *
3914 * Note that this establishes a maximum rate at which backing
3915 * store will be released, as each notification (currently)
3916 * only results in a single backing store object being
3917 * released.
3918 */
3919 if (dp_pages_free > maximum_pages_free) {
3920 dp_pages_free_low_count++;
3921 } else {
3922 dp_pages_free_low_count = 0;
3923 }
3924
3925 /* decide whether to send notification */
3926 trigger = IP_NULL;
3927 if (max_pages_trigger_port &&
3928 (backing_store_release_trigger_disable == 0) &&
3929 (dp_pages_free_low_count > PF_LATENCY)) {
3930 trigger = max_pages_trigger_port;
3931 max_pages_trigger_port = NULL;
3932 }
3933
3934 /* send notification */
3935 if (trigger != IP_NULL) {
3936 VSL_LOCK();
3937 if(backing_store_release_trigger_disable != 0) {
3938 assert_wait((event_t)
3939 &backing_store_release_trigger_disable,
3940 THREAD_UNINT);
3941 VSL_UNLOCK();
3942 thread_block(THREAD_CONTINUE_NULL);
3943 } else {
3944 VSL_UNLOCK();
3945 }
3946 default_pager_space_alert(trigger, LO_WAT_ALERT);
3947 ipc_port_release_send(trigger);
3948 dp_pages_free_low_count = 0;
3949 }
3950
3951 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
3952 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
3953 }
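/*
 * The monitor above applies simple hysteresis: the free-page level must
 * stay above the threshold for more than PF_LATENCY consecutive samples
 * before a LO_WAT_ALERT is sent, and any dip resets the count.  A tiny
 * sketch of that debounce with made-up sample values:
 */
#if 0 /* illustration only -- never compiled */
#include <stdio.h>

#define DEMO_LATENCY 10 /* consecutive good samples required (PF_LATENCY role) */

int
main(void)
{
        unsigned int free_samples[] = { 900, 950, 980, 400, 900, 910, 920, 930,
                                        940, 950, 960, 970, 980, 990, 1000 };
        unsigned int threshold = 800;
        unsigned int good_count = 0;
        unsigned int i;

        for (i = 0; i < sizeof(free_samples) / sizeof(free_samples[0]); i++) {
                if (free_samples[i] > threshold)
                        good_count++;
                else
                        good_count = 0;         /* dip below: start over */
                if (good_count > DEMO_LATENCY) {
                        printf("sample %u: safe to release backing store\n", i);
                        good_count = 0;         /* one notification per episode */
                }
        }
        return 0;
}
#endif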