1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * @OSF_COPYRIGHT@
32 */
33 /*
34 * Mach Operating System
35 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
36 * All Rights Reserved.
37 *
38 * Permission to use, copy, modify and distribute this software and its
39 * documentation is hereby granted, provided that both the copyright
40 * notice and this permission notice appear in all copies of the
41 * software, derivative works or modified versions, and any portions
42 * thereof, and that both notices appear in supporting documentation.
43 *
44 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
46 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 *
48 * Carnegie Mellon requests users of this software to return to
49 *
50 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
51 * School of Computer Science
52 * Carnegie Mellon University
53 * Pittsburgh PA 15213-3890
54 *
55 * any improvements or extensions that they make and grant Carnegie Mellon
56 * the rights to redistribute these changes.
57 */
58
59 /*
60 * Default Pager.
61 * Paging File Management.
62 */
63
64 #include <mach/host_priv.h>
65 #include <mach/memory_object_control.h>
66 #include <mach/memory_object_server.h>
67 #include <mach/upl.h>
68 #include <default_pager/default_pager_internal.h>
69 #include <default_pager/default_pager_alerts.h>
70 #include <default_pager/default_pager_object_server.h>
71
72 #include <ipc/ipc_types.h>
73 #include <ipc/ipc_port.h>
74 #include <ipc/ipc_space.h>
75
76 #include <kern/kern_types.h>
77 #include <kern/host.h>
78 #include <kern/queue.h>
79 #include <kern/counters.h>
80 #include <kern/sched_prim.h>
81
82 #include <vm/vm_kern.h>
83 #include <vm/vm_pageout.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_protos.h>
87
88 /* LP64todo - need large internal object support */
89
90 /*
91 * ALLOC_STRIDE... the maximum number of bytes allocated from
92 * a swap file before moving on to the next swap file... if
93 * all swap files reside on a single disk, this value should
94 * be very large (this is the default assumption)... if the
95 * swap files are spread across multiple disks, then this value
96 * should be small (128 * 1024)...
97 *
98 * This should be determined dynamically in the future
99 */
100
101 #define ALLOC_STRIDE (1024 * 1024 * 1024)
102 int physical_transfer_cluster_count = 0;
103
104 #define VM_SUPER_CLUSTER 0x40000
105 #define VM_SUPER_PAGES 64
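/*
 * Illustrative relationship between the two constants above (a sketch,
 * assuming the common 4 KB page size): 0x40000 bytes / 4096 bytes per
 * page == 64, so one super-cluster transfer covers exactly
 * VM_SUPER_PAGES pages.
 */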
106
107 /*
108 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
109 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
110 */
111 #define VSTRUCT_DEF_CLSHIFT 2
112 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
113 int default_pager_clsize = 0;
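/*
 * Worked example (assuming 4 KB pages): with VSTRUCT_DEF_CLSHIFT == 2
 * each cluster spans 1 << 2 == 4 pages, i.e. 16 KB of backing store;
 * a shift of 0 would give one page per cluster and a shift of 3 would
 * give 32 KB clusters.
 */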
114
115 /* statistics */
116 unsigned int clustered_writes[VM_SUPER_PAGES+1];
117 unsigned int clustered_reads[VM_SUPER_PAGES+1];
118
119 /*
120 * Globals used for asynchronous paging operations:
121 * vs_async_list: head of list of to-be-completed I/O ops
122 * async_num_queued: number of pages completed, but not yet
123 * processed by async thread.
124 * async_requests_out: number of pages of requests not completed.
125 */
126
127 #if 0
128 struct vs_async *vs_async_list;
129 int async_num_queued;
130 int async_requests_out;
131 #endif
132
133
134 #define VS_ASYNC_REUSE 1
135 struct vs_async *vs_async_free_list;
136
137 mutex_t default_pager_async_lock; /* Protects globals above */
138
139
140 int vs_alloc_async_failed = 0; /* statistics */
141 int vs_alloc_async_count = 0; /* statistics */
142 struct vs_async *vs_alloc_async(void); /* forward */
143 void vs_free_async(struct vs_async *vsa); /* forward */
144
145
146 #define VS_ALLOC_ASYNC() vs_alloc_async()
147 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
148
149 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
150 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
151 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, 0)
152 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
153 /*
154 * Paging Space Hysteresis triggers and the target notification port
155 *
156 */
157
158 unsigned int minimum_pages_remaining = 0;
159 unsigned int maximum_pages_free = 0;
160 ipc_port_t min_pages_trigger_port = NULL;
161 ipc_port_t max_pages_trigger_port = NULL;
162
163 boolean_t bs_low = FALSE;
164 int backing_store_release_trigger_disable = 0;
165
166
167 /* Have we decided if swap needs to be encrypted yet ? */
168 boolean_t dp_encryption_inited = FALSE;
169 /* Should we encrypt swap ? */
170 boolean_t dp_encryption = FALSE;
171
172
173 /*
174 * Object sizes are rounded up to the next power of 2,
175 * unless they are bigger than a given maximum size.
176 */
177 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
178
179 /*
180 * List of all backing store and segments.
181 */
182 struct backing_store_list_head backing_store_list;
183 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
184 mutex_t paging_segments_lock;
185 int paging_segment_max = 0;
186 int paging_segment_count = 0;
187 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
188
189
190 /*
191 * Total pages free in system
192 * This differs from clusters committed/avail, which is a measure of the
193 * over-commitment of paging segments to backing store, an idea which is
194 * likely to be deprecated.
195 */
196 unsigned int dp_pages_free = 0;
197 unsigned int cluster_transfer_minimum = 100;
198
199 /* forward declarations */
200 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int); /* forward */
201 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
202 default_pager_thread_t *get_read_buffer( void );
203 kern_return_t ps_vstruct_transfer_from_segment(
204 vstruct_t vs,
205 paging_segment_t segment,
206 upl_t upl);
207 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
208 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
209 kern_return_t vs_cluster_transfer(
210 vstruct_t vs,
211 upl_offset_t offset,
212 upl_size_t cnt,
213 upl_t upl);
214 vs_map_t vs_get_map_entry(
215 vstruct_t vs,
216 vm_offset_t offset);
217
218
219 default_pager_thread_t *
220 get_read_buffer( void )
221 {
222 int i;
223
224 DPT_LOCK(dpt_lock);
225 while(TRUE) {
226 for (i=0; i<default_pager_internal_count; i++) {
227 if(dpt_array[i]->checked_out == FALSE) {
228 dpt_array[i]->checked_out = TRUE;
229 DPT_UNLOCK(dpt_lock);
230 return dpt_array[i];
231 }
232 }
233 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
234 }
235 }
236
237 void
238 bs_initialize(void)
239 {
240 int i;
241
242 /*
243 * List of all backing store.
244 */
245 BSL_LOCK_INIT();
246 queue_init(&backing_store_list.bsl_queue);
247 PSL_LOCK_INIT();
248
249 VS_ASYNC_LOCK_INIT();
250 #if VS_ASYNC_REUSE
251 vs_async_free_list = NULL;
252 #endif /* VS_ASYNC_REUSE */
253
254 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
255 clustered_writes[i] = 0;
256 clustered_reads[i] = 0;
257 }
258
259 }
260
261 /*
262 * When things do not quite work out...
263 */
264 void bs_no_paging_space(boolean_t); /* forward */
265
266 void
267 bs_no_paging_space(
268 boolean_t out_of_memory)
269 {
270
271 if (out_of_memory)
272 dprintf(("*** OUT OF MEMORY ***\n"));
273 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
274 }
275
276 void bs_more_space(int); /* forward */
277 void bs_commit(int); /* forward */
278
279 boolean_t user_warned = FALSE;
280 unsigned int clusters_committed = 0;
281 unsigned int clusters_available = 0;
282 unsigned int clusters_committed_peak = 0;
283
284 void
285 bs_more_space(
286 int nclusters)
287 {
288 BSL_LOCK();
289 /*
290 * Account for new paging space.
291 */
292 clusters_available += nclusters;
293
294 if (clusters_available >= clusters_committed) {
295 if (verbose && user_warned) {
296 printf("%s%s - %d excess clusters now.\n",
297 my_name,
298 "paging space is OK now",
299 clusters_available - clusters_committed);
300 user_warned = FALSE;
301 clusters_committed_peak = 0;
302 }
303 } else {
304 if (verbose && user_warned) {
305 printf("%s%s - still short of %d clusters.\n",
306 my_name,
307 "WARNING: paging space over-committed",
308 clusters_committed - clusters_available);
309 clusters_committed_peak -= nclusters;
310 }
311 }
312 BSL_UNLOCK();
313
314 return;
315 }
316
317 void
318 bs_commit(
319 int nclusters)
320 {
321 BSL_LOCK();
322 clusters_committed += nclusters;
323 if (clusters_committed > clusters_available) {
324 if (verbose && !user_warned) {
325 user_warned = TRUE;
326 printf("%s%s - short of %d clusters.\n",
327 my_name,
328 "WARNING: paging space over-committed",
329 clusters_committed - clusters_available);
330 }
331 if (clusters_committed > clusters_committed_peak) {
332 clusters_committed_peak = clusters_committed;
333 }
334 } else {
335 if (verbose && user_warned) {
336 printf("%s%s - was short of up to %d clusters.\n",
337 my_name,
338 "paging space is OK now",
339 clusters_committed_peak - clusters_available);
340 user_warned = FALSE;
341 clusters_committed_peak = 0;
342 }
343 }
344 BSL_UNLOCK();
345
346 return;
347 }
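/*
 * Worked example of the hysteresis above: with clusters_available == 100,
 * bs_commit(150) leaves clusters_committed == 150 > 100, so (when verbose)
 * the warning prints "short of 50 clusters" once and the peak is recorded;
 * a later bs_more_space(60) raises clusters_available to 160 >= 150, which
 * clears user_warned and resets clusters_committed_peak to 0.
 */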
348
349 int default_pager_info_verbose = 1;
350
351 void
352 bs_global_info(
353 vm_size_t *totalp,
354 vm_size_t *freep)
355 {
356 vm_size_t pages_total, pages_free;
357 paging_segment_t ps;
358 int i;
359
360 PSL_LOCK();
361 pages_total = pages_free = 0;
362 for (i = 0; i <= paging_segment_max; i++) {
363 ps = paging_segments[i];
364 if (ps == PAGING_SEGMENT_NULL)
365 continue;
366
367 /*
368 * no need to lock: by the time this data
369 * gets back to any remote requestor it
370 * will be obsolete anyways
371 */
372 pages_total += ps->ps_pgnum;
373 pages_free += ps->ps_clcount << ps->ps_clshift;
374 DP_DEBUG(DEBUG_BS_INTERNAL,
375 ("segment #%d: %d total, %d free\n",
376 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
377 }
378 *totalp = pages_total;
379 *freep = pages_free;
380 if (verbose && user_warned && default_pager_info_verbose) {
381 if (clusters_available < clusters_committed) {
382 printf("%s %d clusters committed, %d available.\n",
383 my_name,
384 clusters_committed,
385 clusters_available);
386 }
387 }
388 PSL_UNLOCK();
389 }
390
391 backing_store_t backing_store_alloc(void); /* forward */
392
393 backing_store_t
394 backing_store_alloc(void)
395 {
396 backing_store_t bs;
397
398 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
399 if (bs == BACKING_STORE_NULL)
400 panic("backing_store_alloc: no memory");
401
402 BS_LOCK_INIT(bs);
403 bs->bs_port = MACH_PORT_NULL;
404 bs->bs_priority = 0;
405 bs->bs_clsize = 0;
406 bs->bs_pages_total = 0;
407 bs->bs_pages_in = 0;
408 bs->bs_pages_in_fail = 0;
409 bs->bs_pages_out = 0;
410 bs->bs_pages_out_fail = 0;
411
412 return bs;
413 }
414
415 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
416
417 /* Even in both the component space and external versions of this pager, */
418 /* backing_store_lookup will be called from tasks in the application space */
419 backing_store_t
420 backing_store_lookup(
421 MACH_PORT_FACE port)
422 {
423 backing_store_t bs;
424
425 /*
426 port is currently backed with a vs structure in the alias field;
427 we could create an ISBS alias and a port_is_bs call, but frankly
428 I see no reason for the test: the bs->port == port check below
429 will work properly on junk entries.
430
431 if ((port == MACH_PORT_NULL) || port_is_vs(port))
432 */
433 if ((port == MACH_PORT_NULL))
434 return BACKING_STORE_NULL;
435
436 BSL_LOCK();
437 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
438 bs_links) {
439 BS_LOCK(bs);
440 if (bs->bs_port == port) {
441 BSL_UNLOCK();
442 /* Success, return it locked. */
443 return bs;
444 }
445 BS_UNLOCK(bs);
446 }
447 BSL_UNLOCK();
448 return BACKING_STORE_NULL;
449 }
450
451 void backing_store_add(backing_store_t); /* forward */
452
453 void
454 backing_store_add(
455 __unused backing_store_t bs)
456 {
457 // MACH_PORT_FACE port = bs->bs_port;
458 // MACH_PORT_FACE pset = default_pager_default_set;
459 kern_return_t kr = KERN_SUCCESS;
460
461 if (kr != KERN_SUCCESS)
462 panic("backing_store_add: add to set");
463
464 }
465
466 /*
467 * Set up default page shift, but only if not already
468 * set and argument is within range.
469 */
470 boolean_t
471 bs_set_default_clsize(unsigned int npages)
472 {
473 switch(npages){
474 case 1:
475 case 2:
476 case 4:
477 case 8:
478 if (default_pager_clsize == 0) /* if not yet set */
479 vstruct_def_clshift = local_log2(npages);
480 return(TRUE);
481 }
482 return(FALSE);
483 }
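/*
 * Usage sketch (hypothetical caller, not part of this file): a boot-time
 * argument handler could validate a requested pages-per-cluster value,
 * e.g.
 *
 *	if (!bs_set_default_clsize(requested_npages))	// only 1, 2, 4 or 8
 *		printf("%skeeping cluster shift %d\n",
 *		       my_name, vstruct_def_clshift);
 *
 * where "requested_npages" is a made-up name for the parsed boot value.
 */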
484
485 int bs_get_global_clsize(int clsize); /* forward */
486
487 int
488 bs_get_global_clsize(
489 int clsize)
490 {
491 int i;
492 memory_object_default_t dmm;
493 kern_return_t kr;
494
495 /*
496 * Only allow setting of cluster size once. If called
497 * with no cluster size (default), we use the compiled-in default
498 * for the duration. The same cluster size is used for all
499 * paging segments.
500 */
501 if (default_pager_clsize == 0) {
502 /*
503 * Keep cluster size in bit shift because it's quicker
504 * arithmetic, and easier to keep at a power of 2.
505 */
506 if (clsize != NO_CLSIZE) {
507 for (i = 0; (1 << i) < clsize; i++);
508 if (i > MAX_CLUSTER_SHIFT)
509 i = MAX_CLUSTER_SHIFT;
510 vstruct_def_clshift = i;
511 }
512 default_pager_clsize = (1 << vstruct_def_clshift);
513
514 /*
515 * Let the user know the new (and definitive) cluster size.
516 */
517 if (verbose)
518 printf("%scluster size = %d page%s\n",
519 my_name, default_pager_clsize,
520 (default_pager_clsize == 1) ? "" : "s");
521
522 /*
523 * Let the kernel know too, in case it hasn't used the
524 * default value provided in main() yet.
525 */
526 dmm = default_pager_object;
527 clsize = default_pager_clsize * vm_page_size; /* in bytes */
528 kr = host_default_memory_manager(host_priv_self(),
529 &dmm,
530 clsize);
531 memory_object_default_deallocate(dmm);
532
533 if (kr != KERN_SUCCESS) {
534 panic("bs_get_global_cl_size:host_default_memory_manager");
535 }
536 if (dmm != default_pager_object) {
537 panic("bs_get_global_cl_size:there is another default pager");
538 }
539 }
540 ASSERT(default_pager_clsize > 0 &&
541 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
542
543 return default_pager_clsize;
544 }
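/*
 * Worked example of the shift computation in bs_get_global_clsize: a
 * requested clsize of 4 pages ends the "(1 << i) < clsize" loop at
 * i == 2, so default_pager_clsize becomes 1 << 2 == 4 pages; a
 * non-power-of-two request such as 6 rounds up to i == 3 (8 pages),
 * capped at MAX_CLUSTER_SHIFT.
 */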
545
546 kern_return_t
547 default_pager_backing_store_create(
548 memory_object_default_t pager,
549 int priority,
550 int clsize, /* in bytes */
551 MACH_PORT_FACE *backing_store)
552 {
553 backing_store_t bs;
554 MACH_PORT_FACE port;
555 // kern_return_t kr;
556 struct vstruct_alias *alias_struct;
557
558 if (pager != default_pager_object)
559 return KERN_INVALID_ARGUMENT;
560
561 bs = backing_store_alloc();
562 port = ipc_port_alloc_kernel();
563 ipc_port_make_send(port);
564 assert (port != IP_NULL);
565
566 DP_DEBUG(DEBUG_BS_EXTERNAL,
567 ("priority=%d clsize=%d bs_port=0x%x\n",
568 priority, clsize, (int) backing_store));
569
570 alias_struct = (struct vstruct_alias *)
571 kalloc(sizeof (struct vstruct_alias));
572 if(alias_struct != NULL) {
573 alias_struct->vs = (struct vstruct *)bs;
574 alias_struct->name = ISVS;
575 port->alias = (int) alias_struct;
576 }
577 else {
578 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
579 kfree(bs, sizeof (struct backing_store));
580 return KERN_RESOURCE_SHORTAGE;
581 }
582
583 bs->bs_port = port;
584 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
585 priority = BS_MAXPRI;
586 else if (priority == BS_NOPRI)
587 priority = BS_MAXPRI;
588 else
589 priority = BS_MINPRI;
590 bs->bs_priority = priority;
591
592 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
593
594 BSL_LOCK();
595 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
596 bs_links);
597 BSL_UNLOCK();
598
599 backing_store_add(bs);
600
601 *backing_store = port;
602 return KERN_SUCCESS;
603 }
604
605 kern_return_t
606 default_pager_backing_store_info(
607 MACH_PORT_FACE backing_store,
608 backing_store_flavor_t flavour,
609 backing_store_info_t info,
610 mach_msg_type_number_t *size)
611 {
612 backing_store_t bs;
613 backing_store_basic_info_t basic;
614 int i;
615 paging_segment_t ps;
616
617 if (flavour != BACKING_STORE_BASIC_INFO ||
618 *size < BACKING_STORE_BASIC_INFO_COUNT)
619 return KERN_INVALID_ARGUMENT;
620
621 basic = (backing_store_basic_info_t)info;
622 *size = BACKING_STORE_BASIC_INFO_COUNT;
623
624 VSTATS_LOCK(&global_stats.gs_lock);
625 basic->pageout_calls = global_stats.gs_pageout_calls;
626 basic->pagein_calls = global_stats.gs_pagein_calls;
627 basic->pages_in = global_stats.gs_pages_in;
628 basic->pages_out = global_stats.gs_pages_out;
629 basic->pages_unavail = global_stats.gs_pages_unavail;
630 basic->pages_init = global_stats.gs_pages_init;
631 basic->pages_init_writes= global_stats.gs_pages_init_writes;
632 VSTATS_UNLOCK(&global_stats.gs_lock);
633
634 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
635 return KERN_INVALID_ARGUMENT;
636
637 basic->bs_pages_total = bs->bs_pages_total;
638 PSL_LOCK();
639 bs->bs_pages_free = 0;
640 for (i = 0; i <= paging_segment_max; i++) {
641 ps = paging_segments[i];
642 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
643 PS_LOCK(ps);
644 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
645 PS_UNLOCK(ps);
646 }
647 }
648 PSL_UNLOCK();
649 basic->bs_pages_free = bs->bs_pages_free;
650 basic->bs_pages_in = bs->bs_pages_in;
651 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
652 basic->bs_pages_out = bs->bs_pages_out;
653 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
654
655 basic->bs_priority = bs->bs_priority;
656 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
657
658 BS_UNLOCK(bs);
659
660 return KERN_SUCCESS;
661 }
662
663 int ps_delete(paging_segment_t); /* forward */
664
665 int
666 ps_delete(
667 paging_segment_t ps)
668 {
669 vstruct_t vs;
670 kern_return_t error = KERN_SUCCESS;
671 int vs_count;
672
673 VSL_LOCK(); /* get the lock on the list of vs's */
674
675 /* The lock relationship and sequence are fairly complicated. */
676 /* this code looks at a live list, locking and unlocking the list */
677 /* as it traverses it. It depends on the locking behavior of */
678 /* default_pager_no_senders. no_senders always locks the vstruct */
679 /* targeted for removal before locking the vstruct list. However */
680 /* it will remove that member of the list without locking its */
681 /* neighbors. We can be sure when we hold a lock on a vstruct */
682 /* it cannot be removed from the list but we must hold the list */
683 /* lock to be sure that its pointers to its neighbors are valid. */
684 /* Also, we can hold off destruction of a vstruct when the list */
685 /* lock and the vs locks are not being held by bumping the */
686 /* vs_async_pending count. */
687
688
689 while(backing_store_release_trigger_disable != 0) {
690 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
691 }
692
693 /* we will choose instead to hold a send right */
694 vs_count = vstruct_list.vsl_count;
695 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
696 if(vs == (vstruct_t)&vstruct_list) {
697 VSL_UNLOCK();
698 return KERN_SUCCESS;
699 }
700 VS_LOCK(vs);
701 vs_async_wait(vs); /* wait for any pending async writes */
702 if ((vs_count != 0) && (vs != NULL))
703 vs->vs_async_pending += 1; /* hold parties calling */
704 /* vs_async_wait */
705 VS_UNLOCK(vs);
706 VSL_UNLOCK();
707 while((vs_count != 0) && (vs != NULL)) {
708 /* We take the count of AMO's before beginning the */
709 /* transfer of the target segment. */
710 /* We are guaranteed that the target segment cannot get */
711 /* more users. We also know that queue entries are */
712 /* made at the back of the list. If some of the entries */
713 /* we would check disappear while we are traversing the */
714 /* list then we will either check new entries which */
715 /* do not have any backing store in the target segment */
716 /* or re-check old entries. This might not be optimal */
717 /* but it will always be correct. The alternative is to */
718 /* take a snapshot of the list. */
719 vstruct_t next_vs;
720
721 if(dp_pages_free < cluster_transfer_minimum)
722 error = KERN_FAILURE;
723 else {
724 vm_object_t transfer_object;
725 int count;
726 upl_t upl;
727
728 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
729 count = 0;
730 error = vm_object_upl_request(transfer_object,
731 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
732 &upl, NULL, &count,
733 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
734 | UPL_SET_INTERNAL);
735 if(error == KERN_SUCCESS) {
736 error = ps_vstruct_transfer_from_segment(
737 vs, ps, upl);
738 upl_commit(upl, NULL, 0);
739 upl_deallocate(upl);
740 } else {
741 error = KERN_FAILURE;
742 }
743 vm_object_deallocate(transfer_object);
744 }
745 if(error) {
746 VS_LOCK(vs);
747 vs->vs_async_pending -= 1; /* release vs_async_wait */
748 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
749 vs->vs_waiting_async = FALSE;
750 VS_UNLOCK(vs);
751 thread_wakeup(&vs->vs_async_pending);
752 } else {
753 VS_UNLOCK(vs);
754 }
755 return KERN_FAILURE;
756 }
757
758 VSL_LOCK();
759
760 while(backing_store_release_trigger_disable != 0) {
761 VSL_SLEEP(&backing_store_release_trigger_disable,
762 THREAD_UNINT);
763 }
764
765 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
766 if((next_vs != (vstruct_t)&vstruct_list) &&
767 (vs != next_vs) && (vs_count != 1)) {
768 VS_LOCK(next_vs);
769 vs_async_wait(next_vs); /* wait for any */
770 /* pending async writes */
771 next_vs->vs_async_pending += 1; /* hold parties */
772 /* calling vs_async_wait */
773 VS_UNLOCK(next_vs);
774 }
775 VSL_UNLOCK();
776 VS_LOCK(vs);
777 vs->vs_async_pending -= 1;
778 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
779 vs->vs_waiting_async = FALSE;
780 VS_UNLOCK(vs);
781 thread_wakeup(&vs->vs_async_pending);
782 } else {
783 VS_UNLOCK(vs);
784 }
785 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
786 vs = NULL;
787 else
788 vs = next_vs;
789 vs_count--;
790 }
791 return KERN_SUCCESS;
792 }
793
794
795 kern_return_t
796 default_pager_backing_store_delete(
797 MACH_PORT_FACE backing_store)
798 {
799 backing_store_t bs;
800 int i;
801 paging_segment_t ps;
802 int error;
803 int interim_pages_removed = 0;
804 // kern_return_t kr;
805
806 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
807 return KERN_INVALID_ARGUMENT;
808
809 #if 0
810 /* not implemented */
811 BS_UNLOCK(bs);
812 return KERN_FAILURE;
813 #endif
814
815 restart:
816 PSL_LOCK();
817 error = KERN_SUCCESS;
818 for (i = 0; i <= paging_segment_max; i++) {
819 ps = paging_segments[i];
820 if (ps != PAGING_SEGMENT_NULL &&
821 ps->ps_bs == bs &&
822 ! ps->ps_going_away) {
823 PS_LOCK(ps);
824 /* disable access to this segment */
825 ps->ps_going_away = TRUE;
826 PS_UNLOCK(ps);
827 /*
828 * The "ps" segment is "off-line" now,
829 * we can try and delete it...
830 */
831 if(dp_pages_free < (cluster_transfer_minimum
832 + ps->ps_pgcount)) {
833 error = KERN_FAILURE;
834 PSL_UNLOCK();
835 }
836 else {
837 /* remove all pages associated with the */
838 /* segment from the list of free pages */
839 /* when transfer is through, all target */
840 /* segment pages will appear to be free */
841
842 dp_pages_free -= ps->ps_pgcount;
843 interim_pages_removed += ps->ps_pgcount;
844 PSL_UNLOCK();
845 error = ps_delete(ps);
846 }
847 if (error != KERN_SUCCESS) {
848 /*
849 * We couldn't delete the segment,
850 * probably because there's not enough
851 * virtual memory left.
852 * Re-enable all the segments.
853 */
854 PSL_LOCK();
855 break;
856 }
857 goto restart;
858 }
859 }
860
861 if (error != KERN_SUCCESS) {
862 for (i = 0; i <= paging_segment_max; i++) {
863 ps = paging_segments[i];
864 if (ps != PAGING_SEGMENT_NULL &&
865 ps->ps_bs == bs &&
866 ps->ps_going_away) {
867 PS_LOCK(ps);
868 /* re-enable access to this segment */
869 ps->ps_going_away = FALSE;
870 PS_UNLOCK(ps);
871 }
872 }
873 dp_pages_free += interim_pages_removed;
874 PSL_UNLOCK();
875 BS_UNLOCK(bs);
876 return error;
877 }
878
879 for (i = 0; i <= paging_segment_max; i++) {
880 ps = paging_segments[i];
881 if (ps != PAGING_SEGMENT_NULL &&
882 ps->ps_bs == bs) {
883 if(ps->ps_going_away) {
884 paging_segments[i] = PAGING_SEGMENT_NULL;
885 paging_segment_count--;
886 PS_LOCK(ps);
887 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
888 kfree(ps, sizeof *ps);
889 }
890 }
891 }
892
893 /* Scan the entire ps array separately to make certain we find the */
894 /* proper paging_segment_max */
895 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
896 if(paging_segments[i] != PAGING_SEGMENT_NULL)
897 paging_segment_max = i;
898 }
899
900 PSL_UNLOCK();
901
902 /*
903 * All the segments have been deleted.
904 * We can remove the backing store.
905 */
906
907 /*
908 * Disable lookups of this backing store.
909 */
910 if((void *)bs->bs_port->alias != NULL)
911 kfree((void *) bs->bs_port->alias,
912 sizeof (struct vstruct_alias));
913 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
914 bs->bs_port = MACH_PORT_NULL;
915 BS_UNLOCK(bs);
916
917 /*
918 * Remove backing store from backing_store list.
919 */
920 BSL_LOCK();
921 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
922 bs_links);
923 BSL_UNLOCK();
924
925 /*
926 * Free the backing store structure.
927 */
928 kfree(bs, sizeof *bs);
929
930 return KERN_SUCCESS;
931 }
932
933 int ps_enter(paging_segment_t); /* forward */
934
935 int
936 ps_enter(
937 paging_segment_t ps)
938 {
939 int i;
940
941 PSL_LOCK();
942
943 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
944 if (paging_segments[i] == PAGING_SEGMENT_NULL)
945 break;
946 }
947
948 if (i < MAX_NUM_PAGING_SEGMENTS) {
949 paging_segments[i] = ps;
950 if (i > paging_segment_max)
951 paging_segment_max = i;
952 paging_segment_count++;
953 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
954 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
955 ps_select_array[ps->ps_bs->bs_priority] = 0;
956 i = 0;
957 } else {
958 PSL_UNLOCK();
959 return KERN_RESOURCE_SHORTAGE;
960 }
961
962 PSL_UNLOCK();
963 return i;
964 }
965
966 #ifdef DEVICE_PAGING
967 kern_return_t
968 default_pager_add_segment(
969 MACH_PORT_FACE backing_store,
970 MACH_PORT_FACE device,
971 recnum_t offset,
972 recnum_t count,
973 int record_size)
974 {
975 backing_store_t bs;
976 paging_segment_t ps;
977 int i;
978 int error;
979
980 if ((bs = backing_store_lookup(backing_store))
981 == BACKING_STORE_NULL)
982 return KERN_INVALID_ARGUMENT;
983
984 PSL_LOCK();
985 for (i = 0; i <= paging_segment_max; i++) {
986 ps = paging_segments[i];
987 if (ps == PAGING_SEGMENT_NULL)
988 continue;
989
990 /*
991 * Check for overlap on same device.
992 */
993 if (!(ps->ps_device != device
994 || offset >= ps->ps_offset + ps->ps_recnum
995 || offset + count <= ps->ps_offset)) {
996 PSL_UNLOCK();
997 BS_UNLOCK(bs);
998 return KERN_INVALID_ARGUMENT;
999 }
1000 }
1001 PSL_UNLOCK();
1002
1003 /*
1004 * Set up the paging segment
1005 */
1006 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1007 if (ps == PAGING_SEGMENT_NULL) {
1008 BS_UNLOCK(bs);
1009 return KERN_RESOURCE_SHORTAGE;
1010 }
1011
1012 ps->ps_segtype = PS_PARTITION;
1013 ps->ps_device = device;
1014 ps->ps_offset = offset;
1015 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1016 ps->ps_recnum = count;
1017 ps->ps_pgnum = count >> ps->ps_record_shift;
1018
1019 ps->ps_pgcount = ps->ps_pgnum;
1020 ps->ps_clshift = local_log2(bs->bs_clsize);
1021 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1022 ps->ps_hint = 0;
1023
1024 PS_LOCK_INIT(ps);
1025 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1026 if (!ps->ps_bmap) {
1027 kfree(ps, sizeof *ps);
1028 BS_UNLOCK(bs);
1029 return KERN_RESOURCE_SHORTAGE;
1030 }
1031 for (i = 0; i < ps->ps_ncls; i++) {
1032 clrbit(ps->ps_bmap, i);
1033 }
1034
1035 ps->ps_going_away = FALSE;
1036 ps->ps_bs = bs;
1037
1038 if ((error = ps_enter(ps)) != 0) {
1039 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1040 kfree(ps, sizeof *ps);
1041 BS_UNLOCK(bs);
1042 return KERN_RESOURCE_SHORTAGE;
1043 }
1044
1045 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1046 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1047 BS_UNLOCK(bs);
1048
1049 PSL_LOCK();
1050 dp_pages_free += ps->ps_pgcount;
1051 PSL_UNLOCK();
1052
1053 bs_more_space(ps->ps_clcount);
1054
1055 DP_DEBUG(DEBUG_BS_INTERNAL,
1056 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1057 device, offset, count, record_size,
1058 ps->ps_record_shift, ps->ps_pgnum));
1059
1060 return KERN_SUCCESS;
1061 }
1062
1063 boolean_t
1064 bs_add_device(
1065 char *dev_name,
1066 MACH_PORT_FACE master)
1067 {
1068 security_token_t null_security_token = {
1069 { 0, 0 }
1070 };
1071 MACH_PORT_FACE device;
1072 int info[DEV_GET_SIZE_COUNT];
1073 mach_msg_type_number_t info_count;
1074 MACH_PORT_FACE bs = MACH_PORT_NULL;
1075 unsigned int rec_size;
1076 recnum_t count;
1077 int clsize;
1078 MACH_PORT_FACE reply_port;
1079
1080 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1081 null_security_token, dev_name, &device))
1082 return FALSE;
1083
1084 info_count = DEV_GET_SIZE_COUNT;
1085 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1086 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1087 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1088 clsize = bs_get_global_clsize(0);
1089 if (!default_pager_backing_store_create(
1090 default_pager_object,
1091 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1092 (clsize * vm_page_size),
1093 &bs)) {
1094 if (!default_pager_add_segment(bs, device,
1095 0, count, rec_size)) {
1096 return TRUE;
1097 }
1098 ipc_port_release_receive(bs);
1099 }
1100 }
1101
1102 ipc_port_release_send(device);
1103 return FALSE;
1104 }
1105 #endif /* DEVICE_PAGING */
1106
1107 #if VS_ASYNC_REUSE
1108
1109 struct vs_async *
1110 vs_alloc_async(void)
1111 {
1112 struct vs_async *vsa;
1113 MACH_PORT_FACE reply_port;
1114 // kern_return_t kr;
1115
1116 VS_ASYNC_LOCK();
1117 if (vs_async_free_list == NULL) {
1118 VS_ASYNC_UNLOCK();
1119 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1120 if (vsa != NULL) {
1121 /*
1122 * Try allocating a reply port named after the
1123 * address of the vs_async structure.
1124 */
1125 struct vstruct_alias *alias_struct;
1126
1127 reply_port = ipc_port_alloc_kernel();
1128 alias_struct = (struct vstruct_alias *)
1129 kalloc(sizeof (struct vstruct_alias));
1130 if(alias_struct != NULL) {
1131 alias_struct->vs = (struct vstruct *)vsa;
1132 alias_struct->name = ISVS;
1133 reply_port->alias = (int) alias_struct;
1134 vsa->reply_port = reply_port;
1135 vs_alloc_async_count++;
1136 }
1137 else {
1138 vs_alloc_async_failed++;
1139 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1140 (reply_port));
1141 kfree(vsa, sizeof (struct vs_async));
1142 vsa = NULL;
1143 }
1144 }
1145 } else {
1146 vsa = vs_async_free_list;
1147 vs_async_free_list = vs_async_free_list->vsa_next;
1148 VS_ASYNC_UNLOCK();
1149 }
1150
1151 return vsa;
1152 }
1153
1154 void
1155 vs_free_async(
1156 struct vs_async *vsa)
1157 {
1158 VS_ASYNC_LOCK();
1159 vsa->vsa_next = vs_async_free_list;
1160 vs_async_free_list = vsa;
1161 VS_ASYNC_UNLOCK();
1162 }
1163
1164 #else /* VS_ASYNC_REUSE */
1165
1166 struct vs_async *
1167 vs_alloc_async(void)
1168 {
1169 struct vs_async *vsa;
1170 MACH_PORT_FACE reply_port;
1171 kern_return_t kr;
struct vstruct_alias *alias_struct;
1172
1173 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1174 if (vsa != NULL) {
1175 /*
1176 * Try allocating a reply port named after the
1177 * address of the vs_async structure.
1178 */
1179 reply_port = ipc_port_alloc_kernel();
1180 alias_struct = (struct vstruct_alias *)
1181 kalloc(sizeof (struct vstruct_alias));
1182 if(alias_struct != NULL) {
1183 alias_struct->vs = (struct vstruct *)vsa;
1184 alias_struct->name = ISVS;
1185 reply_port->alias = (int) alias_struct;
1186 vsa->reply_port = reply_port;
1187 vs_alloc_async_count++;
1188 }
1189 else {
1190 vs_alloc_async_failed++;
1191 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1192 (reply_port));
1193 kfree(vsa, sizeof (struct vs_async));
1194 vsa = NULL;
1195 }
1196 }
1197
1198 return vsa;
1199 }
1200
1201 void
1202 vs_free_async(
1203 struct vs_async *vsa)
1204 {
1205 MACH_PORT_FACE reply_port;
1206 kern_return_t kr;
1207
1208 reply_port = vsa->reply_port;
1209 kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1210 kfree(vsa, sizeof (struct vs_async));
1211 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1212 #if 0
1213 VS_ASYNC_LOCK();
1214 vs_alloc_async_count--;
1215 VS_ASYNC_UNLOCK();
1216 #endif
1217 }
1218
1219 #endif /* VS_ASYNC_REUSE */
1220
1221 zone_t vstruct_zone;
1222
1223 vstruct_t
1224 ps_vstruct_create(
1225 vm_size_t size)
1226 {
1227 vstruct_t vs;
1228 unsigned int i;
1229
1230 vs = (vstruct_t) zalloc(vstruct_zone);
1231 if (vs == VSTRUCT_NULL) {
1232 return VSTRUCT_NULL;
1233 }
1234
1235 VS_LOCK_INIT(vs);
1236
1237 /*
1238 * The following fields will be provided later.
1239 */
1240 vs->vs_mem_obj = NULL;
1241 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1242 vs->vs_references = 1;
1243 vs->vs_seqno = 0;
1244
1245 #ifdef MACH_KERNEL
1246 vs->vs_waiting_seqno = FALSE;
1247 vs->vs_waiting_read = FALSE;
1248 vs->vs_waiting_write = FALSE;
1249 vs->vs_waiting_async = FALSE;
1250 #else
1251 mutex_init(&vs->vs_waiting_seqno, 0);
1252 mutex_init(&vs->vs_waiting_read, 0);
1253 mutex_init(&vs->vs_waiting_write, 0);
1254 mutex_init(&vs->vs_waiting_refs, 0);
1255 mutex_init(&vs->vs_waiting_async, 0);
1256 #endif
1257
1258 vs->vs_readers = 0;
1259 vs->vs_writers = 0;
1260
1261 vs->vs_errors = 0;
1262
1263 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1264 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1265 vs->vs_async_pending = 0;
1266
1267 /*
1268 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1269 * depending on the size of the memory object.
1270 */
1271 if (INDIRECT_CLMAP(vs->vs_size)) {
1272 vs->vs_imap = (struct vs_map **)
1273 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1274 vs->vs_indirect = TRUE;
1275 } else {
1276 vs->vs_dmap = (struct vs_map *)
1277 kalloc(CLMAP_SIZE(vs->vs_size));
1278 vs->vs_indirect = FALSE;
1279 }
1280 vs->vs_xfer_pending = FALSE;
1281 DP_DEBUG(DEBUG_VS_INTERNAL,
1282 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1283
1284 /*
1285 * Check to see that we got the space.
1286 */
1287 if (!vs->vs_dmap) {
1288 kfree(vs, sizeof *vs);
1289 return VSTRUCT_NULL;
1290 }
1291
1292 /*
1293 * Zero the indirect pointers, or clear the direct pointers.
1294 */
1295 if (vs->vs_indirect)
1296 memset(vs->vs_imap, 0,
1297 INDIRECT_CLMAP_SIZE(vs->vs_size));
1298 else
1299 for (i = 0; i < vs->vs_size; i++)
1300 VSM_CLR(vs->vs_dmap[i]);
1301
1302 VS_MAP_LOCK_INIT(vs);
1303
1304 bs_commit(vs->vs_size);
1305
1306 return vs;
1307 }
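/*
 * Worked example of the vs_size computation above (assuming 4 KB pages
 * and a cluster shift of 2): a 100 KB object rounds to 25 pages, so
 * vs_size == ((25 - 1) >> 2) + 1 == 7 clusters of 4 pages each, enough
 * to cover the object with the last cluster only partly used.
 */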
1308
1309 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1310
1311 paging_segment_t
1312 ps_select_segment(
1313 unsigned int shift,
1314 int *psindex)
1315 {
1316 paging_segment_t ps;
1317 int i;
1318 int j;
1319
1320 /*
1321 * Optimize case where there's only one segment.
1322 * paging_segment_max will index the one and only segment.
1323 */
1324
1325 PSL_LOCK();
1326 if (paging_segment_count == 1) {
1327 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1328 ipc_port_t trigger = IP_NULL;
1329
1330 ps = paging_segments[paging_segment_max];
1331 *psindex = paging_segment_max;
1332 PS_LOCK(ps);
1333 if (ps->ps_going_away) {
1334 /* this segment is being turned off */
1335 lps = PAGING_SEGMENT_NULL;
1336 } else {
1337 ASSERT(ps->ps_clshift >= shift);
1338 if (ps->ps_clcount) {
1339 ps->ps_clcount--;
1340 dp_pages_free -= 1 << ps->ps_clshift;
1341 if(min_pages_trigger_port &&
1342 (dp_pages_free < minimum_pages_remaining)) {
1343 trigger = min_pages_trigger_port;
1344 min_pages_trigger_port = NULL;
1345 bs_low = TRUE;
1346 }
1347 lps = ps;
1348 } else
1349 lps = PAGING_SEGMENT_NULL;
1350 }
1351 PS_UNLOCK(ps);
1352 PSL_UNLOCK();
1353
1354 if (trigger != IP_NULL) {
1355 default_pager_space_alert(trigger, HI_WAT_ALERT);
1356 ipc_port_release_send(trigger);
1357 }
1358 return lps;
1359 }
1360
1361 if (paging_segment_count == 0) {
1362 PSL_UNLOCK();
1363 return PAGING_SEGMENT_NULL;
1364 }
1365
1366 for (i = BS_MAXPRI;
1367 i >= BS_MINPRI; i--) {
1368 int start_index;
1369
1370 if ((ps_select_array[i] == BS_NOPRI) ||
1371 (ps_select_array[i] == BS_FULLPRI))
1372 continue;
1373 start_index = ps_select_array[i];
1374
1375 if(!(paging_segments[start_index])) {
1376 j = start_index+1;
1377 physical_transfer_cluster_count = 0;
1378 }
1379 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1380 (((paging_segments[start_index])->ps_clshift)
1381 + vm_page_shift))) {
1382 physical_transfer_cluster_count = 0;
1383 j = start_index + 1;
1384 } else {
1385 physical_transfer_cluster_count+=1;
1386 j = start_index;
1387 if(start_index == 0)
1388 start_index = paging_segment_max;
1389 else
1390 start_index = start_index - 1;
1391 }
1392
1393 while (1) {
1394 if (j > paging_segment_max)
1395 j = 0;
1396 if ((ps = paging_segments[j]) &&
1397 (ps->ps_bs->bs_priority == i)) {
1398 /*
1399 * Force the ps cluster size to be
1400 * >= that of the vstruct.
1401 */
1402 PS_LOCK(ps);
1403 if (ps->ps_going_away) {
1404 /* this segment is being turned off */
1405 } else if ((ps->ps_clcount) &&
1406 (ps->ps_clshift >= shift)) {
1407 ipc_port_t trigger = IP_NULL;
1408
1409 ps->ps_clcount--;
1410 dp_pages_free -= 1 << ps->ps_clshift;
1411 if(min_pages_trigger_port &&
1412 (dp_pages_free <
1413 minimum_pages_remaining)) {
1414 trigger = min_pages_trigger_port;
1415 min_pages_trigger_port = NULL;
1416 }
1417 PS_UNLOCK(ps);
1418 /*
1419 * found one, quit looking.
1420 */
1421 ps_select_array[i] = j;
1422 PSL_UNLOCK();
1423
1424 if (trigger != IP_NULL) {
1425 default_pager_space_alert(
1426 trigger,
1427 HI_WAT_ALERT);
1428 ipc_port_release_send(trigger);
1429 }
1430 *psindex = j;
1431 return ps;
1432 }
1433 PS_UNLOCK(ps);
1434 }
1435 if (j == start_index) {
1436 /*
1437 * none at this priority -- mark it full
1438 */
1439 ps_select_array[i] = BS_FULLPRI;
1440 break;
1441 }
1442 j++;
1443 }
1444 }
1445 PSL_UNLOCK();
1446 return PAGING_SEGMENT_NULL;
1447 }
1448
1449 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1450
1451 vm_offset_t
1452 ps_allocate_cluster(
1453 vstruct_t vs,
1454 int *psindex,
1455 paging_segment_t use_ps)
1456 {
1457 unsigned int byte_num;
1458 int bit_num = 0;
1459 paging_segment_t ps;
1460 vm_offset_t cluster;
1461 ipc_port_t trigger = IP_NULL;
1462
1463 /*
1464 * Find best paging segment.
1465 * ps_select_segment will decrement cluster count on ps.
1466 * Must pass cluster shift to find the most appropriate segment.
1467 */
1468 /* NOTE: The addition of paging segment delete capability threatened
1469 * to seriously complicate the treatment of paging segments in this
1470 * module and the ones that call it (notably ps_clmap), because of the
1471 * difficulty in assuring that the paging segment would continue to
1472 * exist between being unlocked and locked. This was
1473 * avoided because all calls to this module are based either in
1474 * dp_memory_object calls, which rely on the vs lock, or in
1475 * the transfer function that is part of the segment delete path.
1476 * The transfer function which is part of paging segment delete is
1477 * protected from multiple callers by the backing store lock.
1478 * The paging segment delete function treats mappings to a paging
1479 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1480 * while data is transferred to the remaining segments. This is in
1481 * line with the view that incomplete or in-transition mappings between
1482 * data, a vstruct, and backing store are protected by the vs lock.
1483 * This and the ordering of the paging segment "going_away" bit setting
1484 * protects us.
1485 */
1486 if (use_ps != PAGING_SEGMENT_NULL) {
1487 ps = use_ps;
1488 PSL_LOCK();
1489 PS_LOCK(ps);
1490
1491 ASSERT(ps->ps_clcount != 0);
1492
1493 ps->ps_clcount--;
1494 dp_pages_free -= 1 << ps->ps_clshift;
1495 if(min_pages_trigger_port &&
1496 (dp_pages_free < minimum_pages_remaining)) {
1497 trigger = min_pages_trigger_port;
1498 min_pages_trigger_port = NULL;
1499 }
1500 PSL_UNLOCK();
1501 PS_UNLOCK(ps);
1502 if (trigger != IP_NULL) {
1503 default_pager_space_alert(trigger, HI_WAT_ALERT);
1504 ipc_port_release_send(trigger);
1505 }
1506
1507 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1508 PAGING_SEGMENT_NULL) {
1509 static uint32_t lastnotify = 0;
1510 uint32_t now, nanoseconds_dummy;
1511
1512 /*
1513 * Emit a notification of the low-paging resource condition
1514 * but don't issue it more than once every five seconds. This
1515 * prevents us from overflowing logs with thousands of
1516 * repetitions of the message.
1517 */
1518 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1519 if (now > lastnotify + 5) {
1520 dprintf(("no space in available paging segments\n"));
1521 lastnotify = now;
1522 }
1523
1524 /* the count may have drifted; reset it to zero */
1525 PSL_LOCK();
1526 dp_pages_free = 0;
1527 if(min_pages_trigger_port) {
1528 trigger = min_pages_trigger_port;
1529 min_pages_trigger_port = NULL;
1530 bs_low = TRUE;
1531 }
1532 PSL_UNLOCK();
1533 if (trigger != IP_NULL) {
1534 default_pager_space_alert(trigger, HI_WAT_ALERT);
1535 ipc_port_release_send(trigger);
1536 }
1537 return (vm_offset_t) -1;
1538 }
1539
1540 /*
1541 * Look for an available cluster. At the end of the loop,
1542 * byte_num is the byte offset and bit_num is the bit offset of the
1543 * first zero bit in the paging segment bitmap.
1544 */
1545 PS_LOCK(ps);
1546 byte_num = ps->ps_hint;
1547 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1548 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1549 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1550 if (isclr((ps->ps_bmap + byte_num), bit_num))
1551 break;
1552 }
1553 ASSERT(bit_num != NBBY);
1554 break;
1555 }
1556 }
1557 ps->ps_hint = byte_num;
1558 cluster = (byte_num*NBBY) + bit_num;
1559
1560 /* Space was reserved, so this must be true */
1561 ASSERT(cluster < ps->ps_ncls);
1562
1563 setbit(ps->ps_bmap, cluster);
1564 PS_UNLOCK(ps);
1565
1566 return cluster;
1567 }
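/*
 * Worked example of the bitmap scan above (assuming NBBY == 8): if
 * ps_hint is 3 and the first byte containing a clear bit is
 * ps_bmap[5] == 0xef (bit 4 clear), the scan ends with byte_num == 5
 * and bit_num == 4, so the cluster allocated is 5 * 8 + 4 == 44 and
 * ps_hint is updated to 5.
 */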
1568
1569 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1570
1571 void
1572 ps_deallocate_cluster(
1573 paging_segment_t ps,
1574 vm_offset_t cluster)
1575 {
1576
1577 if (cluster >= (vm_offset_t) ps->ps_ncls)
1578 panic("ps_deallocate_cluster: Invalid cluster number");
1579
1580 /*
1581 * Lock the paging segment, clear the cluster's bitmap and increment the
1582 * number of free clusters.
1583 */
1584 PSL_LOCK();
1585 PS_LOCK(ps);
1586 clrbit(ps->ps_bmap, cluster);
1587 ++ps->ps_clcount;
1588 dp_pages_free += 1 << ps->ps_clshift;
1589 PSL_UNLOCK();
1590
1591 /*
1592 * Move the hint down to the freed cluster if it is
1593 * less than the current hint.
1594 */
1595 if ((cluster/NBBY) < ps->ps_hint) {
1596 ps->ps_hint = (cluster/NBBY);
1597 }
1598
1599 PS_UNLOCK(ps);
1600
1601 /*
1602 * If we're freeing space on a full priority, reset the array.
1603 */
1604 PSL_LOCK();
1605 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1606 ps_select_array[ps->ps_bs->bs_priority] = 0;
1607 PSL_UNLOCK();
1608
1609 return;
1610 }
1611
1612 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1613
1614 void
1615 ps_dealloc_vsmap(
1616 struct vs_map *vsmap,
1617 vm_size_t size)
1618 {
1619 unsigned int i;
1620 for (i = 0; i < size; i++)
1621 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1622 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1623 VSM_CLOFF(vsmap[i]));
1624 }
1625
1626 void
1627 ps_vstruct_dealloc(
1628 vstruct_t vs)
1629 {
1630 unsigned int i;
1631 // spl_t s;
1632
1633 VS_MAP_LOCK(vs);
1634
1635 /*
1636 * If this is an indirect structure, then we walk through the valid
1637 * (non-zero) indirect pointers and deallocate the clusters
1638 * associated with each used map entry (via ps_dealloc_vsmap).
1639 * When all of the clusters in an indirect block have been
1640 * freed, we deallocate the block. When all of the indirect
1641 * blocks have been deallocated we deallocate the memory
1642 * holding the indirect pointers.
1643 */
1644 if (vs->vs_indirect) {
1645 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1646 if (vs->vs_imap[i] != NULL) {
1647 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1648 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1649 }
1650 }
1651 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1652 } else {
1653 /*
1654 * Direct map. Free used clusters, then memory.
1655 */
1656 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1657 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1658 }
1659 VS_MAP_UNLOCK(vs);
1660
1661 bs_commit(- vs->vs_size);
1662
1663 zfree(vstruct_zone, vs);
1664 }
1665
1666 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1667
1668 int ps_map_extend(
1669 vstruct_t vs,
1670 unsigned int new_size)
1671 {
1672 struct vs_map **new_imap;
1673 struct vs_map *new_dmap = NULL;
1674 int newdsize;
1675 int i;
1676 void *old_map = NULL;
1677 int old_map_size = 0;
1678
1679 if (vs->vs_size >= new_size) {
1680 /*
1681 * Someone has already done the work.
1682 */
1683 return 0;
1684 }
1685
1686 /*
1687 * If the new size extends into the indirect range, then we have one
1688 * of two cases: we are going from indirect to indirect, or we are
1689 * going from direct to indirect. If we are going from indirect to
1690 * indirect, then it is possible that the new size will fit in the old
1691 * indirect map. If this is the case, then just reset the size of the
1692 * vstruct map and we are done. If the new size will not
1693 * fit into the old indirect map, then we have to allocate a new
1694 * indirect map and copy the old map pointers into this new map.
1695 *
1696 * If we are going from direct to indirect, then we have to allocate a
1697 * new indirect map and copy the old direct pages into the first
1698 * indirect page of the new map.
1699 * NOTE: allocating memory here is dangerous, as we're in the
1700 * pageout path.
1701 */
1702 if (INDIRECT_CLMAP(new_size)) {
1703 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1704
1705 /*
1706 * Get a new indirect map and zero it.
1707 */
1708 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1709 if (vs->vs_indirect &&
1710 (new_map_size == old_map_size)) {
1711 bs_commit(new_size - vs->vs_size);
1712 vs->vs_size = new_size;
1713 return 0;
1714 }
1715
1716 new_imap = (struct vs_map **)kalloc(new_map_size);
1717 if (new_imap == NULL) {
1718 return -1;
1719 }
1720 memset(new_imap, 0, new_map_size);
1721
1722 if (vs->vs_indirect) {
1723 /* Copy old entries into new map */
1724 memcpy(new_imap, vs->vs_imap, old_map_size);
1725 /* Arrange to free the old map */
1726 old_map = (void *) vs->vs_imap;
1727 newdsize = 0;
1728 } else { /* Old map was a direct map */
1729 /* Allocate an indirect page */
1730 if ((new_imap[0] = (struct vs_map *)
1731 kalloc(CLMAP_THRESHOLD)) == NULL) {
1732 kfree(new_imap, new_map_size);
1733 return -1;
1734 }
1735 new_dmap = new_imap[0];
1736 newdsize = CLMAP_ENTRIES;
1737 }
1738 } else {
1739 new_imap = NULL;
1740 newdsize = new_size;
1741 /*
1742 * If the new map is a direct map, then the old map must
1743 * also have been a direct map. All we have to do is
1744 * to allocate a new direct map, copy the old entries
1745 * into it and free the old map.
1746 */
1747 if ((new_dmap = (struct vs_map *)
1748 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1749 return -1;
1750 }
1751 }
1752 if (newdsize) {
1753
1754 /* Free the old map */
1755 old_map = (void *) vs->vs_dmap;
1756 old_map_size = CLMAP_SIZE(vs->vs_size);
1757
1758 /* Copy info from the old map into the new map */
1759 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1760
1761 /* Initialize the rest of the new map */
1762 for (i = vs->vs_size; i < newdsize; i++)
1763 VSM_CLR(new_dmap[i]);
1764 }
1765 if (new_imap) {
1766 vs->vs_imap = new_imap;
1767 vs->vs_indirect = TRUE;
1768 } else
1769 vs->vs_dmap = new_dmap;
1770 bs_commit(new_size - vs->vs_size);
1771 vs->vs_size = new_size;
1772 if (old_map)
1773 kfree(old_map, old_map_size);
1774 return 0;
1775 }
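/*
 * Growth sketch for ps_map_extend: while INDIRECT_CLMAP(new_size) is
 * false the map remains a single direct array of vs_map entries; once
 * the object crosses that threshold the old direct array is copied into
 * the first indirect page and vs_imap takes over, after which lookups
 * use the two-level walk performed by ps_clmap below.
 */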
1776
1777 vm_offset_t
1778 ps_clmap(
1779 vstruct_t vs,
1780 vm_offset_t offset,
1781 struct clmap *clmap,
1782 int flag,
1783 vm_size_t size,
1784 int error)
1785 {
1786 vm_offset_t cluster; /* The cluster of offset. */
1787 vm_offset_t newcl; /* The new cluster allocated. */
1788 vm_offset_t newoff;
1789 unsigned int i;
1790 struct vs_map *vsmap;
1791
1792 VS_MAP_LOCK(vs);
1793
1794 ASSERT(vs->vs_dmap);
1795 cluster = atop_32(offset) >> vs->vs_clshift;
1796
1797 /*
1798 * Initialize cluster error value
1799 */
1800 clmap->cl_error = 0;
1801
1802 /*
1803 * If the object has grown, extend the page map.
1804 */
1805 if (cluster >= vs->vs_size) {
1806 if (flag == CL_FIND) {
1807 /* Do not allocate if just doing a lookup */
1808 VS_MAP_UNLOCK(vs);
1809 return (vm_offset_t) -1;
1810 }
1811 if (ps_map_extend(vs, cluster + 1)) {
1812 VS_MAP_UNLOCK(vs);
1813 return (vm_offset_t) -1;
1814 }
1815 }
1816
1817 /*
1818 * Look for the desired cluster. If the map is indirect, then we
1819 * have a two level lookup. First find the indirect block, then
1820 * find the actual cluster. If the indirect block has not yet
1821 * been allocated, then do so. If the cluster has not yet been
1822 * allocated, then do so.
1823 *
1824 * If any of the allocations fail, then return an error.
1825 * Don't allocate if just doing a lookup.
1826 */
1827 if (vs->vs_indirect) {
1828 long ind_block = cluster/CLMAP_ENTRIES;
1829
1830 /* Is the indirect block allocated? */
1831 vsmap = vs->vs_imap[ind_block];
1832 if (vsmap == NULL) {
1833 if (flag == CL_FIND) {
1834 VS_MAP_UNLOCK(vs);
1835 return (vm_offset_t) -1;
1836 }
1837
1838 /* Allocate the indirect block */
1839 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1840 if (vsmap == NULL) {
1841 VS_MAP_UNLOCK(vs);
1842 return (vm_offset_t) -1;
1843 }
1844 /* Initialize the cluster offsets */
1845 for (i = 0; i < CLMAP_ENTRIES; i++)
1846 VSM_CLR(vsmap[i]);
1847 vs->vs_imap[ind_block] = vsmap;
1848 }
1849 } else
1850 vsmap = vs->vs_dmap;
1851
1852 ASSERT(vsmap);
1853 vsmap += cluster%CLMAP_ENTRIES;
1854
1855 /*
1856 * At this point, vsmap points to the struct vs_map desired.
1857 *
1858 * Look in the map for the cluster, if there was an error on a
1859 * previous write, flag it and return. If it is not yet
1860 * allocated, then allocate it, if we're writing; if we're
1861 * doing a lookup and the cluster's not allocated, return error.
1862 */
1863 if (VSM_ISERR(*vsmap)) {
1864 clmap->cl_error = VSM_GETERR(*vsmap);
1865 VS_MAP_UNLOCK(vs);
1866 return (vm_offset_t) -1;
1867 } else if (VSM_ISCLR(*vsmap)) {
1868 int psindex;
1869
1870 if (flag == CL_FIND) {
1871 /*
1872 * If there's an error and the entry is clear, then
1873 * we've run out of swap space. Record the error
1874 * here and return.
1875 */
1876 if (error) {
1877 VSM_SETERR(*vsmap, error);
1878 }
1879 VS_MAP_UNLOCK(vs);
1880 return (vm_offset_t) -1;
1881 } else {
1882 /*
1883 * Attempt to allocate a cluster from the paging segment
1884 */
1885 newcl = ps_allocate_cluster(vs, &psindex,
1886 PAGING_SEGMENT_NULL);
1887 if (newcl == (vm_offset_t) -1) {
1888 VS_MAP_UNLOCK(vs);
1889 return (vm_offset_t) -1;
1890 }
1891 VSM_CLR(*vsmap);
1892 VSM_SETCLOFF(*vsmap, newcl);
1893 VSM_SETPS(*vsmap, psindex);
1894 }
1895 } else
1896 newcl = VSM_CLOFF(*vsmap);
1897
1898 /*
1899 * Fill in pertinent fields of the clmap
1900 */
1901 clmap->cl_ps = VSM_PS(*vsmap);
1902 clmap->cl_numpages = VSCLSIZE(vs);
1903 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1904
1905 /*
1906 * Byte offset in paging segment is byte offset to cluster plus
1907 * byte offset within cluster. It looks ugly, but should be
1908 * relatively quick.
1909 */
1910 ASSERT(trunc_page(offset) == offset);
1911 newcl = ptoa_32(newcl) << vs->vs_clshift;
1912 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1913 if (flag == CL_ALLOC) {
1914 /*
1915 * set bits in the allocation bitmap according to which
1916 * pages were requested. size is in bytes.
1917 */
1918 i = atop_32(newoff);
1919 while ((size > 0) && (i < VSCLSIZE(vs))) {
1920 VSM_SETALLOC(*vsmap, i);
1921 i++;
1922 size -= vm_page_size;
1923 }
1924 }
1925 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1926 if (newoff) {
1927 /*
1928 * Offset is not cluster aligned, so number of pages
1929 * and bitmaps must be adjusted
1930 */
1931 clmap->cl_numpages -= atop_32(newoff);
1932 CLMAP_SHIFT(clmap, vs);
1933 CLMAP_SHIFTALLOC(clmap, vs);
1934 }
1935
1936 /*
1937 *
1938 * The setting of valid bits and handling of write errors
1939 * must be done here, while we hold the lock on the map.
1940 * It logically should be done in ps_vs_write_complete().
1941 * The size and error information has been passed from
1942 * ps_vs_write_complete(). If the size parameter is non-zero,
1943 * then there is work to be done. If error is also non-zero,
1944 * then the error number is recorded in the cluster and the
1945 * entire cluster is in error.
1946 */
1947 if (size && flag == CL_FIND) {
1948 vm_offset_t off = (vm_offset_t) 0;
1949
1950 if (!error) {
1951 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1952 i++) {
1953 VSM_SETPG(*vsmap, i);
1954 size -= vm_page_size;
1955 }
1956 ASSERT(i <= VSCLSIZE(vs));
1957 } else {
1958 BS_STAT(clmap->cl_ps->ps_bs,
1959 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1960 atop_32(size));
1961 off = VSM_CLOFF(*vsmap);
1962 VSM_SETERR(*vsmap, error);
1963 }
1964 /*
1965 * Deallocate cluster if error, and no valid pages
1966 * already present.
1967 */
1968 if (off != (vm_offset_t) 0)
1969 ps_deallocate_cluster(clmap->cl_ps, off);
1970 VS_MAP_UNLOCK(vs);
1971 return (vm_offset_t) 0;
1972 } else
1973 VS_MAP_UNLOCK(vs);
1974
1975 DP_DEBUG(DEBUG_VS_INTERNAL,
1976 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1977 newcl+newoff, (int) vs, (int) vsmap, flag));
1978 DP_DEBUG(DEBUG_VS_INTERNAL,
1979 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1980 (int) clmap->cl_ps, clmap->cl_numpages,
1981 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1982
1983 return (newcl + newoff);
1984 }
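/*
 * Worked example of the offset math in ps_clmap (assuming 4 KB pages,
 * vm_page_shift == 12, and vs_clshift == 2): offset 0x15000 is page
 * 0x15, so cluster == 0x15 >> 2 == 5; newoff == 0x15000 & ((1 << 14) - 1)
 * == 0x1000, i.e. the second page within that 16 KB cluster.
 */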
1985
1986 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1987
1988 void
1989 ps_clunmap(
1990 vstruct_t vs,
1991 vm_offset_t offset,
1992 vm_size_t length)
1993 {
1994 vm_offset_t cluster; /* The cluster number of offset */
1995 struct vs_map *vsmap;
1996
1997 VS_MAP_LOCK(vs);
1998
1999 /*
2000 * Loop through all clusters in this range, freeing paging segment
2001 * clusters and map entries as encountered.
2002 */
2003 while (length > 0) {
2004 vm_offset_t newoff;
2005 unsigned int i;
2006
2007 cluster = atop_32(offset) >> vs->vs_clshift;
2008 if (vs->vs_indirect) /* indirect map */
2009 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2010 else
2011 vsmap = vs->vs_dmap;
2012 if (vsmap == NULL) {
2013 VS_MAP_UNLOCK(vs);
2014 return;
2015 }
2016 vsmap += cluster%CLMAP_ENTRIES;
2017 if (VSM_ISCLR(*vsmap)) {
2018 length -= vm_page_size;
2019 offset += vm_page_size;
2020 continue;
2021 }
2022 /*
2023 * We've got a valid mapping. Clear it and deallocate
2024 * paging segment cluster pages.
2025 * Optimize for clearing an entire cluster.
2026 */
2027 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2028 /*
2029 * Not cluster aligned.
2030 */
2031 ASSERT(trunc_page(newoff) == newoff);
2032 i = atop_32(newoff);
2033 } else
2034 i = 0;
2035 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2036 VSM_CLRPG(*vsmap, i);
2037 VSM_CLRALLOC(*vsmap, i);
2038 length -= vm_page_size;
2039 offset += vm_page_size;
2040 i++;
2041 }
2042
2043 /*
2044 * If map entry is empty, clear and deallocate cluster.
2045 */
2046 if (!VSM_ALLOC(*vsmap)) {
2047 ps_deallocate_cluster(VSM_PS(*vsmap),
2048 VSM_CLOFF(*vsmap));
2049 VSM_CLR(*vsmap);
2050 }
2051 }
2052
2053 VS_MAP_UNLOCK(vs);
2054 }
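#if 0
/*
 * Illustrative sketch only (not part of the original source): how the
 * cluster read/write paths typically pair ps_clmap() with ps_clunmap().
 * The vstruct and the offsets used here are hypothetical.
 */
static void
ps_clmap_usage_sketch(
	vstruct_t vs)
{
	struct clmap clmap;
	vm_offset_t pseg_offset;

	/* look up (CL_FIND, no allocation) the paging segment offset
	   backing the first cluster of this object */
	pseg_offset = ps_clmap(vs, (vm_offset_t) 0, &clmap, CL_FIND, 0, 0);
	if (pseg_offset != (vm_offset_t) -1) {
		/* ... issue the read against CLMAP_PS(clmap) at pseg_offset ... */
	}

	/* release one cluster's worth of backing store when done with it */
	ps_clunmap(vs, (vm_offset_t) 0, vm_page_size << vs->vs_clshift);
}
#endif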
2055
2056 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2057
2058 void
2059 ps_vs_write_complete(
2060 vstruct_t vs,
2061 vm_offset_t offset,
2062 vm_size_t size,
2063 int error)
2064 {
2065 struct clmap clmap;
2066
2067 /*
2068 * Get the struct vsmap for this cluster.
2069 * Use CL_FIND (the read/lookup mode), even though the cluster
2070 * was just written, because the cluster MUST already be present,
2071 * unless there was an error in the original ps_clmap (e.g. no
2072 * space), in which case nothing happens.
2073 *
2074 * Must pass enough information to ps_clmap to allow it
2075 * to set the vs_map structure bitmap under lock.
2076 */
2077 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2078 }
2079
2080 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2081
2082 void
2083 vs_cl_write_complete(
2084 vstruct_t vs,
2085 __unused paging_segment_t ps,
2086 vm_offset_t offset,
2087 __unused vm_offset_t addr,
2088 vm_size_t size,
2089 boolean_t async,
2090 int error)
2091 {
2092 // kern_return_t kr;
2093
2094 if (error) {
2095 /*
2096 * For internal objects, the error is recorded on a
2097 * per-cluster basis by ps_clmap() which is called
2098 * by ps_vs_write_complete() below.
2099 */
2100 dprintf(("write failed error = 0x%x\n", error));
2101 /* add upl_abort code here */
2102 } else
2103 GSTAT(global_stats.gs_pages_out += atop_32(size));
2104 /*
2105 * Notify the vstruct mapping code, so it can do its accounting.
2106 */
2107 ps_vs_write_complete(vs, offset, size, error);
2108
2109 if (async) {
2110 VS_LOCK(vs);
2111 ASSERT(vs->vs_async_pending > 0);
2112 vs->vs_async_pending -= size;
2113 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2114 vs->vs_waiting_async = FALSE;
2115 VS_UNLOCK(vs);
2116 /* mutex_unlock(&vs->vs_waiting_async); */
2117 thread_wakeup(&vs->vs_async_pending);
2118 } else {
2119 VS_UNLOCK(vs);
2120 }
2121 }
2122 }
2123
2124 #ifdef DEVICE_PAGING
2125 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2126
2127 kern_return_t
2128 device_write_reply(
2129 MACH_PORT_FACE reply_port,
2130 kern_return_t device_code,
2131 io_buf_len_t bytes_written)
2132 {
2133 struct vs_async *vsa;
2134
2135 vsa = (struct vs_async *)
2136 ((struct vstruct_alias *)(reply_port->alias))->vs;
2137
2138 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2139 device_code = KERN_FAILURE;
2140 }
2141
2142 vsa->vsa_error = device_code;
2143
2144
2145 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2146 if(vsa->vsa_flags & VSA_TRANSFER) {
2147 /* revisit when async disk segments redone */
2148 if(vsa->vsa_error) {
2149 /* need to consider error condition. re-write data or */
2150 /* throw it away here. */
2151 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2152 }
2153 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2154 vsa->vsa_size, vsa->vsa_error);
2155 } else {
2156 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2157 vsa->vsa_addr, vsa->vsa_size, TRUE,
2158 vsa->vsa_error);
2159 }
2160 VS_FREE_ASYNC(vsa);
2161
2162 return KERN_SUCCESS;
2163 }
2164
2165 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2166 kern_return_t
2167 device_write_reply_inband(
2168 MACH_PORT_FACE reply_port,
2169 kern_return_t return_code,
2170 io_buf_len_t bytes_written)
2171 {
2172 panic("device_write_reply_inband: illegal");
2173 return KERN_SUCCESS;
2174 }
2175
2176 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2177 kern_return_t
2178 device_read_reply(
2179 MACH_PORT_FACE reply_port,
2180 kern_return_t return_code,
2181 io_buf_ptr_t data,
2182 mach_msg_type_number_t dataCnt)
2183 {
2184 struct vs_async *vsa;
2185 vsa = (struct vs_async *)
2186 ((struct vstruct_alias *)(reply_port->alias))->vs;
2187 vsa->vsa_addr = (vm_offset_t)data;
2188 vsa->vsa_size = (vm_size_t)dataCnt;
2189 vsa->vsa_error = return_code;
2190 thread_wakeup(&vsa->vsa_lock);
2191 return KERN_SUCCESS;
2192 }
2193
2194 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2195 kern_return_t
2196 device_read_reply_inband(
2197 MACH_PORT_FACE reply_port,
2198 kern_return_t return_code,
2199 io_buf_ptr_inband_t data,
2200 mach_msg_type_number_t dataCnt)
2201 {
2202 panic("device_read_reply_inband: illegal");
2203 return KERN_SUCCESS;
2204 }
2205
2206 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2207 kern_return_t
2208 device_read_reply_overwrite(
2209 MACH_PORT_FACE reply_port,
2210 kern_return_t return_code,
2211 io_buf_len_t bytes_read)
2212 {
2213 panic("device_read_reply_overwrite: illegal\n");
2214 return KERN_SUCCESS;
2215 }
2216
2217 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2218 kern_return_t
2219 device_open_reply(
2220 MACH_PORT_FACE reply_port,
2221 kern_return_t return_code,
2222 MACH_PORT_FACE device_port)
2223 {
2224 panic("device_open_reply: illegal\n");
2225 return KERN_SUCCESS;
2226 }
2227
2228 kern_return_t
2229 ps_read_device(
2230 paging_segment_t ps,
2231 vm_offset_t offset,
2232 vm_offset_t *bufferp,
2233 unsigned int size,
2234 unsigned int *residualp,
2235 int flags)
2236 {
2237 kern_return_t kr;
2238 recnum_t dev_offset;
2239 unsigned int bytes_wanted;
2240 unsigned int bytes_read;
2241 unsigned int total_read;
2242 vm_offset_t dev_buffer;
2243 vm_offset_t buf_ptr;
2244 unsigned int records_read;
2245 struct vs_async *vsa;
2246 mutex_t vs_waiting_read_reply;
2247
2248 device_t device;
2249 vm_map_copy_t device_data = NULL;
2250 default_pager_thread_t *dpt = NULL;
2251
2252 device = dev_port_lookup(ps->ps_device);
2253 clustered_reads[atop_32(size)]++;
2254
2255 dev_offset = (ps->ps_offset +
2256 (offset >> (vm_page_shift - ps->ps_record_shift)));
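/*
 * Illustrative arithmetic (hypothetical values, not from the original
 * source): with 4K pages (vm_page_shift == 12) and 512-byte device
 * records (ps_record_shift == 3), the shift above divides the byte
 * offset by 512, so a byte offset of 0x6000 (24KB) into the segment
 * becomes record 48 and dev_offset is ps_offset + 48.
 */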
2257 bytes_wanted = size;
2258 total_read = 0;
2259 *bufferp = (vm_offset_t)NULL;
2260
2261 do {
2262 vsa = VS_ALLOC_ASYNC();
2263 if (vsa) {
2264 vsa->vsa_vs = NULL;
2265 vsa->vsa_addr = 0;
2266 vsa->vsa_offset = 0;
2267 vsa->vsa_size = 0;
2268 vsa->vsa_ps = NULL;
2269 }
2270 mutex_init(&vsa->vsa_lock, 0);
2271 ip_lock(vsa->reply_port);
2272 vsa->reply_port->ip_sorights++;
2273 ip_reference(vsa->reply_port);
2274 ip_unlock(vsa->reply_port);
2275 kr = ds_device_read_common(device,
2276 vsa->reply_port,
2277 (mach_msg_type_name_t)
2278 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2279 (dev_mode_t) 0,
2280 dev_offset,
2281 bytes_wanted,
2282 (IO_READ | IO_CALL),
2283 (io_buf_ptr_t *) &dev_buffer,
2284 (mach_msg_type_number_t *) &bytes_read);
2285 if(kr == MIG_NO_REPLY) {
2286 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2287 thread_block(THREAD_CONTINUE_NULL);
2288
2289 dev_buffer = vsa->vsa_addr;
2290 bytes_read = (unsigned int)vsa->vsa_size;
2291 kr = vsa->vsa_error;
2292 }
2293 VS_FREE_ASYNC(vsa);
2294 if (kr != KERN_SUCCESS || bytes_read == 0) {
2295 break;
2296 }
2297 total_read += bytes_read;
2298
2299 /*
2300 * If we got the entire range, use the returned dev_buffer.
2301 */
2302 if (bytes_read == size) {
2303 *bufferp = (vm_offset_t)dev_buffer;
2304 break;
2305 }
2306
2307 #if 1
2308 dprintf(("read only %d bytes out of %d\n",
2309 bytes_read, bytes_wanted));
2310 #endif
2311 if(dpt == NULL) {
2312 dpt = get_read_buffer();
2313 buf_ptr = dpt->dpt_buffer;
2314 *bufferp = (vm_offset_t)buf_ptr;
2315 }
2316 /*
2317 * Otherwise, copy the data into the provided buffer (*bufferp)
2318 * and append the rest of the range as it comes in.
2319 */
2320 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2321 buf_ptr += bytes_read;
2322 bytes_wanted -= bytes_read;
2323 records_read = (bytes_read >>
2324 (vm_page_shift - ps->ps_record_shift));
2325 dev_offset += records_read;
2326 DP_DEBUG(DEBUG_VS_INTERNAL,
2327 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2328 dev_buffer, bytes_read));
2329 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2330 != KERN_SUCCESS)
2331 Panic("dealloc buf");
2332 } while (bytes_wanted);
2333
2334 *residualp = size - total_read;
2335 if((dev_buffer != *bufferp) && (total_read != 0)) {
2336 vm_offset_t temp_buffer;
2337 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2338 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2339 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2340 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2341 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2342 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2343 (vm_map_copy_t *)&device_data, FALSE))
2344 panic("ps_read_device: cannot copyin locally provided buffer\n");
2345 }
2346 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2347 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2348 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2349 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2350 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2351 (vm_map_copy_t *)&device_data, FALSE))
2352 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2353 }
2354 else {
2355 device_data = NULL;
2356 }
2357 *bufferp = (vm_offset_t)device_data;
2358
2359 if(dpt != NULL) {
2360 /* Free the receive buffer */
2361 dpt->checked_out = 0;
2362 thread_wakeup(&dpt_array);
2363 }
2364 return KERN_SUCCESS;
2365 }
2366
2367 kern_return_t
2368 ps_write_device(
2369 paging_segment_t ps,
2370 vm_offset_t offset,
2371 vm_offset_t addr,
2372 unsigned int size,
2373 struct vs_async *vsa)
2374 {
2375 recnum_t dev_offset;
2376 io_buf_len_t bytes_to_write, bytes_written;
2377 recnum_t records_written;
2378 kern_return_t kr;
2379 MACH_PORT_FACE reply_port;
2380
2381
2382
2383 clustered_writes[atop_32(size)]++;
2384
2385 dev_offset = (ps->ps_offset +
2386 (offset >> (vm_page_shift - ps->ps_record_shift)));
2387 bytes_to_write = size;
2388
2389 if (vsa) {
2390 /*
2391 * Asynchronous write.
2392 */
2393 reply_port = vsa->reply_port;
2394 ip_lock(reply_port);
2395 reply_port->ip_sorights++;
2396 ip_reference(reply_port);
2397 ip_unlock(reply_port);
2398 {
2399 device_t device;
2400 device = dev_port_lookup(ps->ps_device);
2401
2402 vsa->vsa_addr = addr;
2403 kr=ds_device_write_common(device,
2404 reply_port,
2405 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2406 (dev_mode_t) 0,
2407 dev_offset,
2408 (io_buf_ptr_t) addr,
2409 size,
2410 (IO_WRITE | IO_CALL),
2411 &bytes_written);
2412 }
2413 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2414 if (verbose)
2415 dprintf(("%s0x%x, addr=0x%x,"
2416 "size=0x%x,offset=0x%x\n",
2417 "device_write_request returned ",
2418 kr, addr, size, offset));
2419 BS_STAT(ps->ps_bs,
2420 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2421 /* do the completion notification to free resources */
2422 device_write_reply(reply_port, kr, 0);
2423 return PAGER_ERROR;
2424 }
2425 } else do {
2426 /*
2427 * Synchronous write.
2428 */
2429 {
2430 device_t device;
2431 device = dev_port_lookup(ps->ps_device);
2432 kr=ds_device_write_common(device,
2433 IP_NULL, 0,
2434 (dev_mode_t) 0,
2435 dev_offset,
2436 (io_buf_ptr_t) addr,
2437 size,
2438 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2439 &bytes_written);
2440 }
2441 if (kr != KERN_SUCCESS) {
2442 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2443 "device_write returned ",
2444 kr, addr, size, offset));
2445 BS_STAT(ps->ps_bs,
2446 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2447 return PAGER_ERROR;
2448 }
2449 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2450 Panic("fragmented write");
2451 records_written = (bytes_written >>
2452 (vm_page_shift - ps->ps_record_shift));
2453 dev_offset += records_written;
2454 #if 1
2455 if (bytes_written != bytes_to_write) {
2456 dprintf(("wrote only %d bytes out of %d\n",
2457 bytes_written, bytes_to_write));
2458 }
2459 #endif
2460 bytes_to_write -= bytes_written;
2461 addr += bytes_written;
2462 } while (bytes_to_write > 0);
2463
2464 return PAGER_SUCCESS;
2465 }
2466
2467
2468 #else /* !DEVICE_PAGING */
2469
2470 kern_return_t
2471 ps_read_device(
2472 __unused paging_segment_t ps,
2473 __unused vm_offset_t offset,
2474 __unused vm_offset_t *bufferp,
2475 __unused unsigned int size,
2476 __unused unsigned int *residualp,
2477 __unused int flags)
2478 {
2479 panic("ps_read_device not supported");
2480 }
2481
2482 kern_return_t
2483 ps_write_device(
2484 __unused paging_segment_t ps,
2485 __unused vm_offset_t offset,
2486 __unused vm_offset_t addr,
2487 __unused unsigned int size,
2488 __unused struct vs_async *vsa)
2489 {
2490 panic("ps_write_device not supported");
2491 }
2492
2493 #endif /* DEVICE_PAGING */
2494 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2495
2496 void
2497 pvs_object_data_provided(
2498 __unused vstruct_t vs,
2499 __unused upl_t upl,
2500 __unused upl_offset_t offset,
2501 upl_size_t size)
2502 {
2503
2504 DP_DEBUG(DEBUG_VS_INTERNAL,
2505 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2506 upl, offset, size));
2507
2508 ASSERT(size > 0);
2509 GSTAT(global_stats.gs_pages_in += atop_32(size));
2510
2511
2512 #if USE_PRECIOUS
2513 ps_clunmap(vs, offset, size);
2514 #endif /* USE_PRECIOUS */
2515
2516 }
2517
2518 kern_return_t
2519 pvs_cluster_read(
2520 vstruct_t vs,
2521 vm_offset_t vs_offset,
2522 vm_size_t cnt)
2523 {
2524 upl_t upl;
2525 kern_return_t error = KERN_SUCCESS;
2526 int size;
2527 int residual;
2528 unsigned int request_flags;
2529 int seg_index;
2530 int pages_in_cl;
2531 int cl_size;
2532 int cl_mask;
2533 int cl_index;
2534 int xfer_size;
2535 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2536 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2537 struct clmap clmap;
2538
2539 pages_in_cl = 1 << vs->vs_clshift;
2540 cl_size = pages_in_cl * vm_page_size;
2541 cl_mask = cl_size - 1;
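/*
 * For illustration (hypothetical values): with 4K pages and
 * vs_clshift == 2, pages_in_cl == 4, cl_size == 0x4000 (16K) and
 * cl_mask == 0x3FFF, so "vs_offset & cl_mask" below is the byte
 * offset within the current 16K cluster.
 */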
2542
2543 /*
2544 * This loop will be executed multiple times until the entire
2545 * request has been satisfied... if the request spans cluster
2546 * boundaries, the clusters will be checked for logical continuity;
2547 * if contiguous, the I/O request will span multiple clusters, otherwise
2548 * it will be broken up into the minimal set of I/Os.
2549 *
2550 * If there are holes in a request (either unallocated pages in a paging
2551 * segment or an unallocated paging segment), we stop
2552 * reading at the hole, inform the VM of any data read, inform
2553 * the VM of an unavailable range, then loop again, hoping to
2554 * find valid pages later in the requested range. This continues until
2555 * the entire range has been examined, and read, if present.
2556 */
2557
2558 #if USE_PRECIOUS
2559 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2560 #else
2561 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2562 #endif
2563
2564 assert(dp_encryption_inited);
2565 if (dp_encryption) {
2566 /*
2567 * ENCRYPTED SWAP:
2568 * request that the UPL be prepared for
2569 * decryption.
2570 */
2571 request_flags |= UPL_ENCRYPT;
2572 }
2573
2574 while (cnt && (error == KERN_SUCCESS)) {
2575 int ps_info_valid;
2576 int page_list_count;
2577
2578 if((vs_offset & cl_mask) &&
2579 (cnt > (VM_SUPER_CLUSTER -
2580 (vs_offset & cl_mask)))) {
2581 size = VM_SUPER_CLUSTER;
2582 size -= vs_offset & cl_mask;
2583 } else if (cnt > VM_SUPER_CLUSTER) {
2584 size = VM_SUPER_CLUSTER;
2585 } else {
2586 size = cnt;
2587 }
2588 cnt -= size;
2589
2590 ps_info_valid = 0;
2591 seg_index = 0;
2592
2593 while (size > 0 && error == KERN_SUCCESS) {
2594 int abort_size;
2595 int failed_size;
2596 int beg_pseg;
2597 int beg_indx;
2598 vm_offset_t cur_offset;
2599
2600
2601 if ( !ps_info_valid) {
2602 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2603 psp[seg_index] = CLMAP_PS(clmap);
2604 ps_info_valid = 1;
2605 }
2606 /*
2607 * skip over unallocated physical segments
2608 */
2609 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2610 abort_size = cl_size - (vs_offset & cl_mask);
2611 abort_size = MIN(abort_size, size);
2612
2613 page_list_count = 0;
2614 memory_object_super_upl_request(
2615 vs->vs_control,
2616 (memory_object_offset_t)vs_offset,
2617 abort_size, abort_size,
2618 &upl, NULL, &page_list_count,
2619 request_flags);
2620
2621 if (clmap.cl_error) {
2622 upl_abort(upl, UPL_ABORT_ERROR);
2623 } else {
2624 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2625 }
2626 upl_deallocate(upl);
2627
2628 size -= abort_size;
2629 vs_offset += abort_size;
2630
2631 seg_index++;
2632 ps_info_valid = 0;
2633 continue;
2634 }
2635 cl_index = (vs_offset & cl_mask) / vm_page_size;
2636
2637 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2638 /*
2639 * skip over unallocated pages
2640 */
2641 if (CLMAP_ISSET(clmap, cl_index))
2642 break;
2643 abort_size += vm_page_size;
2644 }
2645 if (abort_size) {
2646 /*
2647 * Let VM system know about holes in clusters.
2648 */
2649 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2650
2651 page_list_count = 0;
2652 memory_object_super_upl_request(
2653 vs->vs_control,
2654 (memory_object_offset_t)vs_offset,
2655 abort_size, abort_size,
2656 &upl, NULL, &page_list_count,
2657 request_flags);
2658
2659 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2660 upl_deallocate(upl);
2661
2662 size -= abort_size;
2663 vs_offset += abort_size;
2664
2665 if (cl_index == pages_in_cl) {
2666 /*
2667 * if we're at the end of this physical cluster
2668 * then bump to the next one and continue looking
2669 */
2670 seg_index++;
2671 ps_info_valid = 0;
2672 continue;
2673 }
2674 if (size == 0)
2675 break;
2676 }
2677 /*
2678 * remember the starting point of the first allocated page
2679 * for the I/O we're about to issue
2680 */
2681 beg_pseg = seg_index;
2682 beg_indx = cl_index;
2683 cur_offset = vs_offset;
2684
2685 /*
2686 * calculate the size of the I/O that we can do...
2687 * this may span multiple physical segments if
2688 * they are contiguous
2689 */
2690 for (xfer_size = 0; xfer_size < size; ) {
2691
2692 while (cl_index < pages_in_cl
2693 && xfer_size < size) {
2694 /*
2695 * accumulate allocated pages within
2696 * a physical segment
2697 */
2698 if (CLMAP_ISSET(clmap, cl_index)) {
2699 xfer_size += vm_page_size;
2700 cur_offset += vm_page_size;
2701 cl_index++;
2702
2703 BS_STAT(psp[seg_index]->ps_bs,
2704 psp[seg_index]->ps_bs->bs_pages_in++);
2705 } else
2706 break;
2707 }
2708 if (cl_index < pages_in_cl
2709 || xfer_size >= size) {
2710 /*
2711 * we've hit an unallocated page or
2712 * the end of this request... go fire
2713 * the I/O
2714 */
2715 break;
2716 }
2717 /*
2718 * we've hit the end of the current physical
2719 * segment and there's more to do, so try
2720 * moving to the next one
2721 */
2722 seg_index++;
2723
2724 ps_offset[seg_index] =
2725 ps_clmap(vs,
2726 cur_offset & ~cl_mask,
2727 &clmap, CL_FIND, 0, 0);
2728 psp[seg_index] = CLMAP_PS(clmap);
2729 ps_info_valid = 1;
2730
2731 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2732 /*
2733 * if the physical segment we're about
2734 * to step into is not contiguous to
2735 * the one we're currently in, or it's
2736 * in a different paging file, or
2737 * it hasn't been allocated....
2738 * we stop here and generate the I/O
2739 */
2740 break;
2741 }
2742 /*
2743 * start with first page of the next physical
2744 * segment
2745 */
2746 cl_index = 0;
2747 }
2748 if (xfer_size) {
2749 /*
2750 * we have a contiguous range of allocated pages
2751 * to read from
2752 */
2753 page_list_count = 0;
2754 memory_object_super_upl_request(vs->vs_control,
2755 (memory_object_offset_t)vs_offset,
2756 xfer_size, xfer_size,
2757 &upl, NULL, &page_list_count,
2758 request_flags | UPL_SET_INTERNAL);
2759
2760 error = ps_read_file(psp[beg_pseg],
2761 upl, (upl_offset_t) 0,
2762 ps_offset[beg_pseg] +
2763 (beg_indx * vm_page_size),
2764 xfer_size, &residual, 0);
2765 } else
2766 continue;
2767
2768 failed_size = 0;
2769
2770 /*
2771 * Adjust counts and send response to VM. Optimize
2772 * for the common case, i.e. no error and no partial
2773 * data. If there was an error, then we need to error
2774 * the entire range, even if some data was successfully
2775 * read. If there was a partial read we may supply some
2776 * data and may error some as well. In all cases the
2777 * VM must receive some notification for every page
2778 * in the range.
2779 */
2780 if ((error == KERN_SUCCESS) && (residual == 0)) {
2781 /*
2782 * Got everything we asked for, supply the data
2783 * to the VM. Note that as a side effect of
2784 * supplying the data, the buffer holding the
2785 * supplied data is deallocated from the pager's
2786 * address space.
2787 */
2788 pvs_object_data_provided(
2789 vs, upl, vs_offset, xfer_size);
2790 } else {
2791 failed_size = xfer_size;
2792
2793 if (error == KERN_SUCCESS) {
2794 if (residual == xfer_size) {
2795 /*
2796 * If a read operation returns no error
2797 * and no data moved, we turn it into
2798 * an error, assuming we're reading at
2799 * or beyond EOF.
2800 * Fall through and error the entire
2801 * range.
2802 */
2803 error = KERN_FAILURE;
2804 } else {
2805 /*
2806 * Otherwise, we have a partial read. If
2807 * the part read is an integral number
2808 * of pages, supply it. Otherwise round
2809 * it up to a page boundary, zero fill
2810 * the unread part, and supply it.
2811 * Fall through and error the remainder
2812 * of the range, if any.
2813 */
2814 int fill, lsize;
2815
2816 fill = residual
2817 & ~vm_page_size;
2818 lsize = (xfer_size - residual)
2819 + fill;
2820 pvs_object_data_provided(
2821 vs, upl,
2822 vs_offset, lsize);
2823
2824 if (lsize < xfer_size) {
2825 failed_size =
2826 xfer_size - lsize;
2827 error = KERN_FAILURE;
2828 }
2829 }
2830 }
2831 }
2832 /*
2833 * If there was an error in any part of the range, tell
2834 * the VM. Note that error is explicitly checked again
2835 * since it can be modified above.
2836 */
2837 if (error != KERN_SUCCESS) {
2838 BS_STAT(psp[beg_pseg]->ps_bs,
2839 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2840 += atop_32(failed_size));
2841 }
2842 size -= xfer_size;
2843 vs_offset += xfer_size;
2844 }
2845
2846 } /* END while (cnt && (error == 0)) */
2847 return error;
2848 }
2849
2850 int vs_do_async_write = 1;
2851
2852 kern_return_t
2853 vs_cluster_write(
2854 vstruct_t vs,
2855 upl_t internal_upl,
2856 upl_offset_t offset,
2857 upl_size_t cnt,
2858 boolean_t dp_internal,
2859 int flags)
2860 {
2861 upl_size_t transfer_size;
2862 int error = 0;
2863 struct clmap clmap;
2864
2865 vm_offset_t actual_offset; /* Offset within paging segment */
2866 paging_segment_t ps;
2867 vm_offset_t mobj_base_addr;
2868 vm_offset_t mobj_target_addr;
2869
2870 upl_t upl;
2871 upl_page_info_t *pl;
2872 int page_index;
2873 int list_size;
2874 int pages_in_cl;
2875 unsigned int cl_size;
2876 int base_index;
2877 unsigned int seg_size;
2878
2879 pages_in_cl = 1 << vs->vs_clshift;
2880 cl_size = pages_in_cl * vm_page_size;
2881
2882 if (!dp_internal) {
2883 int page_list_count;
2884 int request_flags;
2885 unsigned int super_size;
2886 int first_dirty;
2887 int num_dirty;
2888 int num_of_pages;
2889 int seg_index;
2890 upl_offset_t upl_offset;
2891 vm_offset_t seg_offset;
2892 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2893 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2894
2895
2896 if (bs_low) {
2897 super_size = cl_size;
2898
2899 request_flags = UPL_NOBLOCK |
2900 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2901 UPL_NO_SYNC | UPL_SET_INTERNAL;
2902 } else {
2903 super_size = VM_SUPER_CLUSTER;
2904
2905 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2906 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2907 UPL_NO_SYNC | UPL_SET_INTERNAL;
2908 }
2909
2910 if (!dp_encryption_inited) {
2911 /*
2912 * ENCRYPTED SWAP:
2913 * Once we've started using swap, we
2914 * can't change our mind on whether
2915 * it needs to be encrypted or
2916 * not.
2917 */
2918 dp_encryption_inited = TRUE;
2919 }
2920 if (dp_encryption) {
2921 /*
2922 * ENCRYPTED SWAP:
2923 * request that the UPL be prepared for
2924 * encryption.
2925 */
2926 request_flags |= UPL_ENCRYPT;
2927 flags |= UPL_PAGING_ENCRYPTED;
2928 }
2929
2930 page_list_count = 0;
2931 memory_object_super_upl_request(vs->vs_control,
2932 (memory_object_offset_t)offset,
2933 cnt, super_size,
2934 &upl, NULL, &page_list_count,
2935 request_flags | UPL_FOR_PAGEOUT);
2936
2937 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2938
2939 seg_size = cl_size - (upl->offset % cl_size);
2940 upl_offset = upl->offset & ~(cl_size - 1);
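/*
 * Worked example with hypothetical values (assuming 16K clusters,
 * i.e. cl_size == 0x4000): if upl->offset == 0x1A000, then
 * upl->offset % cl_size == 0x2000, so seg_size == 0x2000 (the bytes
 * remaining in the first cluster) and upl_offset is rounded down to
 * the cluster boundary 0x18000 for the ps_clmap() lookups below.
 */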
2941
2942 for (seg_index = 0, transfer_size = upl->size;
2943 transfer_size > 0; ) {
2944 ps_offset[seg_index] =
2945 ps_clmap(vs,
2946 upl_offset,
2947 &clmap, CL_ALLOC,
2948 cl_size, 0);
2949
2950 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2951 upl_abort(upl, 0);
2952 upl_deallocate(upl);
2953
2954 return KERN_FAILURE;
2955
2956 }
2957 psp[seg_index] = CLMAP_PS(clmap);
2958
2959 if (transfer_size > seg_size) {
2960 transfer_size -= seg_size;
2961 upl_offset += cl_size;
2962 seg_size = cl_size;
2963 seg_index++;
2964 } else
2965 transfer_size = 0;
2966 }
2967 /*
2968 * Ignore any non-present pages at the end of the
2969 * UPL.
2970 */
2971 for (page_index = upl->size / vm_page_size; page_index > 0;)
2972 if (UPL_PAGE_PRESENT(pl, --page_index))
2973 break;
2974 num_of_pages = page_index + 1;
2975
2976 base_index = (upl->offset % cl_size) / PAGE_SIZE;
2977
2978 for (page_index = 0; page_index < num_of_pages; ) {
2979 /*
2980 * skip over non-dirty pages
2981 */
2982 for ( ; page_index < num_of_pages; page_index++) {
2983 if (UPL_DIRTY_PAGE(pl, page_index)
2984 || UPL_PRECIOUS_PAGE(pl, page_index))
2985 /*
2986 * this is a page we need to write
2987 * go see if we can buddy it up with
2988 * others that are contiguous to it
2989 */
2990 break;
2991 /*
2992 * if the page is not dirty, but present, we
2993 * need to commit it... This is an unusual
2994 * case since we only asked for dirty pages
2995 */
2996 if (UPL_PAGE_PRESENT(pl, page_index)) {
2997 boolean_t empty = FALSE;
2998 upl_commit_range(upl,
2999 page_index * vm_page_size,
3000 vm_page_size,
3001 UPL_COMMIT_NOTIFY_EMPTY,
3002 pl,
3003 page_list_count,
3004 &empty);
3005 if (empty) {
3006 assert(page_index ==
3007 num_of_pages - 1);
3008 upl_deallocate(upl);
3009 }
3010 }
3011 }
3012 if (page_index == num_of_pages)
3013 /*
3014 * no more pages to look at, we're out of here
3015 */
3016 break;
3017
3018 /*
3019 * gather up contiguous dirty pages... we have at
3020 * least 1, otherwise we would have bailed above.
3021 * make sure that each physical segment that we step
3022 * into is contiguous to the one we're currently in;
3023 * if it's not, we have to stop and write what we have
3024 */
3025 for (first_dirty = page_index;
3026 page_index < num_of_pages; ) {
3027 if ( !UPL_DIRTY_PAGE(pl, page_index)
3028 && !UPL_PRECIOUS_PAGE(pl, page_index))
3029 break;
3030 page_index++;
3031 /*
3032 * if we just looked at the last page in the UPL
3033 * we don't need to check for physical segment
3034 * continuity
3035 */
3036 if (page_index < num_of_pages) {
3037 int cur_seg;
3038 int nxt_seg;
3039
3040 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3041 nxt_seg = (base_index + page_index)/pages_in_cl;
3042
3043 if (cur_seg != nxt_seg) {
3044 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3045 /*
3046 * if the segment we're about
3047 * to step into is not
3048 * contiguous to the one we're
3049 * currently in, or it's in a
3050 * different paging file....
3051 * we stop here and generate
3052 * the I/O
3053 */
3054 break;
3055 }
3056 }
3057 }
3058 num_dirty = page_index - first_dirty;
3059
3060 if (num_dirty) {
3061 upl_offset = first_dirty * vm_page_size;
3062 transfer_size = num_dirty * vm_page_size;
3063
3064 while (transfer_size) {
3065
3066 if ((seg_size = cl_size -
3067 ((upl->offset + upl_offset) % cl_size))
3068 > transfer_size)
3069 seg_size = transfer_size;
3070
3071 ps_vs_write_complete(vs,
3072 upl->offset + upl_offset,
3073 seg_size, error);
3074
3075 transfer_size -= seg_size;
3076 upl_offset += seg_size;
3077 }
3078 upl_offset = first_dirty * vm_page_size;
3079 transfer_size = num_dirty * vm_page_size;
3080
3081 seg_index = (base_index + first_dirty) / pages_in_cl;
3082 seg_offset = (upl->offset + upl_offset) % cl_size;
3083
3084 error = ps_write_file(psp[seg_index],
3085 upl, upl_offset,
3086 ps_offset[seg_index]
3087 + seg_offset,
3088 transfer_size, flags);
3089 } else {
3090 boolean_t empty = FALSE;
3091 upl_abort_range(upl,
3092 first_dirty * vm_page_size,
3093 num_dirty * vm_page_size,
3094 UPL_ABORT_NOTIFY_EMPTY,
3095 &empty);
3096 if (empty) {
3097 assert(page_index == num_of_pages);
3098 upl_deallocate(upl);
3099 }
3100 }
3101 }
3102
3103 } else {
3104 assert(cnt <= (vm_page_size << vs->vs_clshift));
3105 list_size = cnt;
3106
3107 page_index = 0;
3108 /* The caller provides a mapped_data which is derived */
3109 /* from a temporary object. The targeted pages are */
3110 /* guaranteed to be set at offset 0 in the mapped_data. */
3111 /* The actual offset, however, must still be derived */
3112 /* from the offset in the vs in question. */
3113 mobj_base_addr = offset;
3114 mobj_target_addr = mobj_base_addr;
3115
3116 for (transfer_size = list_size; transfer_size != 0;) {
3117 actual_offset = ps_clmap(vs, mobj_target_addr,
3118 &clmap, CL_ALLOC,
3119 transfer_size < cl_size ?
3120 transfer_size : cl_size, 0);
3121 if(actual_offset == (vm_offset_t) -1) {
3122 error = 1;
3123 break;
3124 }
3125 cnt = MIN(transfer_size,
3126 CLMAP_NPGS(clmap) * vm_page_size);
3127 ps = CLMAP_PS(clmap);
3128 /* Assume that the caller has given us contiguous */
3129 /* pages */
3130 if(cnt) {
3131 ps_vs_write_complete(vs, mobj_target_addr,
3132 cnt, error);
3133 error = ps_write_file(ps, internal_upl,
3134 0, actual_offset,
3135 cnt, flags);
3136 if (error)
3137 break;
3138 }
3139 if (error)
3140 break;
3141 actual_offset += cnt;
3142 mobj_target_addr += cnt;
3143 transfer_size -= cnt;
3144 cnt = 0;
3145
3146 if (error)
3147 break;
3148 }
3149 }
3150 if(error)
3151 return KERN_FAILURE;
3152 else
3153 return KERN_SUCCESS;
3154 }
3155
3156 vm_size_t
3157 ps_vstruct_allocated_size(
3158 vstruct_t vs)
3159 {
3160 int num_pages;
3161 struct vs_map *vsmap;
3162 unsigned int i, j, k;
3163
3164 num_pages = 0;
3165 if (vs->vs_indirect) {
3166 /* loop on indirect maps */
3167 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3168 vsmap = vs->vs_imap[i];
3169 if (vsmap == NULL)
3170 continue;
3171 /* loop on clusters in this indirect map */
3172 for (j = 0; j < CLMAP_ENTRIES; j++) {
3173 if (VSM_ISCLR(vsmap[j]) ||
3174 VSM_ISERR(vsmap[j]))
3175 continue;
3176 /* loop on pages in this cluster */
3177 for (k = 0; k < VSCLSIZE(vs); k++) {
3178 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3179 num_pages++;
3180 }
3181 }
3182 }
3183 } else {
3184 vsmap = vs->vs_dmap;
3185 if (vsmap == NULL)
3186 return 0;
3187 /* loop on clusters in the direct map */
3188 for (j = 0; j < CLMAP_ENTRIES; j++) {
3189 if (VSM_ISCLR(vsmap[j]) ||
3190 VSM_ISERR(vsmap[j]))
3191 continue;
3192 /* loop on pages in this cluster */
3193 for (k = 0; k < VSCLSIZE(vs); k++) {
3194 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3195 num_pages++;
3196 }
3197 }
3198 }
3199
3200 return ptoa_32(num_pages);
3201 }
3202
3203 size_t
3204 ps_vstruct_allocated_pages(
3205 vstruct_t vs,
3206 default_pager_page_t *pages,
3207 size_t pages_size)
3208 {
3209 unsigned int num_pages;
3210 struct vs_map *vsmap;
3211 vm_offset_t offset;
3212 unsigned int i, j, k;
3213
3214 num_pages = 0;
3215 offset = 0;
3216 if (vs->vs_indirect) {
3217 /* loop on indirect maps */
3218 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3219 vsmap = vs->vs_imap[i];
3220 if (vsmap == NULL) {
3221 offset += (vm_page_size * CLMAP_ENTRIES *
3222 VSCLSIZE(vs));
3223 continue;
3224 }
3225 /* loop on clusters in this indirect map */
3226 for (j = 0; j < CLMAP_ENTRIES; j++) {
3227 if (VSM_ISCLR(vsmap[j]) ||
3228 VSM_ISERR(vsmap[j])) {
3229 offset += vm_page_size * VSCLSIZE(vs);
3230 continue;
3231 }
3232 /* loop on pages in this cluster */
3233 for (k = 0; k < VSCLSIZE(vs); k++) {
3234 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3235 num_pages++;
3236 if (num_pages < pages_size)
3237 pages++->dpp_offset =
3238 offset;
3239 }
3240 offset += vm_page_size;
3241 }
3242 }
3243 }
3244 } else {
3245 vsmap = vs->vs_dmap;
3246 if (vsmap == NULL)
3247 return 0;
3248 /* loop on clusters in the direct map */
3249 for (j = 0; j < CLMAP_ENTRIES; j++) {
3250 if (VSM_ISCLR(vsmap[j]) ||
3251 VSM_ISERR(vsmap[j])) {
3252 offset += vm_page_size * VSCLSIZE(vs);
3253 continue;
3254 }
3255 /* loop on pages in this cluster */
3256 for (k = 0; k < VSCLSIZE(vs); k++) {
3257 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3258 num_pages++;
3259 if (num_pages < pages_size)
3260 pages++->dpp_offset = offset;
3261 }
3262 offset += vm_page_size;
3263 }
3264 }
3265 }
3266
3267 return num_pages;
3268 }
3269
3270
3271 kern_return_t
3272 ps_vstruct_transfer_from_segment(
3273 vstruct_t vs,
3274 paging_segment_t segment,
3275 upl_t upl)
3276 {
3277 struct vs_map *vsmap;
3278 // struct vs_map old_vsmap;
3279 // struct vs_map new_vsmap;
3280 unsigned int i, j;
3281
3282 VS_LOCK(vs); /* block all work on this vstruct */
3283 /* can't allow the normal multiple write */
3284 /* semantic because writes may conflict */
3285 vs->vs_xfer_pending = TRUE;
3286 vs_wait_for_sync_writers(vs);
3287 vs_start_write(vs);
3288 vs_wait_for_readers(vs);
3289 /* we will unlock the vs to allow other writes while transferring */
3290 /* and will be guaranteed of the persistence of the vs struct */
3291 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3292 /* vs_async_pending */
3293 /* OK we now have guaranteed no other parties are accessing this */
3294 /* vs. Now that we are also supporting simple lock versions of */
3295 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3296 /* our purpose in holding it before was the multiple write case */
3297 /* we now use the boolean xfer_pending to do that. We can use */
3298 /* a boolean instead of a count because we have guaranteed single */
3299 /* file access to this code in its caller */
3300 VS_UNLOCK(vs);
3301 vs_changed:
3302 if (vs->vs_indirect) {
3303 unsigned int vsmap_size;
3304 int clmap_off;
3305 /* loop on indirect maps */
3306 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3307 vsmap = vs->vs_imap[i];
3308 if (vsmap == NULL)
3309 continue;
3310 /* loop on clusters in this indirect map */
3311 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3312 VSCLSIZE(vs) * i);
3313 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3314 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3315 else
3316 vsmap_size = CLMAP_ENTRIES;
3317 for (j = 0; j < vsmap_size; j++) {
3318 if (VSM_ISCLR(vsmap[j]) ||
3319 VSM_ISERR(vsmap[j]) ||
3320 (VSM_PS(vsmap[j]) != segment))
3321 continue;
3322 if(vs_cluster_transfer(vs,
3323 (vm_page_size * (j << vs->vs_clshift))
3324 + clmap_off,
3325 vm_page_size << vs->vs_clshift,
3326 upl)
3327 != KERN_SUCCESS) {
3328 VS_LOCK(vs);
3329 vs->vs_xfer_pending = FALSE;
3330 VS_UNLOCK(vs);
3331 vs_finish_write(vs);
3332 return KERN_FAILURE;
3333 }
3334 /* allow other readers/writers during transfer*/
3335 VS_LOCK(vs);
3336 vs->vs_xfer_pending = FALSE;
3337 VS_UNLOCK(vs);
3338 vs_finish_write(vs);
3339 VS_LOCK(vs);
3340 vs->vs_xfer_pending = TRUE;
3341 vs_wait_for_sync_writers(vs);
3342 vs_start_write(vs);
3343 vs_wait_for_readers(vs);
3344 VS_UNLOCK(vs);
3345 if (!(vs->vs_indirect)) {
3346 goto vs_changed;
3347 }
3348 }
3349 }
3350 } else {
3351 vsmap = vs->vs_dmap;
3352 if (vsmap == NULL) {
3353 VS_LOCK(vs);
3354 vs->vs_xfer_pending = FALSE;
3355 VS_UNLOCK(vs);
3356 vs_finish_write(vs);
3357 return KERN_SUCCESS;
3358 }
3359 /* loop on clusters in the direct map */
3360 for (j = 0; j < vs->vs_size; j++) {
3361 if (VSM_ISCLR(vsmap[j]) ||
3362 VSM_ISERR(vsmap[j]) ||
3363 (VSM_PS(vsmap[j]) != segment))
3364 continue;
3365 if(vs_cluster_transfer(vs,
3366 vm_page_size * (j << vs->vs_clshift),
3367 vm_page_size << vs->vs_clshift,
3368 upl) != KERN_SUCCESS) {
3369 VS_LOCK(vs);
3370 vs->vs_xfer_pending = FALSE;
3371 VS_UNLOCK(vs);
3372 vs_finish_write(vs);
3373 return KERN_FAILURE;
3374 }
3375 /* allow other readers/writers during transfer*/
3376 VS_LOCK(vs);
3377 vs->vs_xfer_pending = FALSE;
3378 VS_UNLOCK(vs);
3379 vs_finish_write(vs);
3380 VS_LOCK(vs);
3381 vs->vs_xfer_pending = TRUE;
3382 VS_UNLOCK(vs);
3383 vs_wait_for_sync_writers(vs);
3384 vs_start_write(vs);
3385 vs_wait_for_readers(vs);
3386 if (vs->vs_indirect) {
3387 goto vs_changed;
3388 }
3389 }
3390 }
3391
3392 VS_LOCK(vs);
3393 vs->vs_xfer_pending = FALSE;
3394 VS_UNLOCK(vs);
3395 vs_finish_write(vs);
3396 return KERN_SUCCESS;
3397 }
3398
3399
3400
3401 vs_map_t
3402 vs_get_map_entry(
3403 vstruct_t vs,
3404 vm_offset_t offset)
3405 {
3406 struct vs_map *vsmap;
3407 vm_offset_t cluster;
3408
3409 cluster = atop_32(offset) >> vs->vs_clshift;
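/*
 * Example with hypothetical values: with 4K pages and vs_clshift == 2
 * (16K clusters), offset 0x28000 is page 40, which falls in cluster 10;
 * the entry is then vs_dmap[10] for a direct map, or entry
 * (10 % CLMAP_ENTRIES) of indirect block (10 / CLMAP_ENTRIES).
 */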
3410 if (vs->vs_indirect) {
3411 long ind_block = cluster/CLMAP_ENTRIES;
3412
3413 /* Is the indirect block allocated? */
3414 vsmap = vs->vs_imap[ind_block];
3415 if(vsmap == (vs_map_t) NULL)
3416 return vsmap;
3417 } else
3418 vsmap = vs->vs_dmap;
3419 vsmap += cluster%CLMAP_ENTRIES;
3420 return vsmap;
3421 }
3422
3423 kern_return_t
3424 vs_cluster_transfer(
3425 vstruct_t vs,
3426 vm_offset_t offset,
3427 vm_size_t cnt,
3428 upl_t upl)
3429 {
3430 vm_offset_t actual_offset;
3431 paging_segment_t ps;
3432 struct clmap clmap;
3433 kern_return_t error = KERN_SUCCESS;
3434 unsigned int size, size_wanted;
3435 int i;
3436 unsigned int residual;
3437 unsigned int unavail_size;
3438 // default_pager_thread_t *dpt;
3439 // boolean_t dealloc;
3440 struct vs_map *vsmap_ptr = NULL;
3441 struct vs_map read_vsmap;
3442 struct vs_map original_read_vsmap;
3443 struct vs_map write_vsmap;
3444 // upl_t sync_upl;
3445 // vm_offset_t ioaddr;
3446
3447 /* vs_cluster_transfer reads in the pages of a cluster and
3448 * then writes these pages back to new backing store. The
3449 * segment the pages are being read from is assumed to have
3450 * been taken off-line and is no longer considered for new
3451 * space requests.
3452 */
3453
3454 /*
3455 * This loop will be executed once per cluster referenced.
3456 * Typically this means once, since it's unlikely that the
3457 * VM system will ask for anything spanning cluster boundaries.
3458 *
3459 * If there are holes in a cluster (in a paging segment), we stop
3460 * reading at the hole, then loop again, hoping to
3461 * find valid pages later in the cluster. This continues until
3462 * the entire range has been examined, and read, if present. The
3463 * pages are written as they are read. If a failure occurs after
3464 * some pages are written the unmap call at the bottom of the loop
3465 * recovers the backing store and the old backing store remains
3466 * in effect.
3467 */
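/*
 * Rough sketch of one iteration, using hypothetical values: for a
 * 4-page (16K) cluster whose bitmap has only pages 0 and 1 present,
 * the first pass reads those two pages and rewrites them to the new
 * backing store; the next pass finds pages 2 and 3 unavailable, skips
 * past them, and, having reached the cluster boundary, installs the
 * accumulated write_vsmap entry before moving on to the next cluster.
 */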
3468
3469 VSM_CLR(write_vsmap);
3470 VSM_CLR(original_read_vsmap);
3471 /* grab the actual object's pages to sync with I/O */
3472 while (cnt && (error == KERN_SUCCESS)) {
3473 vsmap_ptr = vs_get_map_entry(vs, offset);
3474 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3475
3476 if (actual_offset == (vm_offset_t) -1) {
3477
3478 /*
3479 * Nothing left to write in this cluster. At least
3480 * set the write cluster information for any previous
3481 * write, and clear it for the next cluster, if there is one.
3482 */
3483 unsigned int local_size, clmask, clsize;
3484
3485 clsize = vm_page_size << vs->vs_clshift;
3486 clmask = clsize - 1;
3487 local_size = clsize - (offset & clmask);
3488 ASSERT(local_size);
3489 local_size = MIN(local_size, cnt);
3490
3491 /* This cluster has no data in it beyond what may */
3492 /* have been found on a previous iteration through */
3493 /* the loop "write_vsmap" */
3494 *vsmap_ptr = write_vsmap;
3495 VSM_CLR(write_vsmap);
3496 VSM_CLR(original_read_vsmap);
3497
3498 cnt -= local_size;
3499 offset += local_size;
3500 continue;
3501 }
3502
3503 /*
3504 * Count up contiguous available or unavailable
3505 * pages.
3506 */
3507 ps = CLMAP_PS(clmap);
3508 ASSERT(ps);
3509 size = 0;
3510 unavail_size = 0;
3511 for (i = 0;
3512 (size < cnt) && (unavail_size < cnt) &&
3513 (i < CLMAP_NPGS(clmap)); i++) {
3514 if (CLMAP_ISSET(clmap, i)) {
3515 if (unavail_size != 0)
3516 break;
3517 size += vm_page_size;
3518 BS_STAT(ps->ps_bs,
3519 ps->ps_bs->bs_pages_in++);
3520 } else {
3521 if (size != 0)
3522 break;
3523 unavail_size += vm_page_size;
3524 }
3525 }
3526
3527 if (size == 0) {
3528 ASSERT(unavail_size);
3529 cnt -= unavail_size;
3530 offset += unavail_size;
3531 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3532 == 0) {
3533 /* There is no more to transfer in this
3534 cluster
3535 */
3536 *vsmap_ptr = write_vsmap;
3537 VSM_CLR(write_vsmap);
3538 VSM_CLR(original_read_vsmap);
3539 }
3540 continue;
3541 }
3542
3543 if(VSM_ISCLR(original_read_vsmap))
3544 original_read_vsmap = *vsmap_ptr;
3545
3546 if(ps->ps_segtype == PS_PARTITION) {
3547 /*
3548 NEED TO ISSUE WITH SYNC & NO COMMIT
3549 error = ps_read_device(ps, actual_offset, &buffer,
3550 size, &residual, flags);
3551 */
3552 } else {
3553 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3554 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3555 size, &residual,
3556 (UPL_IOSYNC | UPL_NOCOMMIT));
3557 }
3558
3559 read_vsmap = *vsmap_ptr;
3560
3561
3562 /*
3563 * Adjust counts and put data in new BS. Optimize for the
3564 * common case, i.e. no error and no partial data.
3565 * If there was an error, then we need to error the entire
3566 * range, even if some data was successfully read.
3567 *
3568 */
3569 if ((error == KERN_SUCCESS) && (residual == 0)) {
3570
3571 /*
3572 * Got everything we asked for, supply the data to
3573 * the new BS. Note that as a side effect of supplying
3574 * the data, the buffer holding the supplied data is
3575 * deallocated from the pager's address space unless
3576 * the write is unsuccessful.
3577 */
3578
3579 /* note: the buffer will be cleaned up in all cases, either by */
3580 /* internal_cluster_write or, if an error occurs on the write, */
3581 /* by the vm_map_copy_page_discard call */
3582 *vsmap_ptr = write_vsmap;
3583
3584 if(vs_cluster_write(vs, upl, offset,
3585 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3586 error = KERN_FAILURE;
3587 if(!(VSM_ISCLR(*vsmap_ptr))) {
3588 /* unmap the new backing store object */
3589 ps_clunmap(vs, offset, size);
3590 }
3591 /* original vsmap */
3592 *vsmap_ptr = original_read_vsmap;
3593 VSM_CLR(write_vsmap);
3594 } else {
3595 if((offset + size) &
3596 ((vm_page_size << vs->vs_clshift)
3597 - 1)) {
3598 /* There is more to transfer in this
3599 cluster
3600 */
3601 write_vsmap = *vsmap_ptr;
3602 *vsmap_ptr = read_vsmap;
3603 } else {
3604 /* discard the old backing object */
3605 write_vsmap = *vsmap_ptr;
3606 *vsmap_ptr = read_vsmap;
3607 ps_clunmap(vs, offset, size);
3608 *vsmap_ptr = write_vsmap;
3609 VSM_CLR(write_vsmap);
3610 VSM_CLR(original_read_vsmap);
3611 }
3612 }
3613 } else {
3614 size_wanted = size;
3615 if (error == KERN_SUCCESS) {
3616 if (residual == size) {
3617 /*
3618 * If a read operation returns no error
3619 * and no data moved, we turn it into
3620 * an error, assuming we're reading at
3621 * or beyond EOF.
3622 * Fall through and error the entire
3623 * range.
3624 */
3625 error = KERN_FAILURE;
3626 *vsmap_ptr = write_vsmap;
3627 if(!(VSM_ISCLR(*vsmap_ptr))) {
3628 /* unmap the new backing store object */
3629 ps_clunmap(vs, offset, size);
3630 }
3631 *vsmap_ptr = original_read_vsmap;
3632 VSM_CLR(write_vsmap);
3633 continue;
3634 } else {
3635 /*
3636 * Otherwise, we have a partial read.
3637 * This is also considered an error
3638 * for the purposes of cluster transfer
3639 */
3640 error = KERN_FAILURE;
3641 *vsmap_ptr = write_vsmap;
3642 if(!(VSM_ISCLR(*vsmap_ptr))) {
3643 /* unmap the new backing store object */
3644 ps_clunmap(vs, offset, size);
3645 }
3646 *vsmap_ptr = original_read_vsmap;
3647 VSM_CLR(write_vsmap);
3648 continue;
3649 }
3650 }
3651
3652 }
3653 cnt -= size;
3654 offset += size;
3655
3656 } /* END while (cnt && (error == 0)) */
3657 if(!VSM_ISCLR(write_vsmap))
3658 *vsmap_ptr = write_vsmap;
3659
3660 return error;
3661 }
3662
3663 kern_return_t
3664 default_pager_add_file(
3665 MACH_PORT_FACE backing_store,
3666 vnode_ptr_t vp,
3667 int record_size,
3668 vm_size_t size)
3669 {
3670 backing_store_t bs;
3671 paging_segment_t ps;
3672 int i;
3673 unsigned int j;
3674 int error;
3675
3676 if ((bs = backing_store_lookup(backing_store))
3677 == BACKING_STORE_NULL)
3678 return KERN_INVALID_ARGUMENT;
3679
3680 PSL_LOCK();
3681 for (i = 0; i <= paging_segment_max; i++) {
3682 ps = paging_segments[i];
3683 if (ps == PAGING_SEGMENT_NULL)
3684 continue;
3685 if (ps->ps_segtype != PS_FILE)
3686 continue;
3687
3688 /*
3689 * Check for overlap on same device.
3690 */
3691 if (ps->ps_vnode == (struct vnode *)vp) {
3692 PSL_UNLOCK();
3693 BS_UNLOCK(bs);
3694 return KERN_INVALID_ARGUMENT;
3695 }
3696 }
3697 PSL_UNLOCK();
3698
3699 /*
3700 * Set up the paging segment
3701 */
3702 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3703 if (ps == PAGING_SEGMENT_NULL) {
3704 BS_UNLOCK(bs);
3705 return KERN_RESOURCE_SHORTAGE;
3706 }
3707
3708 ps->ps_segtype = PS_FILE;
3709 ps->ps_vnode = (struct vnode *)vp;
3710 ps->ps_offset = 0;
3711 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3712 ps->ps_recnum = size;
3713 ps->ps_pgnum = size >> ps->ps_record_shift;
3714
3715 ps->ps_pgcount = ps->ps_pgnum;
3716 ps->ps_clshift = local_log2(bs->bs_clsize);
3717 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3718 ps->ps_hint = 0;
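/*
 * Worked example with hypothetical values: for a 128MB file presented
 * as 512-byte records (record_size == 512, size == 262144 records),
 * 4K pages and an 8-page backing store cluster size (bs_clsize == 8):
 * ps_record_shift == local_log2(4096/512) == 3, ps_pgnum ==
 * 262144 >> 3 == 32768 pages, ps_clshift == 3, and ps_clcount ==
 * 32768 >> 3 == 4096 clusters of 32KB each.
 */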
3719
3720 PS_LOCK_INIT(ps);
3721 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3722 if (!ps->ps_bmap) {
3723 kfree(ps, sizeof *ps);
3724 BS_UNLOCK(bs);
3725 return KERN_RESOURCE_SHORTAGE;
3726 }
3727 for (j = 0; j < ps->ps_ncls; j++) {
3728 clrbit(ps->ps_bmap, j);
3729 }
3730
3731 ps->ps_going_away = FALSE;
3732 ps->ps_bs = bs;
3733
3734 if ((error = ps_enter(ps)) != 0) {
3735 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3736 kfree(ps, sizeof *ps);
3737 BS_UNLOCK(bs);
3738 return KERN_RESOURCE_SHORTAGE;
3739 }
3740
3741 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3742 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3743 PSL_LOCK();
3744 dp_pages_free += ps->ps_pgcount;
3745 PSL_UNLOCK();
3746
3747 BS_UNLOCK(bs);
3748
3749 bs_more_space(ps->ps_clcount);
3750
3751 DP_DEBUG(DEBUG_BS_INTERNAL,
3752 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3753 device, offset, size, record_size,
3754 ps->ps_record_shift, ps->ps_pgnum));
3755
3756 return KERN_SUCCESS;
3757 }
3758
3759
3760
3761 kern_return_t
3762 ps_read_file(
3763 paging_segment_t ps,
3764 upl_t upl,
3765 upl_offset_t upl_offset,
3766 vm_offset_t offset,
3767 upl_size_t size,
3768 unsigned int *residualp,
3769 int flags)
3770 {
3771 vm_object_offset_t f_offset;
3772 int error = 0;
3773 int result;
3774
3775 assert(dp_encryption_inited);
3776
3777 clustered_reads[atop_32(size)]++;
3778
3779 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3780
3781 /* for transfer case we need to pass uploffset and flags */
3782 error = vnode_pagein(ps->ps_vnode,
3783 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3784
3785 /* The vnode_pagein semantic is somewhat at odds with the existing */
3786 /* device_read semantic. Partial reads are not experienced at this */
3787 /* level. It is up to the bit map code and cluster read code to */
3788 /* check that requested data locations are actually backed, and the */
3789 /* pagein code to either read all of the requested data or return an */
3790 /* error. */
3791
3792 if (error)
3793 result = KERN_FAILURE;
3794 else {
3795 *residualp = 0;
3796 result = KERN_SUCCESS;
3797 }
3798 return result;
3799 }
3800
3801 kern_return_t
3802 ps_write_file(
3803 paging_segment_t ps,
3804 upl_t upl,
3805 upl_offset_t upl_offset,
3806 vm_offset_t offset,
3807 unsigned int size,
3808 int flags)
3809 {
3810 vm_object_offset_t f_offset;
3811 kern_return_t result;
3812
3813 assert(dp_encryption_inited);
3814
3815 clustered_writes[atop_32(size)]++;
3816 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3817
3818 if (flags & UPL_PAGING_ENCRYPTED) {
3819 /*
3820 * ENCRYPTED SWAP:
3821 * encrypt all the pages that we're going
3822 * to pageout.
3823 */
3824 upl_encrypt(upl, upl_offset, size);
3825 }
3826
3827 if (vnode_pageout(ps->ps_vnode,
3828 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3829 result = KERN_FAILURE;
3830 else
3831 result = KERN_SUCCESS;
3832
3833 return result;
3834 }
3835
3836 kern_return_t
3837 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
3838 int hi_wat,
3839 int lo_wat,
3840 int flags,
3841 MACH_PORT_FACE trigger_port)
3842 {
3843 MACH_PORT_FACE release;
3844 kern_return_t kr;
3845
3846 PSL_LOCK();
3847 if (flags == SWAP_ENCRYPT_ON) {
3848 /* ENCRYPTED SWAP: turn encryption on */
3849 release = trigger_port;
3850 if (!dp_encryption_inited) {
3851 dp_encryption_inited = TRUE;
3852 dp_encryption = TRUE;
3853 kr = KERN_SUCCESS;
3854 } else {
3855 kr = KERN_FAILURE;
3856 }
3857 } else if (flags == SWAP_ENCRYPT_OFF) {
3858 /* ENCRYPTED SWAP: turn encryption off */
3859 release = trigger_port;
3860 if (!dp_encryption_inited) {
3861 dp_encryption_inited = TRUE;
3862 dp_encryption = FALSE;
3863 kr = KERN_SUCCESS;
3864 } else {
3865 kr = KERN_FAILURE;
3866 }
3867 } else if (flags == HI_WAT_ALERT) {
3868 release = min_pages_trigger_port;
3869 min_pages_trigger_port = trigger_port;
3870 minimum_pages_remaining = hi_wat/vm_page_size;
3871 bs_low = FALSE;
3872 kr = KERN_SUCCESS;
3873 } else if (flags == LO_WAT_ALERT) {
3874 release = max_pages_trigger_port;
3875 max_pages_trigger_port = trigger_port;
3876 maximum_pages_free = lo_wat/vm_page_size;
3877 kr = KERN_SUCCESS;
3878 } else {
3879 release = trigger_port;
3880 kr = KERN_INVALID_ARGUMENT;
3881 }
3882 PSL_UNLOCK();
3883
3884 if (IP_VALID(release))
3885 ipc_port_release_send(release);
3886
3887 return kr;
3888 }
3889
3890 /*
3891 * Monitor the amount of available backing store vs. the amount of
3892 * required backing store, notify a listener (if present) when
3893 * backing store may safely be removed.
3894 *
3895 * We attempt to avoid the situation where backing store is
3896 * discarded en masse, as this can lead to thrashing as the
3897 * backing store is compacted.
3898 */
3899
3900 #define PF_INTERVAL 3 /* time between free level checks */
3901 #define PF_LATENCY 10 /* number of intervals before release */
3902
3903 static int dp_pages_free_low_count = 0;
3904 thread_call_t default_pager_backing_store_monitor_callout;
3905
3906 void
3907 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
3908 __unused thread_call_param_t p2)
3909 {
3910 // unsigned long long average;
3911 ipc_port_t trigger;
3912 uint64_t deadline;
3913
3914 /*
3915 * We determine whether it will be safe to release some
3916 * backing store by watching the free page level. If
3917 * it remains above the maximum_pages_free threshold for
3918 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3919 * then we deem it safe.
3920 *
3921 * Note that this establishes a maximum rate at which backing
3922 * store will be released, as each notification (currently)
3923 * only results in a single backing store object being
3924 * released.
3925 */
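/*
 * For example, with the constants defined above (PF_INTERVAL == 3,
 * PF_LATENCY == 10): dp_pages_free must stay above maximum_pages_free
 * for more than 10 consecutive checks taken 3 seconds apart (a little
 * over half a minute) before the LO_WAT_ALERT notification is sent.
 */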
3926 if (dp_pages_free > maximum_pages_free) {
3927 dp_pages_free_low_count++;
3928 } else {
3929 dp_pages_free_low_count = 0;
3930 }
3931
3932 /* decide whether to send notification */
3933 trigger = IP_NULL;
3934 if (max_pages_trigger_port &&
3935 (backing_store_release_trigger_disable == 0) &&
3936 (dp_pages_free_low_count > PF_LATENCY)) {
3937 trigger = max_pages_trigger_port;
3938 max_pages_trigger_port = NULL;
3939 }
3940
3941 /* send notification */
3942 if (trigger != IP_NULL) {
3943 VSL_LOCK();
3944 if(backing_store_release_trigger_disable != 0) {
3945 assert_wait((event_t)
3946 &backing_store_release_trigger_disable,
3947 THREAD_UNINT);
3948 VSL_UNLOCK();
3949 thread_block(THREAD_CONTINUE_NULL);
3950 } else {
3951 VSL_UNLOCK();
3952 }
3953 default_pager_space_alert(trigger, LO_WAT_ALERT);
3954 ipc_port_release_send(trigger);
3955 dp_pages_free_low_count = 0;
3956 }
3957
3958 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
3959 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
3960 }