osfmk/default_pager/dp_backing_store.c (apple/xnu, tag xnu-3248.60.10)
1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 /*
58 * Default Pager.
59 * Paging File Management.
60 */
61
62 #include <mach/host_priv.h>
63 #include <mach/memory_object_control.h>
64 #include <mach/memory_object_server.h>
65 #include <mach/upl.h>
66 #include <default_pager/default_pager_internal.h>
67 #include <default_pager/default_pager_alerts.h>
68 #include <default_pager/default_pager_object_server.h>
69
70 #include <ipc/ipc_types.h>
71 #include <ipc/ipc_port.h>
72 #include <ipc/ipc_space.h>
73
74 #include <kern/kern_types.h>
75 #include <kern/host.h>
76 #include <kern/queue.h>
77 #include <kern/counters.h>
78 #include <kern/sched_prim.h>
79
80 #include <vm/vm_kern.h>
81 #include <vm/vm_pageout.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_protos.h>
85
86
87 /* todo - need large internal object support */
88
89 /*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, then this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
99
100 #define ALLOC_STRIDE (1024 * 1024 * 1024)
101 int physical_transfer_cluster_count = 0;
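/*
 * Illustrative sketch only (not compiled): how the stride translates into
 * clusters.  ps_select_segment() below moves on to the next paging segment
 * once roughly ALLOC_STRIDE bytes worth of clusters have been handed out
 * from the current one.  The helper name is invented; the comment's numbers
 * assume 4K pages (vm_page_shift == 12) and the default cluster shift of 2.
 */
#if 0
static unsigned int
clusters_per_stride(paging_segment_t ps)
{
	/* 1GB >> (2 + 12) == 65536 sixteen-KB clusters per stride */
	return ALLOC_STRIDE >> (ps->ps_clshift + vm_page_shift);
}
#endif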
102
103 #define VM_SUPER_CLUSTER 0x40000
104 #define VM_SUPER_PAGES (VM_SUPER_CLUSTER / PAGE_MIN_SIZE)
105
106 /*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
110 #define VSTRUCT_MIN_CLSHIFT 0
111
112 #define VSTRUCT_DEF_CLSHIFT 2
113 int default_pager_clsize = 0;
114
115 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
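/*
 * Worked example of the shift (sketch only, not compiled): with the default
 * VSTRUCT_DEF_CLSHIFT of 2 and 4K pages, a cluster is (1 << 2) == 4 pages,
 * i.e. 16KB of backing store.  The macro names below are illustrative only.
 */
#if 0
#define PAGES_PER_CLUSTER(clshift)	(1 << (clshift))
#define BYTES_PER_CLUSTER(clshift)	(PAGES_PER_CLUSTER(clshift) * vm_page_size)
#endif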
116
117 /* statistics */
118 unsigned int clustered_writes[VM_SUPER_PAGES+1];
119 unsigned int clustered_reads[VM_SUPER_PAGES+1];
120
121 /*
122 * Globals used for asynchronous paging operations:
123 * vs_async_list: head of list of to-be-completed I/O ops
124 * async_num_queued: number of pages completed, but not yet
125 * processed by async thread.
126 * async_requests_out: number of pages of requests not completed.
127 */
128
129 #if 0
130 struct vs_async *vs_async_list;
131 int async_num_queued;
132 int async_requests_out;
133 #endif
134
135
136 #define VS_ASYNC_REUSE 1
137 struct vs_async *vs_async_free_list;
138
139 lck_mtx_t default_pager_async_lock; /* Protects globals above */
140
141
142 int vs_alloc_async_failed = 0; /* statistics */
143 int vs_alloc_async_count = 0; /* statistics */
144 struct vs_async *vs_alloc_async(void); /* forward */
145 void vs_free_async(struct vs_async *vsa); /* forward */
146
147
148 #define VS_ALLOC_ASYNC() vs_alloc_async()
149 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
150
151 #define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock)
152 #define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock)
153 #define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
154 #define VS_ASYNC_LOCK_DESTROY() lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp)
155 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
156 /*
157 * Paging Space Hysteresis triggers and the target notification port
158 *
159 */
160 unsigned int dp_pages_free_drift_count = 0;
161 unsigned int dp_pages_free_drifted_max = 0;
162 unsigned int minimum_pages_remaining = 0;
163 unsigned int maximum_pages_free = 0;
164 ipc_port_t min_pages_trigger_port = NULL;
165 ipc_port_t max_pages_trigger_port = NULL;
166
167 #if CONFIG_FREEZE
168 boolean_t use_emergency_swap_file_first = TRUE;
169 #else
170 boolean_t use_emergency_swap_file_first = FALSE;
171 #endif
172 boolean_t bs_low = FALSE;
173 int backing_store_release_trigger_disable = 0;
174 boolean_t backing_store_stop_compaction = FALSE;
175 boolean_t backing_store_abort_compaction = FALSE;
176
177 /* Have we decided if swap needs to be encrypted yet ? */
178 boolean_t dp_encryption_inited = FALSE;
179 /* Should we encrypt swap ? */
180 boolean_t dp_encryption = FALSE;
181
182 boolean_t dp_isssd = FALSE;
183
184 /*
185 * Object sizes are rounded up to the next power of 2,
186 * unless they are bigger than a given maximum size.
187 */
188 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
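/*
 * A minimal sketch (not compiled) of the rounding policy stated above:
 * sizes are doubled up to the next power of 2 unless they already exceed
 * max_doubled_size, in which case they are left as-is.  The rounding itself
 * is done by the callers that consult this global; the helper name here is
 * illustrative only.
 */
#if 0
static vm_size_t
round_object_size(vm_size_t size)
{
	vm_size_t rounded = PAGE_SIZE;

	if (size > max_doubled_size)
		return size;
	while (rounded < size)
		rounded <<= 1;
	return rounded;
}
#endif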
189
190 /*
191 * List of all backing store and segments.
192 */
193 MACH_PORT_FACE emergency_segment_backing_store;
194 struct backing_store_list_head backing_store_list;
195 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
196 lck_mtx_t paging_segments_lock;
197 int paging_segment_max = 0;
198 int paging_segment_count = 0;
199 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
200
201
202 /*
203 * Total pages free in system
204 * This differs from clusters committed/available, which is a measure of the
205 * over-commitment of paging segments to backing store; that idea is
206 * likely to be deprecated.
207 */
208 unsigned int dp_pages_free = 0;
209 unsigned int dp_pages_reserve = 0;
210 unsigned int cluster_transfer_minimum = 100;
211
212 /*
213 * Trim state
214 */
215 struct ps_vnode_trim_data {
216 struct vnode *vp;
217 dp_offset_t offset;
218 dp_size_t length;
219 };
220
221 /* forward declarations */
222 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */
223 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */
224 default_pager_thread_t *get_read_buffer( void );
225 kern_return_t ps_vstruct_transfer_from_segment(
226 vstruct_t vs,
227 paging_segment_t segment,
228 upl_t upl);
229 kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
230 kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
231 kern_return_t vs_cluster_transfer(
232 vstruct_t vs,
233 dp_offset_t offset,
234 dp_size_t cnt,
235 upl_t upl);
236 vs_map_t vs_get_map_entry(
237 vstruct_t vs,
238 dp_offset_t offset);
239
240 kern_return_t
241 default_pager_backing_store_delete_internal( MACH_PORT_FACE );
242
243 static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
244 static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
245 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);
246
247 default_pager_thread_t *
248 get_read_buffer( void )
249 {
250 int i;
251
252 DPT_LOCK(dpt_lock);
253 while(TRUE) {
254 for (i=0; i<default_pager_internal_count; i++) {
255 if(dpt_array[i]->checked_out == FALSE) {
256 dpt_array[i]->checked_out = TRUE;
257 DPT_UNLOCK(dpt_lock);
258 return dpt_array[i];
259 }
260 }
261 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
262 }
263 }
264
265 void
266 bs_initialize(void)
267 {
268 int i;
269
270 /*
271 * List of all backing store.
272 */
273 BSL_LOCK_INIT();
274 queue_init(&backing_store_list.bsl_queue);
275 PSL_LOCK_INIT();
276
277 VS_ASYNC_LOCK_INIT();
278 #if VS_ASYNC_REUSE
279 vs_async_free_list = NULL;
280 #endif /* VS_ASYNC_REUSE */
281
282 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
283 clustered_writes[i] = 0;
284 clustered_reads[i] = 0;
285 }
286
287 }
288
289 /*
290 * When things do not quite work out...
291 */
292 void bs_no_paging_space(boolean_t); /* forward */
293
294 void
295 bs_no_paging_space(
296 boolean_t out_of_memory)
297 {
298
299 if (out_of_memory)
300 dprintf(("*** OUT OF MEMORY ***\n"));
301 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
302 }
303
304 void bs_more_space(int); /* forward */
305 void bs_commit(int); /* forward */
306
307 boolean_t user_warned = FALSE;
308 unsigned int clusters_committed = 0;
309 unsigned int clusters_available = 0;
310 unsigned int clusters_committed_peak = 0;
311
312 void
313 bs_more_space(
314 int nclusters)
315 {
316 BSL_LOCK();
317 /*
318 * Account for new paging space.
319 */
320 clusters_available += nclusters;
321
322 if (clusters_available >= clusters_committed) {
323 if (verbose && user_warned) {
324 printf("%s%s - %d excess clusters now.\n",
325 my_name,
326 "paging space is OK now",
327 clusters_available - clusters_committed);
328 user_warned = FALSE;
329 clusters_committed_peak = 0;
330 }
331 } else {
332 if (verbose && user_warned) {
333 printf("%s%s - still short of %d clusters.\n",
334 my_name,
335 "WARNING: paging space over-committed",
336 clusters_committed - clusters_available);
337 clusters_committed_peak -= nclusters;
338 }
339 }
340 BSL_UNLOCK();
341
342 return;
343 }
344
345 void
346 bs_commit(
347 int nclusters)
348 {
349 BSL_LOCK();
350 clusters_committed += nclusters;
351 if (clusters_committed > clusters_available) {
352 if (verbose && !user_warned) {
353 user_warned = TRUE;
354 printf("%s%s - short of %d clusters.\n",
355 my_name,
356 "WARNING: paging space over-committed",
357 clusters_committed - clusters_available);
358 }
359 if (clusters_committed > clusters_committed_peak) {
360 clusters_committed_peak = clusters_committed;
361 }
362 } else {
363 if (verbose && user_warned) {
364 printf("%s%s - was short of up to %d clusters.\n",
365 my_name,
366 "paging space is OK now",
367 clusters_committed_peak - clusters_available);
368 user_warned = FALSE;
369 clusters_committed_peak = 0;
370 }
371 }
372 BSL_UNLOCK();
373
374 return;
375 }
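/*
 * Worked example (sketch only, not compiled) of the accounting done by
 * bs_commit() and bs_more_space(), assuming verbose is set.  The function
 * name is illustrative only.
 */
#if 0
static void
bs_accounting_example(void)
{
	/* starting from clusters_available == 100, clusters_committed == 0 */
	bs_commit(120);		/* committed 120 > available 100: warns "short of 20 clusters" */
	bs_more_space(30);	/* available 130 >= committed 120: reports "10 excess clusters now" */
}
#endif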
376
377 int default_pager_info_verbose = 1;
378
379 void
380 bs_global_info(
381 uint64_t *totalp,
382 uint64_t *freep)
383 {
384 uint64_t pages_total, pages_free;
385 paging_segment_t ps;
386 int i;
387
388 PSL_LOCK();
389 pages_total = pages_free = 0;
390 for (i = 0; i <= paging_segment_max; i++) {
391 ps = paging_segments[i];
392 if (ps == PAGING_SEGMENT_NULL)
393 continue;
394
395 /*
396 * no need to lock: by the time this data
397 * gets back to any remote requestor it
398 * will be obsolete anyway
399 */
400 pages_total += ps->ps_pgnum;
401 pages_free += ps->ps_clcount << ps->ps_clshift;
402 DP_DEBUG(DEBUG_BS_INTERNAL,
403 ("segment #%d: %d total, %d free\n",
404 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
405 }
406 *totalp = pages_total;
407 *freep = pages_free;
408 if (verbose && user_warned && default_pager_info_verbose) {
409 if (clusters_available < clusters_committed) {
410 printf("%s %d clusters committed, %d available.\n",
411 my_name,
412 clusters_committed,
413 clusters_available);
414 }
415 }
416 PSL_UNLOCK();
417 }
418
419 backing_store_t backing_store_alloc(void); /* forward */
420
421 backing_store_t
422 backing_store_alloc(void)
423 {
424 backing_store_t bs;
425
426 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
427 if (bs == BACKING_STORE_NULL)
428 panic("backing_store_alloc: no memory");
429
430 BS_LOCK_INIT(bs);
431 bs->bs_port = MACH_PORT_NULL;
432 bs->bs_priority = 0;
433 bs->bs_clsize = 0;
434 bs->bs_pages_total = 0;
435 bs->bs_pages_in = 0;
436 bs->bs_pages_in_fail = 0;
437 bs->bs_pages_out = 0;
438 bs->bs_pages_out_fail = 0;
439
440 return bs;
441 }
442
443 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
444
445 /* Even in both the component space and external versions of this pager, */
446 /* backing_store_lookup will be called from tasks in the application space */
447 backing_store_t
448 backing_store_lookup(
449 MACH_PORT_FACE port)
450 {
451 backing_store_t bs;
452
453 /*
454 The port is currently backed with a vs structure in the alias field.
455 We could create an ISBS alias and a port_is_bs call, but frankly
456 I see no reason for the test; the bs->bs_port == port check below
457 will work properly on junk entries.
458
459 if ((port == MACH_PORT_NULL) || port_is_vs(port))
460 */
461 if (port == MACH_PORT_NULL)
462 return BACKING_STORE_NULL;
463
464 BSL_LOCK();
465 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
466 bs_links) {
467 BS_LOCK(bs);
468 if (bs->bs_port == port) {
469 BSL_UNLOCK();
470 /* Success, return it locked. */
471 return bs;
472 }
473 BS_UNLOCK(bs);
474 }
475 BSL_UNLOCK();
476 return BACKING_STORE_NULL;
477 }
478
479 void backing_store_add(backing_store_t); /* forward */
480
481 void
482 backing_store_add(
483 __unused backing_store_t bs)
484 {
485 // MACH_PORT_FACE port = bs->bs_port;
486 // MACH_PORT_FACE pset = default_pager_default_set;
487 kern_return_t kr = KERN_SUCCESS;
488
489 if (kr != KERN_SUCCESS)
490 panic("backing_store_add: add to set");
491
492 }
493
494 /*
495 * Set up the default cluster shift, but only if not already
496 * set and argument is within range.
497 */
498 boolean_t
499 bs_set_default_clsize(unsigned int npages)
500 {
501 switch(npages){
502 case 1:
503 case 2:
504 case 4:
505 case 8:
506 if (default_pager_clsize == 0) /* if not yet set */
507 vstruct_def_clshift = local_log2(npages);
508 return(TRUE);
509 }
510 return(FALSE);
511 }
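/*
 * Usage sketch (not compiled): the accepted cluster sizes and the shift
 * they produce, assuming the global cluster size has not been fixed yet.
 * The function name is illustrative only.
 */
#if 0
static void
bs_set_default_clsize_example(void)
{
	(void) bs_set_default_clsize(4);	/* TRUE: vstruct_def_clshift = local_log2(4) = 2 */
	(void) bs_set_default_clsize(3);	/* FALSE: only 1, 2, 4 or 8 pages are accepted */
}
#endif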
512
513 int bs_get_global_clsize(int clsize); /* forward */
514
515 int
516 bs_get_global_clsize(
517 int clsize)
518 {
519 int i;
520 memory_object_default_t dmm;
521 kern_return_t kr;
522
523 /*
524 * Only allow setting of cluster size once. If called
525 * with no cluster size (default), we use the compiled-in default
526 * for the duration. The same cluster size is used for all
527 * paging segments.
528 */
529 if (default_pager_clsize == 0) {
530 /*
531 * Keep the cluster size as a bit shift: the arithmetic is quicker
532 * and it is easier to keep at a power of 2.
533 */
534 if (clsize != NO_CLSIZE) {
535 for (i = 0; (1 << i) < clsize; i++);
536 if (i > MAX_CLUSTER_SHIFT)
537 i = MAX_CLUSTER_SHIFT;
538 vstruct_def_clshift = i;
539 }
540 default_pager_clsize = (1 << vstruct_def_clshift);
541
542 /*
543 * Let the user know the new (and definitive) cluster size.
544 */
545 if (verbose)
546 printf("%scluster size = %d page%s\n",
547 my_name, default_pager_clsize,
548 (default_pager_clsize == 1) ? "" : "s");
549
550 /*
551 * Let the kernel know too, in case it hasn't used the
552 * default value provided in main() yet.
553 */
554 dmm = default_pager_object;
555 clsize = default_pager_clsize * vm_page_size; /* in bytes */
556 kr = host_default_memory_manager(host_priv_self(),
557 &dmm,
558 clsize);
559 memory_object_default_deallocate(dmm);
560
561 if (kr != KERN_SUCCESS) {
562 panic("bs_get_global_cl_size:host_default_memory_manager");
563 }
564 if (dmm != default_pager_object) {
565 panic("bs_get_global_cl_size:there is another default pager");
566 }
567 }
568 ASSERT(default_pager_clsize > 0 &&
569 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
570
571 return default_pager_clsize;
572 }
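/*
 * Worked example (sketch only, not compiled): a first call with clsize == 7
 * pages walks the loop above to i == 3, the smallest shift with
 * (1 << i) >= 7, so the definitive cluster size becomes 8 pages.  Any later
 * call, whatever its argument, returns that already-fixed value.  The
 * function name is illustrative only.
 */
#if 0
static void
bs_get_global_clsize_example(void)
{
	int pages;

	pages = bs_get_global_clsize(7);	/* fixes the size: returns 8 */
	pages = bs_get_global_clsize(2);	/* already set: still returns 8 */
}
#endif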
573
574 kern_return_t
575 default_pager_backing_store_create(
576 memory_object_default_t pager,
577 int priority,
578 int clsize, /* in bytes */
579 MACH_PORT_FACE *backing_store)
580 {
581 backing_store_t bs;
582 MACH_PORT_FACE port;
583 // kern_return_t kr;
584 struct vstruct_alias *alias_struct;
585
586 if (pager != default_pager_object)
587 return KERN_INVALID_ARGUMENT;
588
589 bs = backing_store_alloc();
590 port = ipc_port_alloc_kernel();
591 ipc_port_make_send(port);
592 assert (port != IP_NULL);
593
594 DP_DEBUG(DEBUG_BS_EXTERNAL,
595 ("priority=%d clsize=%d bs_port=0x%x\n",
596 priority, clsize, (int) backing_store));
597
598 alias_struct = (struct vstruct_alias *)
599 kalloc(sizeof (struct vstruct_alias));
600 if(alias_struct != NULL) {
601 alias_struct->vs = (struct vstruct *)bs;
602 alias_struct->name = &default_pager_ops;
603 port->ip_alias = (uintptr_t) alias_struct;
604 }
605 else {
606 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
607
608 BS_LOCK_DESTROY(bs);
609 kfree(bs, sizeof (struct backing_store));
610
611 return KERN_RESOURCE_SHORTAGE;
612 }
613
614 bs->bs_port = port;
615 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
616 priority = BS_MAXPRI;
617 else if (priority == BS_NOPRI)
618 priority = BS_MAXPRI;
619 else
620 priority = BS_MINPRI;
621 bs->bs_priority = priority;
622
623 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
624
625 BSL_LOCK();
626 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
627 bs_links);
628 BSL_UNLOCK();
629
630 backing_store_add(bs);
631
632 *backing_store = port;
633 return KERN_SUCCESS;
634 }
635
636 kern_return_t
637 default_pager_backing_store_info(
638 MACH_PORT_FACE backing_store,
639 backing_store_flavor_t flavour,
640 backing_store_info_t info,
641 mach_msg_type_number_t *size)
642 {
643 backing_store_t bs;
644 backing_store_basic_info_t basic;
645 int i;
646 paging_segment_t ps;
647
648 if (flavour != BACKING_STORE_BASIC_INFO ||
649 *size < BACKING_STORE_BASIC_INFO_COUNT)
650 return KERN_INVALID_ARGUMENT;
651
652 basic = (backing_store_basic_info_t)info;
653 *size = BACKING_STORE_BASIC_INFO_COUNT;
654
655 VSTATS_LOCK(&global_stats.gs_lock);
656 basic->pageout_calls = global_stats.gs_pageout_calls;
657 basic->pagein_calls = global_stats.gs_pagein_calls;
658 basic->pages_in = global_stats.gs_pages_in;
659 basic->pages_out = global_stats.gs_pages_out;
660 basic->pages_unavail = global_stats.gs_pages_unavail;
661 basic->pages_init = global_stats.gs_pages_init;
662 basic->pages_init_writes= global_stats.gs_pages_init_writes;
663 VSTATS_UNLOCK(&global_stats.gs_lock);
664
665 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
666 return KERN_INVALID_ARGUMENT;
667
668 basic->bs_pages_total = bs->bs_pages_total;
669 PSL_LOCK();
670 bs->bs_pages_free = 0;
671 for (i = 0; i <= paging_segment_max; i++) {
672 ps = paging_segments[i];
673 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
674 PS_LOCK(ps);
675 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
676 PS_UNLOCK(ps);
677 }
678 }
679 PSL_UNLOCK();
680 basic->bs_pages_free = bs->bs_pages_free;
681 basic->bs_pages_in = bs->bs_pages_in;
682 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
683 basic->bs_pages_out = bs->bs_pages_out;
684 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
685
686 basic->bs_priority = bs->bs_priority;
687 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
688
689 BS_UNLOCK(bs);
690
691 return KERN_SUCCESS;
692 }
693
694 int ps_delete(paging_segment_t); /* forward */
695 boolean_t current_thread_aborted(void);
696
697 int
698 ps_delete(
699 paging_segment_t ps)
700 {
701 vstruct_t vs;
702 kern_return_t error = KERN_SUCCESS;
703 int vs_count;
704
705 VSL_LOCK(); /* get the lock on the list of vs's */
706
707 /* The lock relationship and sequence is fairly complicated */
708 /* this code looks at a live list, locking and unlocking the list */
709 /* as it traverses it. It depends on the locking behavior of */
710 /* default_pager_no_senders. no_senders always locks the vstruct */
711 /* targeted for removal before locking the vstruct list. However */
712 /* it will remove that member of the list without locking its */
713 /* neighbors. We can be sure when we hold a lock on a vstruct */
714 /* it cannot be removed from the list but we must hold the list */
715 /* lock to be sure that its pointers to its neighbors are valid. */
716 /* Also, we can hold off destruction of a vstruct when the list */
717 /* lock and the vs locks are not being held by bumping the */
718 /* vs_async_pending count. */
719
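	/*
	 * The pattern described above, distilled into a sketch (not compiled):
	 * bump vs_async_pending while the locks are held so the vstruct cannot
	 * be torn down, drop the locks to do the transfer, then retake the vs
	 * lock, drop the reference and wake any waiter.  This mirrors the loop
	 * below.
	 */
#if 0
	VS_LOCK(vs);
	vs_async_wait(vs);		/* drain pending async writes */
	vs->vs_async_pending += 1;	/* pin the vstruct */
	VS_UNLOCK(vs);
	VSL_UNLOCK();

	/* ... transfer clusters out of the segment being deleted ... */

	VS_LOCK(vs);
	vs->vs_async_pending -= 1;
	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
		vs->vs_waiting_async = FALSE;
		VS_UNLOCK(vs);
		thread_wakeup(&vs->vs_async_pending);
	} else {
		VS_UNLOCK(vs);
	}
#endif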
720
721 while(backing_store_release_trigger_disable != 0) {
722 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
723 }
724
725 /* we will choose instead to hold a send right */
726 vs_count = vstruct_list.vsl_count;
727 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
728 if(vs == (vstruct_t)&vstruct_list) {
729 VSL_UNLOCK();
730 return KERN_SUCCESS;
731 }
732 VS_LOCK(vs);
733 vs_async_wait(vs); /* wait for any pending async writes */
734 if ((vs_count != 0) && (vs != NULL))
735 vs->vs_async_pending += 1; /* hold parties calling */
736 /* vs_async_wait */
737
738 if (bs_low == FALSE)
739 backing_store_abort_compaction = FALSE;
740
741 VS_UNLOCK(vs);
742 VSL_UNLOCK();
743 while((vs_count != 0) && (vs != NULL)) {
744 /* We take the count of AMO's before beginning the */
745 /* transfer of the target segment. */
746 /* We are guaranteed that the target segment cannot get */
747 /* more users. We also know that queue entries are */
748 /* made at the back of the list. If some of the entries */
749 /* we would check disappear while we are traversing the */
750 /* list then we will either check new entries which */
751 /* do not have any backing store in the target segment */
752 /* or re-check old entries. This might not be optimal */
753 /* but it will always be correct. The alternative is to */
754 /* take a snapshot of the list. */
755 vstruct_t next_vs;
756
757 if(dp_pages_free < cluster_transfer_minimum)
758 error = KERN_FAILURE;
759 else {
760 vm_object_t transfer_object;
761 unsigned int count;
762 upl_t upl;
763 upl_control_flags_t upl_flags;
764
765 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
766 count = 0;
767 upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
768 UPL_SET_LITE | UPL_SET_INTERNAL);
769 if (dp_encryption) {
770 /* mark the pages as "encrypted" when they come in */
771 upl_flags |= UPL_ENCRYPT;
772 }
773 error = vm_object_upl_request(transfer_object,
774 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
775 &upl, NULL, &count, upl_flags);
776
777 if(error == KERN_SUCCESS) {
778 error = ps_vstruct_transfer_from_segment(
779 vs, ps, upl);
780 upl_commit(upl, NULL, 0);
781 upl_deallocate(upl);
782 } else {
783 error = KERN_FAILURE;
784 }
785 vm_object_deallocate(transfer_object);
786 }
787 if(error || current_thread_aborted()) {
788 VS_LOCK(vs);
789 vs->vs_async_pending -= 1; /* release vs_async_wait */
790 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
791 vs->vs_waiting_async = FALSE;
792 VS_UNLOCK(vs);
793 thread_wakeup(&vs->vs_async_pending);
794 } else {
795 VS_UNLOCK(vs);
796 }
797 return KERN_FAILURE;
798 }
799
800 VSL_LOCK();
801
802 while(backing_store_release_trigger_disable != 0) {
803 VSL_SLEEP(&backing_store_release_trigger_disable,
804 THREAD_UNINT);
805 }
806
807 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
808 if((next_vs != (vstruct_t)&vstruct_list) &&
809 (vs != next_vs) && (vs_count != 1)) {
810 VS_LOCK(next_vs);
811 vs_async_wait(next_vs); /* wait for any */
812 /* pending async writes */
813 next_vs->vs_async_pending += 1; /* hold parties */
814 /* calling vs_async_wait */
815 VS_UNLOCK(next_vs);
816 }
817 VSL_UNLOCK();
818 VS_LOCK(vs);
819 vs->vs_async_pending -= 1;
820 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
821 vs->vs_waiting_async = FALSE;
822 VS_UNLOCK(vs);
823 thread_wakeup(&vs->vs_async_pending);
824 } else {
825 VS_UNLOCK(vs);
826 }
827 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
828 vs = NULL;
829 else
830 vs = next_vs;
831 vs_count--;
832 }
833 return KERN_SUCCESS;
834 }
835
836
837 kern_return_t
838 default_pager_backing_store_delete_internal(
839 MACH_PORT_FACE backing_store)
840 {
841 backing_store_t bs;
842 int i;
843 paging_segment_t ps;
844 int error;
845 int interim_pages_removed = 0;
846 boolean_t dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
847
848 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
849 return KERN_INVALID_ARGUMENT;
850
851 restart:
852 PSL_LOCK();
853 error = KERN_SUCCESS;
854 for (i = 0; i <= paging_segment_max; i++) {
855 ps = paging_segments[i];
856 if (ps != PAGING_SEGMENT_NULL &&
857 ps->ps_bs == bs &&
858 ! IS_PS_GOING_AWAY(ps)) {
859 PS_LOCK(ps);
860
861 if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
862 /*
863 * Someone is already busy reclaiming this paging segment.
864 * If it's the emergency segment we are looking at then check
865 * that someone has not already recovered it and set the right
866 * state i.e. online but not activated.
867 */
868 PS_UNLOCK(ps);
869 continue;
870 }
871
872 /* disable access to this segment */
873 ps->ps_state &= ~PS_CAN_USE;
874 ps->ps_state |= PS_GOING_AWAY;
875 PS_UNLOCK(ps);
876 /*
877 * The "ps" segment is "off-line" now,
878 * we can try and delete it...
879 */
880 if(dp_pages_free < (cluster_transfer_minimum
881 + ps->ps_pgcount)) {
882 error = KERN_FAILURE;
883 PSL_UNLOCK();
884 }
885 else {
886 /* remove all pages associated with the */
887 /* segment from the list of free pages */
888 /* when transfer is through, all target */
889 /* segment pages will appear to be free */
890
891 dp_pages_free -= ps->ps_pgcount;
892 interim_pages_removed += ps->ps_pgcount;
893 PSL_UNLOCK();
894 error = ps_delete(ps);
895 }
896 if (error != KERN_SUCCESS) {
897 /*
898 * We couldn't delete the segment,
899 * probably because there's not enough
900 * virtual memory left.
901 * Re-enable all the segments.
902 */
903 PSL_LOCK();
904 break;
905 }
906 goto restart;
907 }
908 }
909
910 if (error != KERN_SUCCESS) {
911 for (i = 0; i <= paging_segment_max; i++) {
912 ps = paging_segments[i];
913 if (ps != PAGING_SEGMENT_NULL &&
914 ps->ps_bs == bs &&
915 IS_PS_GOING_AWAY(ps)) {
916 PS_LOCK(ps);
917
918 if( !IS_PS_GOING_AWAY(ps)) {
919 PS_UNLOCK(ps);
920 continue;
921 }
922 /* Handle the special clusters that came in while we let go of the lock */
923 if( ps->ps_special_clusters) {
924 dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
925 ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
926 ps->ps_clcount += ps->ps_special_clusters;
927 if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
928 ps_select_array[ps->ps_bs->bs_priority] = 0;
929 }
930 ps->ps_special_clusters = 0;
931 }
932 /* re-enable access to this segment */
933 ps->ps_state &= ~PS_GOING_AWAY;
934 ps->ps_state |= PS_CAN_USE;
935 PS_UNLOCK(ps);
936 }
937 }
938 dp_pages_free += interim_pages_removed;
939 PSL_UNLOCK();
940 BS_UNLOCK(bs);
941 return error;
942 }
943
944 for (i = 0; i <= paging_segment_max; i++) {
945 ps = paging_segments[i];
946 if (ps != PAGING_SEGMENT_NULL &&
947 ps->ps_bs == bs) {
948 if(IS_PS_GOING_AWAY(ps)) {
949 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
950 PS_LOCK(ps);
951 ps->ps_state &= ~PS_GOING_AWAY;
952 ps->ps_special_clusters = 0;
953 ps->ps_pgcount = ps->ps_pgnum;
954 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
955 dp_pages_reserve += ps->ps_pgcount;
956 PS_UNLOCK(ps);
957 } else {
958 paging_segments[i] = PAGING_SEGMENT_NULL;
959 paging_segment_count--;
960 PS_LOCK(ps);
961 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
962 kfree(ps, sizeof *ps);
963 }
964 }
965 }
966 }
967
968 /* Scan the entire ps array separately to make certain we find the */
969 /* proper paging_segment_max */
970 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
971 if(paging_segments[i] != PAGING_SEGMENT_NULL)
972 paging_segment_max = i;
973 }
974
975 PSL_UNLOCK();
976
977 if( dealing_with_emergency_segment ) {
978 BS_UNLOCK(bs);
979 return KERN_SUCCESS;
980 }
981
982 /*
983 * All the segments have been deleted.
984 * We can remove the backing store.
985 */
986
987 /*
988 * Disable lookups of this backing store.
989 */
990 if((void *)bs->bs_port->ip_alias != NULL)
991 kfree((void *) bs->bs_port->ip_alias,
992 sizeof (struct vstruct_alias));
993 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
994 bs->bs_port = MACH_PORT_NULL;
995 BS_UNLOCK(bs);
996
997 /*
998 * Remove backing store from backing_store list.
999 */
1000 BSL_LOCK();
1001 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
1002 bs_links);
1003 BSL_UNLOCK();
1004
1005 /*
1006 * Free the backing store structure.
1007 */
1008 BS_LOCK_DESTROY(bs);
1009 kfree(bs, sizeof *bs);
1010
1011 return KERN_SUCCESS;
1012 }
1013
1014 kern_return_t
1015 default_pager_backing_store_delete(
1016 MACH_PORT_FACE backing_store)
1017 {
1018 if( backing_store != emergency_segment_backing_store ) {
1019 default_pager_backing_store_delete_internal(emergency_segment_backing_store);
1020 }
1021 return(default_pager_backing_store_delete_internal(backing_store));
1022 }
1023
1024 int ps_enter(paging_segment_t); /* forward */
1025
1026 int
1027 ps_enter(
1028 paging_segment_t ps)
1029 {
1030 int i;
1031
1032 PSL_LOCK();
1033
1034 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
1035 if (paging_segments[i] == PAGING_SEGMENT_NULL)
1036 break;
1037 }
1038
1039 if (i < MAX_NUM_PAGING_SEGMENTS) {
1040 paging_segments[i] = ps;
1041 if (i > paging_segment_max)
1042 paging_segment_max = i;
1043 paging_segment_count++;
1044 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1045 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1046 ps_select_array[ps->ps_bs->bs_priority] = 0;
1047 i = 0;
1048 } else {
1049 PSL_UNLOCK();
1050 return KERN_RESOURCE_SHORTAGE;
1051 }
1052
1053 PSL_UNLOCK();
1054 return i;
1055 }
1056
1057 #ifdef DEVICE_PAGING
1058 kern_return_t
1059 default_pager_add_segment(
1060 MACH_PORT_FACE backing_store,
1061 MACH_PORT_FACE device,
1062 recnum_t offset,
1063 recnum_t count,
1064 int record_size)
1065 {
1066 backing_store_t bs;
1067 paging_segment_t ps;
1068 int i;
1069 int error;
1070
1071 if ((bs = backing_store_lookup(backing_store))
1072 == BACKING_STORE_NULL)
1073 return KERN_INVALID_ARGUMENT;
1074
1075 PSL_LOCK();
1076 for (i = 0; i <= paging_segment_max; i++) {
1077 ps = paging_segments[i];
1078 if (ps == PAGING_SEGMENT_NULL)
1079 continue;
1080
1081 /*
1082 * Check for overlap on the same device: reject if the [offset, offset + count) range intersects this segment's records.
1083 */
1084 if (!(ps->ps_device != device
1085 || offset >= ps->ps_offset + ps->ps_recnum
1086 || offset + count <= ps->ps_offset)) {
1087 PSL_UNLOCK();
1088 BS_UNLOCK(bs);
1089 return KERN_INVALID_ARGUMENT;
1090 }
1091 }
1092 PSL_UNLOCK();
1093
1094 /*
1095 * Set up the paging segment
1096 */
1097 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1098 if (ps == PAGING_SEGMENT_NULL) {
1099 BS_UNLOCK(bs);
1100 return KERN_RESOURCE_SHORTAGE;
1101 }
1102
1103 ps->ps_segtype = PS_PARTITION;
1104 ps->ps_device = device;
1105 ps->ps_offset = offset;
1106 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1107 ps->ps_recnum = count;
1108 ps->ps_pgnum = count >> ps->ps_record_shift;
1109
1110 ps->ps_pgcount = ps->ps_pgnum;
1111 ps->ps_clshift = local_log2(bs->bs_clsize);
1112 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1113 ps->ps_hint = 0;
1114
1115 PS_LOCK_INIT(ps);
1116 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1117 if (!ps->ps_bmap) {
1118 PS_LOCK_DESTROY(ps);
1119 kfree(ps, sizeof *ps);
1120 BS_UNLOCK(bs);
1121 return KERN_RESOURCE_SHORTAGE;
1122 }
1123 for (i = 0; i < ps->ps_ncls; i++) {
1124 clrbit(ps->ps_bmap, i);
1125 }
1126
1127 if(paging_segment_count == 0) {
1128 ps->ps_state = PS_EMERGENCY_SEGMENT;
1129 if(use_emergency_swap_file_first) {
1130 ps->ps_state |= PS_CAN_USE;
1131 }
1132 } else {
1133 ps->ps_state = PS_CAN_USE;
1134 }
1135
1136 ps->ps_bs = bs;
1137
1138 if ((error = ps_enter(ps)) != 0) {
1139 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1140
1141 PS_LOCK_DESTROY(ps);
1142 kfree(ps, sizeof *ps);
1143 BS_UNLOCK(bs);
1144 return KERN_RESOURCE_SHORTAGE;
1145 }
1146
1147 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1148 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1149 BS_UNLOCK(bs);
1150
1151 PSL_LOCK();
1152 if(IS_PS_OK_TO_USE(ps)) {
1153 dp_pages_free += ps->ps_pgcount;
1154 } else {
1155 dp_pages_reserve += ps->ps_pgcount;
1156 }
1157 PSL_UNLOCK();
1158
1159 bs_more_space(ps->ps_clcount);
1160
1161 DP_DEBUG(DEBUG_BS_INTERNAL,
1162 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1163 device, offset, count, record_size,
1164 ps->ps_record_shift, ps->ps_pgnum));
1165
1166 return KERN_SUCCESS;
1167 }
1168
1169 boolean_t
1170 bs_add_device(
1171 char *dev_name,
1172 MACH_PORT_FACE master)
1173 {
1174 security_token_t null_security_token = {
1175 { 0, 0 }
1176 };
1177 MACH_PORT_FACE device;
1178 int info[DEV_GET_SIZE_COUNT];
1179 mach_msg_type_number_t info_count;
1180 MACH_PORT_FACE bs = MACH_PORT_NULL;
1181 unsigned int rec_size;
1182 recnum_t count;
1183 int clsize;
1184 MACH_PORT_FACE reply_port;
1185
1186 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1187 null_security_token, dev_name, &device))
1188 return FALSE;
1189
1190 info_count = DEV_GET_SIZE_COUNT;
1191 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1192 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1193 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1194 clsize = bs_get_global_clsize(0);
1195 if (!default_pager_backing_store_create(
1196 default_pager_object,
1197 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1198 (clsize * vm_page_size),
1199 &bs)) {
1200 if (!default_pager_add_segment(bs, device,
1201 0, count, rec_size)) {
1202 return TRUE;
1203 }
1204 ipc_port_release_receive(bs);
1205 }
1206 }
1207
1208 ipc_port_release_send(device);
1209 return FALSE;
1210 }
1211 #endif /* DEVICE_PAGING */
1212
1213 #if VS_ASYNC_REUSE
1214
1215 struct vs_async *
1216 vs_alloc_async(void)
1217 {
1218 struct vs_async *vsa;
1219 MACH_PORT_FACE reply_port;
1220 // kern_return_t kr;
1221
1222 VS_ASYNC_LOCK();
1223 if (vs_async_free_list == NULL) {
1224 VS_ASYNC_UNLOCK();
1225 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1226 if (vsa != NULL) {
1227 /*
1228 * Try allocating a reply port named after the
1229 * address of the vs_async structure.
1230 */
1231 struct vstruct_alias *alias_struct;
1232
1233 reply_port = ipc_port_alloc_kernel();
1234 alias_struct = (struct vstruct_alias *)
1235 kalloc(sizeof (struct vstruct_alias));
1236 if(alias_struct != NULL) {
1237 __IGNORE_WCASTALIGN(alias_struct->vs = (struct vstruct *)vsa);
1238 alias_struct->name = &default_pager_ops;
1239 reply_port->ip_alias = (uintptr_t) alias_struct;
1240 vsa->reply_port = reply_port;
1241 vs_alloc_async_count++;
1242 }
1243 else {
1244 vs_alloc_async_failed++;
1245 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1246 (reply_port));
1247 kfree(vsa, sizeof (struct vs_async));
1248 vsa = NULL;
1249 }
1250 }
1251 } else {
1252 vsa = vs_async_free_list;
1253 vs_async_free_list = vs_async_free_list->vsa_next;
1254 VS_ASYNC_UNLOCK();
1255 }
1256
1257 return vsa;
1258 }
1259
1260 void
1261 vs_free_async(
1262 struct vs_async *vsa)
1263 {
1264 VS_ASYNC_LOCK();
1265 vsa->vsa_next = vs_async_free_list;
1266 vs_async_free_list = vsa;
1267 VS_ASYNC_UNLOCK();
1268 }
1269
1270 #else /* VS_ASYNC_REUSE */
1271
1272 struct vs_async *
1273 vs_alloc_async(void)
1274 {
1275 struct vs_async *vsa;
1276 MACH_PORT_FACE reply_port;
1277 kern_return_t kr; struct vstruct_alias *alias_struct;
1278
1279 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1280 if (vsa != NULL) {
1281 /*
1282 * Try allocating a reply port named after the
1283 * address of the vs_async structure.
1284 */
1285 reply_port = ipc_port_alloc_kernel();
1286 alias_struct = (struct vstruct_alias *)
1287 kalloc(sizeof (struct vstruct_alias));
1288 if(alias_struct != NULL) {
1289 alias_struct->vs = (struct vstruct *)vsa;
1290 alias_struct->name = &default_pager_ops;
1291 reply_port->ip_alias = (uintptr_t) alias_struct;
1292 vsa->reply_port = reply_port;
1293 vs_alloc_async_count++;
1294 }
1295 else {
1296 vs_alloc_async_failed++;
1297 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1298 (reply_port));
1299 kfree(vsa, sizeof (struct vs_async));
1300 vsa = NULL;
1301 }
1302 }
1303
1304 return vsa;
1305 }
1306
1307 void
1308 vs_free_async(
1309 struct vs_async *vsa)
1310 {
1311 MACH_PORT_FACE reply_port;
1312 kern_return_t kr;
1313
1314 reply_port = vsa->reply_port;
1315 kfree((void *) reply_port->ip_alias, sizeof (struct vstruct_alias));
1316 kfree(vsa, sizeof (struct vs_async));
1317 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1318 #if 0
1319 VS_ASYNC_LOCK();
1320 vs_alloc_async_count--;
1321 VS_ASYNC_UNLOCK();
1322 #endif
1323 }
1324
1325 #endif /* VS_ASYNC_REUSE */
1326
1327 zone_t vstruct_zone;
1328
1329 vstruct_t
1330 ps_vstruct_create(
1331 dp_size_t size)
1332 {
1333 vstruct_t vs;
1334 unsigned int i;
1335
1336 vs = (vstruct_t) zalloc(vstruct_zone);
1337 if (vs == VSTRUCT_NULL) {
1338 return VSTRUCT_NULL;
1339 }
1340
1341 VS_LOCK_INIT(vs);
1342
1343 /*
1344 * The following fields will be provided later.
1345 */
1346 vs->vs_pager_ops = NULL;
1347 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1348 vs->vs_references = 1;
1349 vs->vs_seqno = 0;
1350
1351 vs->vs_waiting_seqno = FALSE;
1352 vs->vs_waiting_read = FALSE;
1353 vs->vs_waiting_write = FALSE;
1354 vs->vs_waiting_async = FALSE;
1355
1356 vs->vs_readers = 0;
1357 vs->vs_writers = 0;
1358
1359 vs->vs_errors = 0;
1360
1361 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1362 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1363 vs->vs_async_pending = 0;
1364
1365 /*
1366 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1367 * depending on the size of the memory object.
1368 */
1369 if (INDIRECT_CLMAP(vs->vs_size)) {
1370 vs->vs_imap = (struct vs_map **)
1371 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1372 vs->vs_indirect = TRUE;
1373 } else {
1374 vs->vs_dmap = (struct vs_map *)
1375 kalloc(CLMAP_SIZE(vs->vs_size));
1376 vs->vs_indirect = FALSE;
1377 }
1378 vs->vs_xfer_pending = FALSE;
1379 DP_DEBUG(DEBUG_VS_INTERNAL,
1380 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1381
1382 /*
1383 * Check to see that we got the space.
1384 */
1385 if (!vs->vs_dmap) {
1386 kfree(vs, sizeof *vs);
1387 return VSTRUCT_NULL;
1388 }
1389
1390 /*
1391 * Zero the indirect pointers, or clear the direct pointers.
1392 */
1393 if (vs->vs_indirect)
1394 memset(vs->vs_imap, 0,
1395 INDIRECT_CLMAP_SIZE(vs->vs_size));
1396 else
1397 for (i = 0; i < vs->vs_size; i++)
1398 VSM_CLR(vs->vs_dmap[i]);
1399
1400 VS_MAP_LOCK_INIT(vs);
1401
1402 bs_commit(vs->vs_size);
1403
1404 return vs;
1405 }
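/*
 * Worked example (sketch only, not compiled) of the vs_size computation
 * above, assuming 4K pages and the default cluster shift of 2: a 1MB object
 * is 256 pages, so vs_size = ((256 - 1) >> 2) + 1 = 64 clusters of 16KB
 * each.  The function name is illustrative only.
 */
#if 0
static void
vstruct_size_example(void)
{
	vstruct_t vs;

	vs = ps_vstruct_create((dp_size_t)(1024 * 1024));
	/* with the assumptions above: vs->vs_clshift == 2, vs->vs_size == 64 */
}
#endif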
1406
1407 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1408
1409 paging_segment_t
1410 ps_select_segment(
1411 unsigned int shift,
1412 int *psindex)
1413 {
1414 paging_segment_t ps;
1415 int i;
1416 int j;
1417
1418 /*
1419 * Optimize case where there's only one segment.
1420 * paging_segment_max will index the one and only segment.
1421 */
1422
1423 PSL_LOCK();
1424 if (paging_segment_count == 1) {
1425 paging_segment_t lps = PAGING_SEGMENT_NULL; /* used to avoid extra PS_UNLOCK */
1426 ipc_port_t trigger = IP_NULL;
1427
1428 ps = paging_segments[paging_segment_max];
1429 *psindex = paging_segment_max;
1430 PS_LOCK(ps);
1431 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1432 panic("Emergency paging segment missing\n");
1433 }
1434 ASSERT(ps->ps_clshift >= shift);
1435 if(IS_PS_OK_TO_USE(ps)) {
1436 if (ps->ps_clcount) {
1437 ps->ps_clcount--;
1438 dp_pages_free -= 1 << ps->ps_clshift;
1439 ps->ps_pgcount -= 1 << ps->ps_clshift;
1440 if(min_pages_trigger_port &&
1441 (dp_pages_free < minimum_pages_remaining)) {
1442 trigger = min_pages_trigger_port;
1443 min_pages_trigger_port = NULL;
1444 bs_low = TRUE;
1445 backing_store_abort_compaction = TRUE;
1446 }
1447 lps = ps;
1448 }
1449 }
1450 PS_UNLOCK(ps);
1451
1452 if( lps == PAGING_SEGMENT_NULL ) {
1453 if(dp_pages_free) {
1454 dp_pages_free_drift_count++;
1455 if(dp_pages_free > dp_pages_free_drifted_max) {
1456 dp_pages_free_drifted_max = dp_pages_free;
1457 }
1458 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1459 }
1460 dp_pages_free = 0;
1461 }
1462
1463 PSL_UNLOCK();
1464
1465 if (trigger != IP_NULL) {
1466 dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1467
1468 default_pager_space_alert(trigger, HI_WAT_ALERT);
1469 ipc_port_release_send(trigger);
1470 }
1471 return lps;
1472 }
1473
1474 if (paging_segment_count == 0) {
1475 if(dp_pages_free) {
1476 dp_pages_free_drift_count++;
1477 if(dp_pages_free > dp_pages_free_drifted_max) {
1478 dp_pages_free_drifted_max = dp_pages_free;
1479 }
1480 dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1481 }
1482 dp_pages_free = 0;
1483 PSL_UNLOCK();
1484 return PAGING_SEGMENT_NULL;
1485 }
1486
1487 for (i = BS_MAXPRI;
1488 i >= BS_MINPRI; i--) {
1489 int start_index;
1490
1491 if ((ps_select_array[i] == BS_NOPRI) ||
1492 (ps_select_array[i] == BS_FULLPRI))
1493 continue;
1494 start_index = ps_select_array[i];
1495
1496 if(!(paging_segments[start_index])) {
1497 j = start_index+1;
1498 physical_transfer_cluster_count = 0;
1499 }
1500 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1501 (((paging_segments[start_index])->ps_clshift)
1502 + vm_page_shift))) {
1503 physical_transfer_cluster_count = 0;
1504 j = start_index + 1;
1505 } else {
1506 physical_transfer_cluster_count+=1;
1507 j = start_index;
1508 if(start_index == 0)
1509 start_index = paging_segment_max;
1510 else
1511 start_index = start_index - 1;
1512 }
1513
1514 while (1) {
1515 if (j > paging_segment_max)
1516 j = 0;
1517 if ((ps = paging_segments[j]) &&
1518 (ps->ps_bs->bs_priority == i)) {
1519 /*
1520 * Force the ps cluster size to be
1521 * >= that of the vstruct.
1522 */
1523 PS_LOCK(ps);
1524 if (IS_PS_OK_TO_USE(ps)) {
1525 if ((ps->ps_clcount) &&
1526 (ps->ps_clshift >= shift)) {
1527 ipc_port_t trigger = IP_NULL;
1528
1529 ps->ps_clcount--;
1530 dp_pages_free -= 1 << ps->ps_clshift;
1531 ps->ps_pgcount -= 1 << ps->ps_clshift;
1532 if(min_pages_trigger_port &&
1533 (dp_pages_free <
1534 minimum_pages_remaining)) {
1535 trigger = min_pages_trigger_port;
1536 min_pages_trigger_port = NULL;
1537 bs_low = TRUE;
1538 backing_store_abort_compaction = TRUE;
1539 }
1540 PS_UNLOCK(ps);
1541 /*
1542 * found one, quit looking.
1543 */
1544 ps_select_array[i] = j;
1545 PSL_UNLOCK();
1546
1547 if (trigger != IP_NULL) {
1548 dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1549
1550 default_pager_space_alert(
1551 trigger,
1552 HI_WAT_ALERT);
1553 ipc_port_release_send(trigger);
1554 }
1555 *psindex = j;
1556 return ps;
1557 }
1558 }
1559 PS_UNLOCK(ps);
1560 }
1561 if (j == start_index) {
1562 /*
1563 * none at this priority -- mark it full
1564 */
1565 ps_select_array[i] = BS_FULLPRI;
1566 break;
1567 }
1568 j++;
1569 }
1570 }
1571
1572 if(dp_pages_free) {
1573 dp_pages_free_drift_count++;
1574 if(dp_pages_free > dp_pages_free_drifted_max) {
1575 dp_pages_free_drifted_max = dp_pages_free;
1576 }
1577 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1578 }
1579 dp_pages_free = 0;
1580 PSL_UNLOCK();
1581 return PAGING_SEGMENT_NULL;
1582 }
1583
1584 dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1585
1586 dp_offset_t
1587 ps_allocate_cluster(
1588 vstruct_t vs,
1589 int *psindex,
1590 paging_segment_t use_ps)
1591 {
1592 unsigned int byte_num;
1593 int bit_num = 0;
1594 paging_segment_t ps;
1595 dp_offset_t cluster;
1596 ipc_port_t trigger = IP_NULL;
1597
1598 /*
1599 * Find best paging segment.
1600 * ps_select_segment will decrement cluster count on ps.
1601 * Must pass cluster shift to find the most appropriate segment.
1602 */
1603 /* NOTE: The addition of paging segment delete capability threatened
1604 * to seriously complicate the treatment of paging segments in this
1605 * module and the ones that call it (notably ps_clmap), because of the
1606 * difficulty in assuring that the paging segment would continue to
1607 * exist between being unlocked and locked. This was
1608 * avoided because all calls to this module are based in either
1609 * dp_memory_object calls which rely on the vs lock, or by
1610 * the transfer function which is part of the segment delete path.
1611 * The transfer function which is part of paging segment delete is
1612 * protected from multiple callers by the backing store lock.
1613 * The paging segment delete function treats mappings to a paging
1614 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1615 * while data is transferred to the remaining segments. This is in
1616 * line with the view that incomplete or in-transition mappings between
1617 * data, a vstruct, and backing store are protected by the vs lock.
1618 * This and the ordering of the paging segment "going_away" bit setting
1619 * protects us.
1620 */
1621 retry:
1622 if (use_ps != PAGING_SEGMENT_NULL) {
1623 ps = use_ps;
1624 PSL_LOCK();
1625 PS_LOCK(ps);
1626
1627 ASSERT(ps->ps_clcount != 0);
1628
1629 ps->ps_clcount--;
1630 dp_pages_free -= 1 << ps->ps_clshift;
1631 ps->ps_pgcount -= 1 << ps->ps_clshift;
1632 if(min_pages_trigger_port &&
1633 (dp_pages_free < minimum_pages_remaining)) {
1634 trigger = min_pages_trigger_port;
1635 min_pages_trigger_port = NULL;
1636 bs_low = TRUE;
1637 backing_store_abort_compaction = TRUE;
1638 }
1639 PSL_UNLOCK();
1640 PS_UNLOCK(ps);
1641 if (trigger != IP_NULL) {
1642 dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1643
1644 default_pager_space_alert(trigger, HI_WAT_ALERT);
1645 ipc_port_release_send(trigger);
1646 }
1647
1648 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1649 PAGING_SEGMENT_NULL) {
1650 static clock_sec_t lastnotify = 0;
1651 clock_sec_t now;
1652 clock_nsec_t nanoseconds_dummy;
1653
1654 /*
1655 * Don't immediately jump to the emergency segment. Give the
1656 * dynamic pager a chance to create its first normal swap file.
1657 * Unless, of course, the very first normal swap file can't be
1658 * created due to some problem we didn't expect, i.e.
1659 * use_emergency_swap_file_first was never set to TRUE initially;
1660 * it then gets set in the swap file creation error handling.
1661 */
1662 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1663
1664 ps = paging_segments[EMERGENCY_PSEG_INDEX];
1665 if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1666 PSL_LOCK();
1667 PS_LOCK(ps);
1668
1669 if(IS_PS_GOING_AWAY(ps)) {
1670 /* Someone de-activated the emergency paging segment*/
1671 PS_UNLOCK(ps);
1672 PSL_UNLOCK();
1673
1674 } else if(dp_pages_free) {
1675 /*
1676 * Someone has already activated the emergency paging segment
1677 * OR
1678 * Between us having received a NULL segment from ps_select_segment
1679 * and reaching here a new normal segment could have been added.
1680 * E.g. we get NULL segment and another thread just added the
1681 * new swap file. Hence check to see if we have more dp_pages_free
1682 * before activating the emergency segment.
1683 */
1684 PS_UNLOCK(ps);
1685 PSL_UNLOCK();
1686 goto retry;
1687
1688 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1689 /*
1690 * PS_CAN_USE is only reset from the emergency segment when it's
1691 * been successfully recovered. So it's legal to have an emergency
1692 * segment that has PS_CAN_USE but no clusters because its recovery
1693 * failed.
1694 */
1695 backing_store_t bs = ps->ps_bs;
1696 ps->ps_state |= PS_CAN_USE;
1697 if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1698 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1699 ps_select_array[bs->bs_priority] = 0;
1700 }
1701 dp_pages_free += ps->ps_pgcount;
1702 dp_pages_reserve -= ps->ps_pgcount;
1703 PS_UNLOCK(ps);
1704 PSL_UNLOCK();
1705 dprintf(("Switching ON Emergency paging segment\n"));
1706 goto retry;
1707 }
1708
1709 PS_UNLOCK(ps);
1710 PSL_UNLOCK();
1711 }
1712 }
1713
1714 /*
1715 * Emit a notification of the low-paging resource condition
1716 * but don't issue it more than once every five seconds. This
1717 * prevents us from overflowing logs with thousands of
1718 * repetitions of the message.
1719 */
1720 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1721 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1722 /* With an activated emergency paging segment we still
1723 * didn't get any clusters. This could mean that the
1724 * emergency paging segment is exhausted.
1725 */
1726 dprintf(("System is out of paging space.\n"));
1727 lastnotify = now;
1728 }
1729
1730 PSL_LOCK();
1731
1732 if(min_pages_trigger_port) {
1733 trigger = min_pages_trigger_port;
1734 min_pages_trigger_port = NULL;
1735 bs_low = TRUE;
1736 backing_store_abort_compaction = TRUE;
1737 }
1738 PSL_UNLOCK();
1739 if (trigger != IP_NULL) {
1740 dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1741
1742 default_pager_space_alert(trigger, HI_WAT_ALERT);
1743 ipc_port_release_send(trigger);
1744 }
1745 return (dp_offset_t) -1;
1746 }
1747
1748 /*
1749 * Look for an available cluster. At the end of the loop,
1750 * byte_num is the byte offset and bit_num is the bit offset of the
1751 * first zero bit in the paging segment bitmap.
1752 */
1753 PS_LOCK(ps);
1754 byte_num = ps->ps_hint;
1755 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1756 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1757 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1758 if (isclr((ps->ps_bmap + byte_num), bit_num))
1759 break;
1760 }
1761 ASSERT(bit_num != NBBY);
1762 break;
1763 }
1764 }
1765 ps->ps_hint = byte_num;
1766 cluster = (byte_num*NBBY) + bit_num;
1767
1768 /* Space was reserved, so this must be true */
1769 ASSERT(cluster < ps->ps_ncls);
1770
1771 setbit(ps->ps_bmap, cluster);
1772 PS_UNLOCK(ps);
1773
1774 return cluster;
1775 }
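/*
 * The bitmap scan above, restated as a standalone sketch (not compiled).
 * It skips whole bytes that are already full (BYTEMASK) and then isolates
 * the first clear bit; the helper name is illustrative only.
 */
#if 0
static dp_offset_t
ps_first_free_cluster(paging_segment_t ps)
{
	unsigned int byte_num;
	int bit_num;

	for (byte_num = ps->ps_hint;
	     byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
		if (*(ps->ps_bmap + byte_num) == BYTEMASK)
			continue;		/* all NBBY clusters in this byte are in use */
		for (bit_num = 0; bit_num < NBBY; bit_num++) {
			if (isclr((ps->ps_bmap + byte_num), bit_num))
				return (byte_num * NBBY) + bit_num;
		}
	}
	return (dp_offset_t) -1;		/* no free cluster found */
}
#endif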
1776
1777 void ps_deallocate_cluster(paging_segment_t, dp_offset_t); /* forward */
1778
1779 void
1780 ps_deallocate_cluster(
1781 paging_segment_t ps,
1782 dp_offset_t cluster)
1783 {
1784
1785 if (cluster >= ps->ps_ncls)
1786 panic("ps_deallocate_cluster: Invalid cluster number");
1787
1788 /*
1789 * Lock the paging segment, clear the cluster's bit in the bitmap and
1790 * increment the number of free clusters.
1791 */
1792 PSL_LOCK();
1793 PS_LOCK(ps);
1794 clrbit(ps->ps_bmap, cluster);
1795 if( IS_PS_OK_TO_USE(ps)) {
1796 ++ps->ps_clcount;
1797 ps->ps_pgcount += 1 << ps->ps_clshift;
1798 dp_pages_free += 1 << ps->ps_clshift;
1799 } else {
1800 ps->ps_special_clusters += 1;
1801 }
1802
1803 /*
1804 * Move the hint down to the freed cluster if it is
1805 * less than the current hint.
1806 */
1807 if ((cluster/NBBY) < ps->ps_hint) {
1808 ps->ps_hint = (cluster/NBBY);
1809 }
1810
1811
1812 /*
1813 * If we're freeing space on a full priority, reset the array.
1814 */
1815 if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1816 ps_select_array[ps->ps_bs->bs_priority] = 0;
1817 PS_UNLOCK(ps);
1818 PSL_UNLOCK();
1819
1820 return;
1821 }
1822
1823 void ps_dealloc_vsmap(struct vs_map *, dp_size_t); /* forward */
1824
1825 void
1826 ps_dealloc_vsmap(
1827 struct vs_map *vsmap,
1828 dp_size_t size)
1829 {
1830 unsigned int i;
1831 struct ps_vnode_trim_data trim_data;
1832
1833 ps_vnode_trim_init(&trim_data);
1834
1835 for (i = 0; i < size; i++) {
1836 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
1837 ps_vnode_trim_more(&trim_data,
1838 &vsmap[i],
1839 VSM_PS(vsmap[i])->ps_clshift,
1840 vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
1841 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1842 VSM_CLOFF(vsmap[i]));
1843 } else {
1844 ps_vnode_trim_now(&trim_data);
1845 }
1846 }
1847 ps_vnode_trim_now(&trim_data);
1848 }
1849
1850 void
1851 ps_vstruct_dealloc(
1852 vstruct_t vs)
1853 {
1854 unsigned int i;
1855 // spl_t s;
1856
1857 VS_MAP_LOCK(vs);
1858
1859 /*
1860 * If this is an indirect structure, then we walk through the valid
1861 * (non-zero) indirect pointers and deallocate the clusters
1862 * associated with each used map entry (via ps_dealloc_vsmap).
1863 * When all of the clusters in an indirect block have been
1864 * freed, we deallocate the block. When all of the indirect
1865 * blocks have been deallocated we deallocate the memory
1866 * holding the indirect pointers.
1867 */
1868 if (vs->vs_indirect) {
1869 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1870 if (vs->vs_imap[i] != NULL) {
1871 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1872 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1873 }
1874 }
1875 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1876 } else {
1877 /*
1878 * Direct map. Free used clusters, then memory.
1879 */
1880 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1881 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1882 }
1883 VS_MAP_UNLOCK(vs);
1884
1885 bs_commit(- vs->vs_size);
1886
1887 VS_MAP_LOCK_DESTROY(vs);
1888
1889 zfree(vstruct_zone, vs);
1890 }
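/*
 * Shape of the two map layouts freed above, as a sketch (not compiled):
 * a direct map is one array of vs_size entries, while an indirect map is
 * an array of pointers, each to a block of CLMAP_ENTRIES entries, so
 * cluster n lives at vs_imap[n / CLMAP_ENTRIES][n % CLMAP_ENTRIES].  The
 * helper name is illustrative only.
 */
#if 0
static struct vs_map *
vs_map_entry_for(vstruct_t vs, unsigned int cluster)
{
	if (vs->vs_indirect)
		return &vs->vs_imap[cluster / CLMAP_ENTRIES][cluster % CLMAP_ENTRIES];
	return &vs->vs_dmap[cluster];
}
#endif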
1891
1892 kern_return_t
1893 ps_vstruct_reclaim(
1894 vstruct_t vs,
1895 boolean_t return_to_vm,
1896 boolean_t reclaim_backing_store)
1897 {
1898 unsigned int i, j;
1899 struct vs_map *vsmap;
1900 boolean_t vsmap_all_clear, vsimap_all_clear;
1901 struct vm_object_fault_info fault_info;
1902 int clmap_off;
1903 unsigned int vsmap_size;
1904 kern_return_t kr = KERN_SUCCESS;
1905
1906 VS_MAP_LOCK(vs);
1907
1908 fault_info.cluster_size = VM_SUPER_CLUSTER;
1909 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
1910 fault_info.user_tag = 0;
1911 fault_info.pmap_options = 0;
1912 fault_info.lo_offset = 0;
1913 fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
1914 fault_info.io_sync = reclaim_backing_store;
1915 fault_info.batch_pmap_op = FALSE;
1916
1917 /*
1918 * If this is an indirect structure, then we walk through the valid
1919 * (non-zero) indirect pointers and deallocate the clusters
1920 * associated with each used map entry (via ps_dealloc_vsmap).
1921 * When all of the clusters in an indirect block have been
1922 * freed, we deallocate the block. When all of the indirect
1923 * blocks have been deallocated we deallocate the memory
1924 * holding the indirect pointers.
1925 */
1926 if (vs->vs_indirect) {
1927 vsimap_all_clear = TRUE;
1928 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1929 vsmap = vs->vs_imap[i];
1930 if (vsmap == NULL)
1931 continue;
1932 /* loop on clusters in this indirect map */
1933 clmap_off = (vm_page_size * CLMAP_ENTRIES *
1934 VSCLSIZE(vs) * i);
1935 if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
1936 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
1937 else
1938 vsmap_size = CLMAP_ENTRIES;
1939 vsmap_all_clear = TRUE;
1940 if (return_to_vm) {
1941 for (j = 0; j < vsmap_size;) {
1942 if (VSM_ISCLR(vsmap[j]) ||
1943 VSM_ISERR(vsmap[j])) {
1944 j++;
1945 clmap_off += vm_page_size * VSCLSIZE(vs);
1946 continue;
1947 }
1948 VS_MAP_UNLOCK(vs);
1949 kr = pvs_cluster_read(
1950 vs,
1951 clmap_off,
1952 (dp_size_t) -1, /* read whole cluster */
1953 &fault_info);
1954
1955 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1956 if (kr != KERN_SUCCESS) {
1957 vsmap_all_clear = FALSE;
1958 vsimap_all_clear = FALSE;
1959
1960 kr = KERN_MEMORY_ERROR;
1961 goto out;
1962 }
1963 }
1964 }
1965 if (vsmap_all_clear) {
1966 ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
1967 kfree(vsmap, CLMAP_THRESHOLD);
1968 vs->vs_imap[i] = NULL;
1969 }
1970 }
1971 if (vsimap_all_clear) {
1972 // kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1973 }
1974 } else {
1975 /*
1976 * Direct map. Free used clusters, then memory.
1977 */
1978 vsmap = vs->vs_dmap;
1979 if (vsmap == NULL) {
1980 goto out;
1981 }
1982 vsmap_all_clear = TRUE;
1983 /* loop on clusters in the direct map */
1984 if (return_to_vm) {
1985 for (j = 0; j < vs->vs_size;) {
1986 if (VSM_ISCLR(vsmap[j]) ||
1987 VSM_ISERR(vsmap[j])) {
1988 j++;
1989 continue;
1990 }
1991 clmap_off = vm_page_size * (j << vs->vs_clshift);
1992 VS_MAP_UNLOCK(vs);
1993 kr = pvs_cluster_read(
1994 vs,
1995 clmap_off,
1996 (dp_size_t) -1, /* read whole cluster */
1997 &fault_info);
1998
1999 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
2000 if (kr != KERN_SUCCESS) {
2001 vsmap_all_clear = FALSE;
2002
2003 kr = KERN_MEMORY_ERROR;
2004 goto out;
2005 } else {
2006 // VSM_CLR(vsmap[j]);
2007 }
2008 }
2009 }
2010 if (vsmap_all_clear) {
2011 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
2012 // kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
2013 }
2014 }
2015 out:
2016 VS_MAP_UNLOCK(vs);
2017
2018 return kr;
2019 }
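
/*
 * Illustrative caller sketch (hypothetical; the real callers sit
 * outside this routine): drain a vstruct's swapped pages back into
 * VM by passing TRUE for return_to_vm and FALSE for
 * reclaim_backing_store.
 *
 *	kern_return_t kr;
 *
 *	kr = ps_vstruct_reclaim(vs, TRUE, FALSE);
 *	if (kr != KERN_SUCCESS)
 *		dprintf(("ps_vstruct_reclaim failed: 0x%x\n", kr));
 */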
2020
2021 int ps_map_extend(vstruct_t, unsigned int); /* forward */
2022
2023 int ps_map_extend(
2024 vstruct_t vs,
2025 unsigned int new_size)
2026 {
2027 struct vs_map **new_imap;
2028 struct vs_map *new_dmap = NULL;
2029 int newdsize;
2030 int i;
2031 void *old_map = NULL;
2032 int old_map_size = 0;
2033
2034 if (vs->vs_size >= new_size) {
2035 /*
2036 * Someone has already done the work.
2037 */
2038 return 0;
2039 }
2040
2041 /*
2042 * If the new size extends into the indirect range, then we have one
2043 * of two cases: we are going from indirect to indirect, or we are
2044 * going from direct to indirect. If we are going from indirect to
2045 * indirect, then it is possible that the new size will fit in the old
2046 * indirect map. If this is the case, then just reset the size of the
2047 * vstruct map and we are done. If the new size will not
2048 * fit into the old indirect map, then we have to allocate a new
2049 * indirect map and copy the old map pointers into this new map.
2050 *
2051 * If we are going from direct to indirect, then we have to allocate a
2052 * new indirect map and copy the old direct pages into the first
2053 * indirect page of the new map.
2054 * NOTE: allocating memory here is dangerous, as we're in the
2055 * pageout path.
2056 */
2057 if (INDIRECT_CLMAP(new_size)) {
2058 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
2059
2060 /*
2061 * Get a new indirect map and zero it.
2062 */
2063 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
2064 if (vs->vs_indirect &&
2065 (new_map_size == old_map_size)) {
2066 bs_commit(new_size - vs->vs_size);
2067 vs->vs_size = new_size;
2068 return 0;
2069 }
2070
2071 new_imap = (struct vs_map **)kalloc(new_map_size);
2072 if (new_imap == NULL) {
2073 return -1;
2074 }
2075 memset(new_imap, 0, new_map_size);
2076
2077 if (vs->vs_indirect) {
2078 /* Copy old entries into new map */
2079 memcpy(new_imap, vs->vs_imap, old_map_size);
2080 /* Arrange to free the old map */
2081 old_map = (void *) vs->vs_imap;
2082 newdsize = 0;
2083 } else { /* Old map was a direct map */
2084 /* Allocate an indirect page */
2085 if ((new_imap[0] = (struct vs_map *)
2086 kalloc(CLMAP_THRESHOLD)) == NULL) {
2087 kfree(new_imap, new_map_size);
2088 return -1;
2089 }
2090 new_dmap = new_imap[0];
2091 newdsize = CLMAP_ENTRIES;
2092 }
2093 } else {
2094 new_imap = NULL;
2095 newdsize = new_size;
2096 /*
2097 * If the new map is a direct map, then the old map must
2098 * also have been a direct map. All we have to do is
2099 * to allocate a new direct map, copy the old entries
2100 * into it and free the old map.
2101 */
2102 if ((new_dmap = (struct vs_map *)
2103 kalloc(CLMAP_SIZE(new_size))) == NULL) {
2104 return -1;
2105 }
2106 }
2107 if (newdsize) {
2108
2109 /* Free the old map */
2110 old_map = (void *) vs->vs_dmap;
2111 old_map_size = CLMAP_SIZE(vs->vs_size);
2112
2113 /* Copy info from the old map into the new map */
2114 memcpy(new_dmap, vs->vs_dmap, old_map_size);
2115
2116 /* Initialize the rest of the new map */
2117 for (i = vs->vs_size; i < newdsize; i++)
2118 VSM_CLR(new_dmap[i]);
2119 }
2120 if (new_imap) {
2121 vs->vs_imap = new_imap;
2122 vs->vs_indirect = TRUE;
2123 } else
2124 vs->vs_dmap = new_dmap;
2125 bs_commit(new_size - vs->vs_size);
2126 vs->vs_size = new_size;
2127 if (old_map)
2128 kfree(old_map, old_map_size);
2129 return 0;
2130 }
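
/*
 * ps_map_extend() is called with the VS_MAP lock held when a cluster
 * index falls beyond the current map; the calling pattern in
 * ps_clmap() below is the model.  Sketch only, where "cluster" is the
 * cluster index derived from the faulting offset:
 *
 *	if (cluster >= vs->vs_size) {
 *		if (ps_map_extend(vs, cluster + 1)) {
 *			VS_MAP_UNLOCK(vs);
 *			return (dp_offset_t) -1;
 *		}
 *	}
 */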
2131
2132 dp_offset_t
2133 ps_clmap(
2134 vstruct_t vs,
2135 dp_offset_t offset,
2136 struct clmap *clmap,
2137 int flag,
2138 dp_size_t size,
2139 int error)
2140 {
2141 dp_offset_t cluster; /* The cluster of offset. */
2142 dp_offset_t newcl; /* The new cluster allocated. */
2143 dp_offset_t newoff;
2144 unsigned int i;
2145 struct vs_map *vsmap;
2146
2147 VS_MAP_LOCK(vs);
2148
2149 ASSERT(vs->vs_dmap);
2150 cluster = atop_32(offset) >> vs->vs_clshift;
2151
2152 /*
2153 * Initialize cluster error value
2154 */
2155 clmap->cl_error = 0;
2156
2157 /*
2158 * If the object has grown, extend the page map.
2159 */
2160 if (cluster >= vs->vs_size) {
2161 if (flag == CL_FIND) {
2162 /* Do not allocate if just doing a lookup */
2163 VS_MAP_UNLOCK(vs);
2164 return (dp_offset_t) -1;
2165 }
2166 if (ps_map_extend(vs, cluster + 1)) {
2167 VS_MAP_UNLOCK(vs);
2168 return (dp_offset_t) -1;
2169 }
2170 }
2171
2172 /*
2173 * Look for the desired cluster. If the map is indirect, then we
2174 * have a two level lookup. First find the indirect block, then
2175 * find the actual cluster. If the indirect block has not yet
2176 * been allocated, then do so. If the cluster has not yet been
2177 * allocated, then do so.
2178 *
2179 * If any of the allocations fail, then return an error.
2180 * Don't allocate if just doing a lookup.
2181 */
2182 if (vs->vs_indirect) {
2183 long ind_block = cluster/CLMAP_ENTRIES;
2184
2185 /* Is the indirect block allocated? */
2186 vsmap = vs->vs_imap[ind_block];
2187 if (vsmap == NULL) {
2188 if (flag == CL_FIND) {
2189 VS_MAP_UNLOCK(vs);
2190 return (dp_offset_t) -1;
2191 }
2192
2193 /* Allocate the indirect block */
2194 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
2195 if (vsmap == NULL) {
2196 VS_MAP_UNLOCK(vs);
2197 return (dp_offset_t) -1;
2198 }
2199 /* Initialize the cluster offsets */
2200 for (i = 0; i < CLMAP_ENTRIES; i++)
2201 VSM_CLR(vsmap[i]);
2202 vs->vs_imap[ind_block] = vsmap;
2203 }
2204 } else
2205 vsmap = vs->vs_dmap;
2206
2207 ASSERT(vsmap);
2208 vsmap += cluster%CLMAP_ENTRIES;
2209
2210 /*
2211 * At this point, vsmap points to the struct vs_map desired.
2212 *
2213 * Look in the map for the cluster, if there was an error on a
2214 * previous write, flag it and return. If it is not yet
2215 * allocated, then allocate it, if we're writing; if we're
2216 * doing a lookup and the cluster's not allocated, return error.
2217 */
2218 if (VSM_ISERR(*vsmap)) {
2219 clmap->cl_error = VSM_GETERR(*vsmap);
2220 VS_MAP_UNLOCK(vs);
2221 return (dp_offset_t) -1;
2222 } else if (VSM_ISCLR(*vsmap)) {
2223 int psindex;
2224
2225 if (flag == CL_FIND) {
2226 /*
2227 * If there's an error and the entry is clear, then
2228 * we've run out of swap space. Record the error
2229 * here and return.
2230 */
2231 if (error) {
2232 VSM_SETERR(*vsmap, error);
2233 }
2234 VS_MAP_UNLOCK(vs);
2235 return (dp_offset_t) -1;
2236 } else {
2237 /*
2238 * Attempt to allocate a cluster from the paging segment
2239 */
2240 newcl = ps_allocate_cluster(vs, &psindex,
2241 PAGING_SEGMENT_NULL);
2242 if (newcl == (dp_offset_t) -1) {
2243 VS_MAP_UNLOCK(vs);
2244 return (dp_offset_t) -1;
2245 }
2246 VSM_CLR(*vsmap);
2247 VSM_SETCLOFF(*vsmap, newcl);
2248 VSM_SETPS(*vsmap, psindex);
2249 }
2250 } else
2251 newcl = VSM_CLOFF(*vsmap);
2252
2253 /*
2254 * Fill in pertinent fields of the clmap
2255 */
2256 clmap->cl_ps = VSM_PS(*vsmap);
2257 clmap->cl_numpages = VSCLSIZE(vs);
2258 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2259
2260 /*
2261 * Byte offset in paging segment is byte offset to cluster plus
2262 * byte offset within cluster. It looks ugly, but should be
2263 * relatively quick.
2264 */
2265 ASSERT(trunc_page(offset) == offset);
2266 newcl = ptoa_32(newcl) << vs->vs_clshift;
2267 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2268 if (flag == CL_ALLOC) {
2269 /*
2270 * set bits in the allocation bitmap according to which
2271 * pages were requested. size is in bytes.
2272 */
2273 i = atop_32(newoff);
2274 while ((size > 0) && (i < VSCLSIZE(vs))) {
2275 VSM_SETALLOC(*vsmap, i);
2276 i++;
2277 size -= vm_page_size;
2278 }
2279 }
2280 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2281 if (newoff) {
2282 /*
2283 * Offset is not cluster aligned, so number of pages
2284 * and bitmaps must be adjusted
2285 */
2286 clmap->cl_numpages -= atop_32(newoff);
2287 CLMAP_SHIFT(clmap, vs);
2288 CLMAP_SHIFTALLOC(clmap, vs);
2289 }
2290
2291 /*
2292 *
2293 * The setting of valid bits and handling of write errors
2294 * must be done here, while we hold the lock on the map.
2295 * It logically should be done in ps_vs_write_complete().
2296 * The size and error information has been passed from
2297 * ps_vs_write_complete(). If the size parameter is non-zero,
2298 * then there is work to be done. If error is also non-zero,
2299 * then the error number is recorded in the cluster and the
2300 * entire cluster is in error.
2301 */
2302 if (size && flag == CL_FIND) {
2303 dp_offset_t off = (dp_offset_t) 0;
2304
2305 if (!error) {
2306 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2307 i++) {
2308 VSM_SETPG(*vsmap, i);
2309 size -= vm_page_size;
2310 }
2311 ASSERT(i <= VSCLSIZE(vs));
2312 } else {
2313 BS_STAT(clmap->cl_ps->ps_bs,
2314 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
2315 atop_32(size));
2316 off = VSM_CLOFF(*vsmap);
2317 VSM_SETERR(*vsmap, error);
2318 }
2319 /*
2320 * Deallocate cluster if error, and no valid pages
2321 * already present.
2322 */
2323 if (off != (dp_offset_t) 0)
2324 ps_deallocate_cluster(clmap->cl_ps, off);
2325 VS_MAP_UNLOCK(vs);
2326 return (dp_offset_t) 0;
2327 } else
2328 VS_MAP_UNLOCK(vs);
2329
2330 DP_DEBUG(DEBUG_VS_INTERNAL,
2331 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2332 newcl+newoff, (int) vs, (int) vsmap, flag));
2333 DP_DEBUG(DEBUG_VS_INTERNAL,
2334 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2335 (int) clmap->cl_ps, clmap->cl_numpages,
2336 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
2337
2338 return (newcl + newoff);
2339 }
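
/*
 * The two ps_clmap() call modes used below, as a sketch.  Here cl_mask
 * is (vm_page_size << vs->vs_clshift) - 1 and cl_index, cl_size and
 * upl_offset_aligned follow the conventions of the callers below.
 *
 *	CL_FIND (lookup only, never allocates backing store):
 *		where = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
 *		if (where == (dp_offset_t) -1 || !CLMAP_ISSET(clmap, cl_index))
 *			... page is not present in the backing store ...
 *
 *	CL_ALLOC (allocate backing store ahead of a cl_size write):
 *		where = ps_clmap(vs, upl_offset_aligned, &clmap, CL_ALLOC, cl_size, 0);
 */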
2340
2341 void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */
2342
2343 void
2344 ps_clunmap(
2345 vstruct_t vs,
2346 dp_offset_t offset,
2347 dp_size_t length)
2348 {
2349 dp_offset_t cluster; /* The cluster number of offset */
2350 struct vs_map *vsmap;
2351 struct ps_vnode_trim_data trim_data;
2352
2353 ps_vnode_trim_init(&trim_data);
2354
2355 VS_MAP_LOCK(vs);
2356
2357 /*
2358 * Loop through all clusters in this range, freeing paging segment
2359 * clusters and map entries as encountered.
2360 */
2361 while (length > 0) {
2362 dp_offset_t newoff;
2363 unsigned int i;
2364
2365 cluster = atop_32(offset) >> vs->vs_clshift;
2366 if (vs->vs_indirect) /* indirect map */
2367 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2368 else
2369 vsmap = vs->vs_dmap;
2370 if (vsmap == NULL) {
2371 ps_vnode_trim_now(&trim_data);
2372 VS_MAP_UNLOCK(vs);
2373 return;
2374 }
2375 vsmap += cluster%CLMAP_ENTRIES;
2376 if (VSM_ISCLR(*vsmap)) {
2377 ps_vnode_trim_now(&trim_data);
2378 length -= vm_page_size;
2379 offset += vm_page_size;
2380 continue;
2381 }
2382 /*
2383 * We've got a valid mapping. Clear it and deallocate
2384 * paging segment cluster pages.
2385 * Optimize for entire cluster clearing.
2386 */
2387 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2388 /*
2389 * Not cluster aligned.
2390 */
2391 ASSERT(trunc_page(newoff) == newoff);
2392 i = atop_32(newoff);
2393 } else
2394 i = 0;
2395 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2396 VSM_CLRPG(*vsmap, i);
2397 VSM_CLRALLOC(*vsmap, i);
2398 length -= vm_page_size;
2399 offset += vm_page_size;
2400 i++;
2401 }
2402
2403 /*
2404 * If map entry is empty, clear and deallocate cluster.
2405 */
2406 if (!VSM_BMAP(*vsmap)) {
2407 ps_vnode_trim_more(&trim_data,
2408 vsmap,
2409 vs->vs_clshift,
2410 VSCLSIZE(vs) * vm_page_size);
2411 ps_deallocate_cluster(VSM_PS(*vsmap),
2412 VSM_CLOFF(*vsmap));
2413 VSM_CLR(*vsmap);
2414 } else {
2415 ps_vnode_trim_now(&trim_data);
2416 }
2417 }
2418 ps_vnode_trim_now(&trim_data);
2419
2420 VS_MAP_UNLOCK(vs);
2421 }
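
/*
 * ps_clunmap() is the complement of ps_clmap(..., CL_ALLOC, ...): it
 * hands page runs back to the paging segment and coalesces vnode trim
 * requests for clusters that become fully clear.  Typical use on
 * pagein completion, as in pvs_object_data_provided() below:
 *
 *	ps_clunmap(vs, offset, size);
 */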
2422
2423 void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
2424
2425 void
2426 ps_vs_write_complete(
2427 vstruct_t vs,
2428 dp_offset_t offset,
2429 dp_size_t size,
2430 int error)
2431 {
2432 struct clmap clmap;
2433
2434 /*
2435 * Get the struct vsmap for this cluster.
2436 * Use READ, even though it was written, because the
2437 * cluster MUST be present, unless there was an error
2438 * in the original ps_clmap (e.g. no space), in which
2439 * case, nothing happens.
2440 *
2441 * Must pass enough information to ps_clmap to allow it
2442 * to set the vs_map structure bitmap under lock.
2443 */
2444 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2445 }
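
/*
 * Usage sketch, matching the pageout path in vs_cluster_write() below,
 * which calls this once per cluster-sized chunk of a dirty run before
 * issuing the actual write:
 *
 *	ps_vs_write_complete(vs,
 *			     (upl_offset_in_object + upl_offset),
 *			     seg_size, error);
 */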
2446
2447 void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */
2448
2449 void
2450 vs_cl_write_complete(
2451 vstruct_t vs,
2452 __unused paging_segment_t ps,
2453 dp_offset_t offset,
2454 __unused vm_offset_t addr,
2455 dp_size_t size,
2456 boolean_t async,
2457 int error)
2458 {
2459 // kern_return_t kr;
2460
2461 if (error) {
2462 /*
2463 * For internal objects, the error is recorded on a
2464 * per-cluster basis by ps_clmap() which is called
2465 * by ps_vs_write_complete() below.
2466 */
2467 dprintf(("write failed error = 0x%x\n", error));
2468 /* add upl_abort code here */
2469 } else
2470 GSTAT(global_stats.gs_pages_out += atop_32(size));
2471 /*
2472 * Notify the vstruct mapping code, so it can do its accounting.
2473 */
2474 ps_vs_write_complete(vs, offset, size, error);
2475
2476 if (async) {
2477 VS_LOCK(vs);
2478 ASSERT(vs->vs_async_pending > 0);
2479 vs->vs_async_pending -= size;
2480 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2481 vs->vs_waiting_async = FALSE;
2482 VS_UNLOCK(vs);
2483 thread_wakeup(&vs->vs_async_pending);
2484 } else {
2485 VS_UNLOCK(vs);
2486 }
2487 }
2488 }
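
/*
 * The async accounting above pairs with a waiter that sleeps on
 * &vs->vs_async_pending.  Sketch of the assumed waiter side (the
 * actual waiter is defined elsewhere and may differ in detail):
 *
 *	VS_LOCK(vs);
 *	while (vs->vs_async_pending > 0) {
 *		vs->vs_waiting_async = TRUE;
 *		assert_wait(&vs->vs_async_pending, THREAD_UNINT);
 *		VS_UNLOCK(vs);
 *		thread_block(THREAD_CONTINUE_NULL);
 *		VS_LOCK(vs);
 *	}
 *	VS_UNLOCK(vs);
 */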
2489
2490 #ifdef DEVICE_PAGING
2491 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2492
2493 kern_return_t
2494 device_write_reply(
2495 MACH_PORT_FACE reply_port,
2496 kern_return_t device_code,
2497 io_buf_len_t bytes_written)
2498 {
2499 struct vs_async *vsa;
2500
2501 vsa = (struct vs_async *)
2502 ((struct vstruct_alias *)(reply_port->ip_alias))->vs;
2503
2504 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2505 device_code = KERN_FAILURE;
2506 }
2507
2508 vsa->vsa_error = device_code;
2509
2510
2511 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2512 if(vsa->vsa_flags & VSA_TRANSFER) {
2513 /* revisit when async disk segments redone */
2514 if(vsa->vsa_error) {
2515 /* need to consider error condition. re-write data or */
2516 /* throw it away here. */
2517 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2518 }
2519 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2520 vsa->vsa_size, vsa->vsa_error);
2521 } else {
2522 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2523 vsa->vsa_addr, vsa->vsa_size, TRUE,
2524 vsa->vsa_error);
2525 }
2526 VS_FREE_ASYNC(vsa);
2527
2528 return KERN_SUCCESS;
2529 }
2530
2531 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2532 kern_return_t
2533 device_write_reply_inband(
2534 MACH_PORT_FACE reply_port,
2535 kern_return_t return_code,
2536 io_buf_len_t bytes_written)
2537 {
2538 panic("device_write_reply_inband: illegal");
2539 return KERN_SUCCESS;
2540 }
2541
2542 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2543 kern_return_t
2544 device_read_reply(
2545 MACH_PORT_FACE reply_port,
2546 kern_return_t return_code,
2547 io_buf_ptr_t data,
2548 mach_msg_type_number_t dataCnt)
2549 {
2550 struct vs_async *vsa;
2551 vsa = (struct vs_async *)
2552 ((struct vstruct_alias *)(reply_port->defpager_importance.alias))->vs;
2553 vsa->vsa_addr = (vm_offset_t)data;
2554 vsa->vsa_size = (vm_size_t)dataCnt;
2555 vsa->vsa_error = return_code;
2556 thread_wakeup(&vsa);
2557 return KERN_SUCCESS;
2558 }
2559
2560 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2561 kern_return_t
2562 device_read_reply_inband(
2563 MACH_PORT_FACE reply_port,
2564 kern_return_t return_code,
2565 io_buf_ptr_inband_t data,
2566 mach_msg_type_number_t dataCnt)
2567 {
2568 panic("device_read_reply_inband: illegal");
2569 return KERN_SUCCESS;
2570 }
2571
2572 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2573 kern_return_t
2574 device_read_reply_overwrite(
2575 MACH_PORT_FACE reply_port,
2576 kern_return_t return_code,
2577 io_buf_len_t bytes_read)
2578 {
2579 panic("device_read_reply_overwrite: illegal\n");
2580 return KERN_SUCCESS;
2581 }
2582
2583 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2584 kern_return_t
2585 device_open_reply(
2586 MACH_PORT_FACE reply_port,
2587 kern_return_t return_code,
2588 MACH_PORT_FACE device_port)
2589 {
2590 panic("device_open_reply: illegal\n");
2591 return KERN_SUCCESS;
2592 }
2593
2594 kern_return_t
2595 ps_read_device(
2596 paging_segment_t ps,
2597 dp_offset_t offset,
2598 vm_offset_t *bufferp,
2599 unsigned int size,
2600 unsigned int *residualp,
2601 int flags)
2602 {
2603 kern_return_t kr;
2604 recnum_t dev_offset;
2605 unsigned int bytes_wanted;
2606 unsigned int bytes_read;
2607 unsigned int total_read;
2608 vm_offset_t dev_buffer;
2609 vm_offset_t buf_ptr;
2610 unsigned int records_read;
2611 struct vs_async *vsa;
2612
2613 device_t device;
2614 vm_map_copy_t device_data = NULL;
2615 default_pager_thread_t *dpt = NULL;
2616
2617 device = dev_port_lookup(ps->ps_device);
2618 clustered_reads[atop_32(size)]++;
2619
2620 dev_offset = (ps->ps_offset +
2621 (offset >> (vm_page_shift - ps->ps_record_shift)));
2622 bytes_wanted = size;
2623 total_read = 0;
2624 *bufferp = (vm_offset_t)NULL;
2625
2626 do {
2627 vsa = VS_ALLOC_ASYNC();
2628 if (vsa) {
2629 vsa->vsa_vs = NULL;
2630 vsa->vsa_addr = 0;
2631 vsa->vsa_offset = 0;
2632 vsa->vsa_size = 0;
2633 vsa->vsa_ps = NULL;
2634 }
2635 ip_lock(vsa->reply_port);
2636 vsa->reply_port->ip_sorights++;
2637 ip_reference(vsa->reply_port);
2638 ip_unlock(vsa->reply_port);
2639 kr = ds_device_read_common(device,
2640 vsa->reply_port,
2641 (mach_msg_type_name_t)
2642 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2643 (dev_mode_t) 0,
2644 dev_offset,
2645 bytes_wanted,
2646 (IO_READ | IO_CALL),
2647 (io_buf_ptr_t *) &dev_buffer,
2648 (mach_msg_type_number_t *) &bytes_read);
2649 if(kr == MIG_NO_REPLY) {
2650 assert_wait(&vsa, THREAD_UNINT);
2651 thread_block(THREAD_CONTINUE_NULL);
2652
2653 dev_buffer = vsa->vsa_addr;
2654 bytes_read = (unsigned int)vsa->vsa_size;
2655 kr = vsa->vsa_error;
2656 }
2657 VS_FREE_ASYNC(vsa);
2658 if (kr != KERN_SUCCESS || bytes_read == 0) {
2659 break;
2660 }
2661 total_read += bytes_read;
2662
2663 /*
2664 * If we got the entire range, use the returned dev_buffer.
2665 */
2666 if (bytes_read == size) {
2667 *bufferp = (vm_offset_t)dev_buffer;
2668 break;
2669 }
2670
2671 #if 1
2672 dprintf(("read only %d bytes out of %d\n",
2673 bytes_read, bytes_wanted));
2674 #endif
2675 if(dpt == NULL) {
2676 dpt = get_read_buffer();
2677 buf_ptr = dpt->dpt_buffer;
2678 *bufferp = (vm_offset_t)buf_ptr;
2679 }
2680 /*
2681 * Otherwise, copy the data into the provided buffer (*bufferp)
2682 * and append the rest of the range as it comes in.
2683 */
2684 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2685 buf_ptr += bytes_read;
2686 bytes_wanted -= bytes_read;
2687 records_read = (bytes_read >>
2688 (vm_page_shift - ps->ps_record_shift));
2689 dev_offset += records_read;
2690 DP_DEBUG(DEBUG_VS_INTERNAL,
2691 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2692 dev_buffer, bytes_read));
2693 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2694 != KERN_SUCCESS)
2695 Panic("dealloc buf");
2696 } while (bytes_wanted);
2697
2698 *residualp = size - total_read;
2699 if((dev_buffer != *bufferp) && (total_read != 0)) {
2700 vm_offset_t temp_buffer;
2701 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK));
2702 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2703 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2704 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2705 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2706 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2707 (vm_map_copy_t *)&device_data, FALSE))
2708 panic("ps_read_device: cannot copyin locally provided buffer\n");
2709 }
2710 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2711 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2712 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2713 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2714 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2715 (vm_map_copy_t *)&device_data, FALSE))
2716 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2717 }
2718 else {
2719 device_data = NULL;
2720 }
2721 *bufferp = (vm_offset_t)device_data;
2722
2723 if(dpt != NULL) {
2724 /* Free the receive buffer */
2725 dpt->checked_out = 0;
2726 thread_wakeup(&dpt_array);
2727 }
2728 return KERN_SUCCESS;
2729 }
2730
2731 kern_return_t
2732 ps_write_device(
2733 paging_segment_t ps,
2734 dp_offset_t offset,
2735 vm_offset_t addr,
2736 unsigned int size,
2737 struct vs_async *vsa)
2738 {
2739 recnum_t dev_offset;
2740 io_buf_len_t bytes_to_write, bytes_written;
2741 recnum_t records_written;
2742 kern_return_t kr;
2743 MACH_PORT_FACE reply_port;
2744
2745
2746
2747 clustered_writes[atop_32(size)]++;
2748
2749 dev_offset = (ps->ps_offset +
2750 (offset >> (vm_page_shift - ps->ps_record_shift)));
2751 bytes_to_write = size;
2752
2753 if (vsa) {
2754 /*
2755 * Asynchronous write.
2756 */
2757 reply_port = vsa->reply_port;
2758 ip_lock(reply_port);
2759 reply_port->ip_sorights++;
2760 ip_reference(reply_port);
2761 ip_unlock(reply_port);
2762 {
2763 device_t device;
2764 device = dev_port_lookup(ps->ps_device);
2765
2766 vsa->vsa_addr = addr;
2767 kr=ds_device_write_common(device,
2768 reply_port,
2769 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2770 (dev_mode_t) 0,
2771 dev_offset,
2772 (io_buf_ptr_t) addr,
2773 size,
2774 (IO_WRITE | IO_CALL),
2775 &bytes_written);
2776 }
2777 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2778 if (verbose)
2779 dprintf(("%s0x%x, addr=0x%x,"
2780 "size=0x%x,offset=0x%x\n",
2781 "device_write_request returned ",
2782 kr, addr, size, offset));
2783 BS_STAT(ps->ps_bs,
2784 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2785 /* do the completion notification to free resources */
2786 device_write_reply(reply_port, kr, 0);
2787 return PAGER_ERROR;
2788 }
2789 } else do {
2790 /*
2791 * Synchronous write.
2792 */
2793 {
2794 device_t device;
2795 device = dev_port_lookup(ps->ps_device);
2796 kr=ds_device_write_common(device,
2797 IP_NULL, 0,
2798 (dev_mode_t) 0,
2799 dev_offset,
2800 (io_buf_ptr_t) addr,
2801 size,
2802 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2803 &bytes_written);
2804 }
2805 if (kr != KERN_SUCCESS) {
2806 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2807 "device_write returned ",
2808 kr, addr, size, offset));
2809 BS_STAT(ps->ps_bs,
2810 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2811 return PAGER_ERROR;
2812 }
2813 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2814 Panic("fragmented write");
2815 records_written = (bytes_written >>
2816 (vm_page_shift - ps->ps_record_shift));
2817 dev_offset += records_written;
2818 #if 1
2819 if (bytes_written != bytes_to_write) {
2820 dprintf(("wrote only %d bytes out of %d\n",
2821 bytes_written, bytes_to_write));
2822 }
2823 #endif
2824 bytes_to_write -= bytes_written;
2825 addr += bytes_written;
2826 } while (bytes_to_write > 0);
2827
2828 return PAGER_SUCCESS;
2829 }
2830
2831
2832 #else /* !DEVICE_PAGING */
2833
2834 kern_return_t
2835 ps_read_device(
2836 __unused paging_segment_t ps,
2837 __unused dp_offset_t offset,
2838 __unused vm_offset_t *bufferp,
2839 __unused unsigned int size,
2840 __unused unsigned int *residualp,
2841 __unused int flags)
2842 {
2843 panic("ps_read_device not supported");
2844 return KERN_FAILURE;
2845 }
2846
2847 kern_return_t
2848 ps_write_device(
2849 __unused paging_segment_t ps,
2850 __unused dp_offset_t offset,
2851 __unused vm_offset_t addr,
2852 __unused unsigned int size,
2853 __unused struct vs_async *vsa)
2854 {
2855 panic("ps_write_device not supported");
2856 return KERN_FAILURE;
2857 }
2858
2859 #endif /* DEVICE_PAGING */
2860 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2861
2862 void
2863 pvs_object_data_provided(
2864 __unused vstruct_t vs,
2865 __unused upl_t upl,
2866 __unused upl_offset_t offset,
2867 upl_size_t size)
2868 {
2869 #if RECLAIM_SWAP
2870 boolean_t empty;
2871 #endif
2872
2873 DP_DEBUG(DEBUG_VS_INTERNAL,
2874 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2875 upl, offset, size));
2876
2877 ASSERT(size > 0);
2878 GSTAT(global_stats.gs_pages_in += atop_32(size));
2879
2880 /* check upl iosync flag instead of using RECLAIM_SWAP */
2881 #if RECLAIM_SWAP
2882 if (size != upl->size) {
2883 if (size) {
2884 ps_clunmap(vs, offset, size);
2885 upl_commit_range(upl, 0, size, 0, NULL, 0, &empty);
2886 }
2887 upl_abort(upl, UPL_ABORT_ERROR);
2888 upl_deallocate(upl);
2889 } else {
2890 ps_clunmap(vs, offset, size);
2891 upl_commit(upl, NULL, 0);
2892 upl_deallocate(upl);
2893 }
2894 #endif /* RECLAIM_SWAP */
2895
2896 }
2897
2898 static memory_object_offset_t last_start;
2899 static vm_size_t last_length;
2900
2901 /*
2902 * A "cnt" of 0 means that the caller just wants to check if the page at
2903 * offset "vs_offset" exists in the backing store. That page hasn't been
2904 * prepared, so no need to release it.
2905 *
2906 * A "cnt" of -1 means that the caller wants to bring back from the backing
2907 * store all existing pages in the cluster containing "vs_offset".
2908 */
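/*
 * For example (illustrative only; the fault_info arguments are
 * placeholders for whatever the caller has on hand):
 *
 *	existence check, no pagein is issued:
 *		kr = pvs_cluster_read(vs, vs_offset, 0, (void *) fault_info);
 *
 *	reclaim, bring back every existing page of the cluster:
 *		kr = pvs_cluster_read(vs, vs_offset, (dp_size_t) -1, (void *) &fault_info);
 */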
2909 kern_return_t
2910 pvs_cluster_read(
2911 vstruct_t vs,
2912 dp_offset_t vs_offset,
2913 dp_size_t cnt,
2914 void *fault_info)
2915 {
2916 kern_return_t error = KERN_SUCCESS;
2917 unsigned int size;
2918 unsigned int residual;
2919 unsigned int request_flags;
2920 int io_flags = 0;
2921 int seg_index;
2922 int pages_in_cl;
2923 int cl_size;
2924 int cl_mask;
2925 int cl_index;
2926 unsigned int xfer_size;
2927 dp_offset_t orig_vs_offset;
2928 dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2929 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2930 struct clmap clmap;
2931 upl_t upl;
2932 unsigned int page_list_count;
2933 memory_object_offset_t cluster_start;
2934 vm_size_t cluster_length;
2935 uint32_t io_streaming;
2936 int i;
2937 boolean_t io_sync = FALSE;
2938 boolean_t reclaim_all = FALSE;
2939
2940 pages_in_cl = 1 << vs->vs_clshift;
2941 cl_size = pages_in_cl * vm_page_size;
2942 cl_mask = cl_size - 1;
2943
2944 request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2945
2946 if (cnt == (dp_size_t) -1)
2947 reclaim_all = TRUE;
2948
2949 if (reclaim_all == TRUE) {
2950 /*
2951 * We've been called from ps_vstruct_reclaim() to move all
2952 * the object's swapped pages back to VM pages.
2953 * This can put memory pressure on the system, so we do want
2954 * to wait for free pages, to avoid getting in the way of the
2955 * vm_pageout_scan() thread.
2956 * Let's not use UPL_NOBLOCK in this case.
2957 */
2958 vs_offset &= ~cl_mask;
2959 i = pages_in_cl;
2960 } else {
2961 i = 1;
2962
2963 /*
2964 * if the I/O cluster size == PAGE_SIZE, we don't want to set
2965 * the UPL_NOBLOCK since we may be trying to recover from a
2966 * previous partial pagein I/O that occurred because we were low
2967 * on memory and bailed early in order to honor the UPL_NOBLOCK...
2968 * since we're only asking for a single page, we can block w/o fear
2969 * of tying up pages while waiting for more to become available
2970 */
2971 if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE)
2972 request_flags |= UPL_NOBLOCK;
2973 }
2974
2975 again:
2976 cl_index = (vs_offset & cl_mask) / vm_page_size;
2977
2978 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2979 !CLMAP_ISSET(clmap, cl_index)) {
2980 /*
2981 * the needed page doesn't exist in the backing store...
2982 * we don't want to try to do any I/O, just abort the
2983 * page and let the fault handler provide a zero-fill
2984 */
2985 if (cnt == 0) {
2986 /*
2987 * The caller was just poking at us to see if
2988 * the page has been paged out. No need to
2989 * mess with the page at all.
2990 * Just let the caller know we don't have that page.
2991 */
2992 return KERN_FAILURE;
2993 }
2994 if (reclaim_all == TRUE) {
2995 i--;
2996 if (i == 0) {
2997 /* no more pages in this cluster */
2998 return KERN_FAILURE;
2999 }
3000 /* try the next page in this cluster */
3001 vs_offset += vm_page_size;
3002 goto again;
3003 }
3004
3005 page_list_count = 0;
3006
3007 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3008 PAGE_SIZE, PAGE_SIZE,
3009 &upl, NULL, &page_list_count,
3010 request_flags | UPL_SET_INTERNAL);
3011 upl_range_needed(upl, 0, 1);
3012
3013 if (clmap.cl_error)
3014 upl_abort(upl, UPL_ABORT_ERROR);
3015 else
3016 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
3017 upl_deallocate(upl);
3018
3019 return KERN_SUCCESS;
3020 }
3021
3022 if (cnt == 0) {
3023 /*
3024 * The caller was just poking at us to see if
3025 * the page has been paged out. No need to
3026 * mess with the page at all.
3027 * Just let the caller know we do have that page.
3028 */
3029 return KERN_SUCCESS;
3030 }
3031
3032 if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) {
3033 io_sync = TRUE;
3034 } else {
3035 #if RECLAIM_SWAP
3036 io_sync = TRUE;
3037 #endif /* RECLAIM_SWAP */
3038 }
3039
3040 if( io_sync == TRUE ) {
3041
3042 io_flags |= UPL_IOSYNC | UPL_NOCOMMIT;
3043 #if USE_PRECIOUS
3044 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
3045 #else /* USE_PRECIOUS */
3046 request_flags |= UPL_REQUEST_SET_DIRTY;
3047 #endif /* USE_PRECIOUS */
3048 }
3049
3050 assert(dp_encryption_inited);
3051 if (dp_encryption) {
3052 /*
3053 * ENCRYPTED SWAP:
3054 * request that the UPL be prepared for
3055 * decryption.
3056 */
3057 request_flags |= UPL_ENCRYPT;
3058 io_flags |= UPL_PAGING_ENCRYPTED;
3059 }
3060 orig_vs_offset = vs_offset;
3061
3062 assert(cnt != 0);
3063 cnt = VM_SUPER_CLUSTER;
3064 cluster_start = (memory_object_offset_t) vs_offset;
3065 cluster_length = (vm_size_t) cnt;
3066 io_streaming = 0;
3067
3068 /*
3069 * determine how big a speculative I/O we should try for...
3070 */
3071 if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
3072 assert(vs_offset >= (dp_offset_t) cluster_start &&
3073 vs_offset < (dp_offset_t) (cluster_start + cluster_length));
3074 vs_offset = (dp_offset_t) cluster_start;
3075 cnt = (dp_size_t) cluster_length;
3076 } else {
3077 cluster_length = PAGE_SIZE;
3078 cnt = PAGE_SIZE;
3079 }
3080
3081 if (io_streaming)
3082 io_flags |= UPL_IOSTREAMING;
3083
3084 last_start = cluster_start;
3085 last_length = cluster_length;
3086
3087 /*
3088 * This loop will be executed multiple times until the entire
3089 * range has been looked at or we issue an I/O... if the request spans cluster
3090 * boundaries, the clusters will be checked for logical continuity,
3091 * if contiguous the I/O request will span multiple clusters...
3092 * at most only 1 I/O will be issued... it will encompass the original offset
3093 */
3094 while (cnt && error == KERN_SUCCESS) {
3095 int ps_info_valid;
3096
3097 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
3098 size = VM_SUPER_CLUSTER;
3099 size -= vs_offset & cl_mask;
3100 } else if (cnt > VM_SUPER_CLUSTER)
3101 size = VM_SUPER_CLUSTER;
3102 else
3103 size = cnt;
3104
3105 cnt -= size;
3106
3107 ps_info_valid = 0;
3108 seg_index = 0;
3109
3110 while (size > 0 && error == KERN_SUCCESS) {
3111 unsigned int abort_size;
3112 unsigned int lsize;
3113 int failed_size;
3114 int beg_pseg;
3115 int beg_indx;
3116 dp_offset_t cur_offset;
3117
3118 if ( !ps_info_valid) {
3119 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3120 psp[seg_index] = CLMAP_PS(clmap);
3121 ps_info_valid = 1;
3122 }
3123 /*
3124 * skip over unallocated physical segments
3125 */
3126 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3127 abort_size = cl_size - (vs_offset & cl_mask);
3128 abort_size = MIN(abort_size, size);
3129
3130 size -= abort_size;
3131 vs_offset += abort_size;
3132
3133 seg_index++;
3134 ps_info_valid = 0;
3135
3136 continue;
3137 }
3138 cl_index = (vs_offset & cl_mask) / vm_page_size;
3139
3140 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
3141 /*
3142 * skip over unallocated pages
3143 */
3144 if (CLMAP_ISSET(clmap, cl_index))
3145 break;
3146 abort_size += vm_page_size;
3147 }
3148 if (abort_size) {
3149 size -= abort_size;
3150 vs_offset += abort_size;
3151
3152 if (cl_index == pages_in_cl) {
3153 /*
3154 * if we're at the end of this physical cluster
3155 * then bump to the next one and continue looking
3156 */
3157 seg_index++;
3158 ps_info_valid = 0;
3159
3160 continue;
3161 }
3162 if (size == 0)
3163 break;
3164 }
3165 /*
3166 * remember the starting point of the first allocated page
3167 * for the I/O we're about to issue
3168 */
3169 beg_pseg = seg_index;
3170 beg_indx = cl_index;
3171 cur_offset = vs_offset;
3172
3173 /*
3174 * calculate the size of the I/O that we can do...
3175 * this may span multiple physical segments if
3176 * they are contiguous
3177 */
3178 for (xfer_size = 0; xfer_size < size; ) {
3179
3180 while (cl_index < pages_in_cl && xfer_size < size) {
3181 /*
3182 * accumulate allocated pages within
3183 * a physical segment
3184 */
3185 if (CLMAP_ISSET(clmap, cl_index)) {
3186 xfer_size += vm_page_size;
3187 cur_offset += vm_page_size;
3188 cl_index++;
3189
3190 BS_STAT(psp[seg_index]->ps_bs,
3191 psp[seg_index]->ps_bs->bs_pages_in++);
3192 } else
3193 break;
3194 }
3195 if (cl_index < pages_in_cl || xfer_size >= size) {
3196 /*
3197 * we've hit an unallocated page or
3198 * the end of this request... see if
3199 * it's time to fire the I/O
3200 */
3201 break;
3202 }
3203 /*
3204 * we've hit the end of the current physical
3205 * segment and there's more to do, so try
3206 * moving to the next one
3207 */
3208 seg_index++;
3209
3210 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3211 psp[seg_index] = CLMAP_PS(clmap);
3212 ps_info_valid = 1;
3213
3214 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
3215 /*
3216 * if the physical segment we're about
3217 * to step into is not contiguous to
3218 * the one we're currently in, or it's
3219 * in a different paging file, or
3220 * it hasn't been allocated....
3221 * we stop this run and go check
3222 * to see if it's time to fire the I/O
3223 */
3224 break;
3225 }
3226 /*
3227 * start with first page of the next physical
3228 * segment
3229 */
3230 cl_index = 0;
3231 }
3232 if (xfer_size == 0) {
3233 /*
3234 * no I/O to generate for this segment
3235 */
3236 continue;
3237 }
3238 if (cur_offset <= orig_vs_offset) {
3239 /*
3240 * we've hit a hole in our speculative cluster
3241 * before the offset that we're really after...
3242 * don't issue the I/O since it doesn't encompass
3243 * the original offset and we're looking to only
3244 * pull in the speculative pages if they can be
3245 * made part of a single I/O
3246 */
3247 size -= xfer_size;
3248 vs_offset += xfer_size;
3249
3250 continue;
3251 }
3252 /*
3253 * we have a contiguous range of allocated pages
3254 * to read from that encompasses the original offset
3255 */
3256 page_list_count = 0;
3257 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3258 xfer_size, xfer_size,
3259 &upl, NULL, &page_list_count,
3260 request_flags | UPL_SET_INTERNAL);
3261
3262 error = ps_read_file(psp[beg_pseg],
3263 upl, (upl_offset_t) 0,
3264 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
3265 xfer_size, &residual, io_flags);
3266
3267
3268 /*
3269 * Adjust counts and send response to VM. Optimize
3270 * for the common case, i.e. no error and/or partial
3271 * data. If there was an error, then we need to error
3272 * the entire range, even if some data was successfully
3273 * read. If there was a partial read we may supply some
3274 * data and may error some as well. In all cases the
3275 * VM must receive some notification for every page
3276 * in the range.
3277 */
3278 if ((error == KERN_SUCCESS) && (residual == 0)) {
3279 /*
3280 * Got everything we asked for, supply the data
3281 * to the VM. Note that as a side effect of
3282 * supplying the data, the buffer holding the
3283 * supplied data is deallocated from the pager's
3284 * address space.
3285 */
3286 lsize = xfer_size;
3287 failed_size = 0;
3288 } else {
3289 lsize = 0;
3290 failed_size = xfer_size;
3291
3292 if (error == KERN_SUCCESS) {
3293 if (residual == xfer_size) {
3294 /*
3295 * If a read operation returns no error
3296 * and no data moved, we turn it into
3297 * an error, assuming we're reading at
3298 * or beyond EOF.
3299 * Fall through and error the entire range.
3300 */
3301 error = KERN_FAILURE;
3302 } else {
3303 /*
3304 * Otherwise, we have partial read. If
3305 * the part read is an integral number
3306 * of pages supply it. Otherwise round
3307 * it up to a page boundary, zero fill
3308 * the unread part, and supply it.
3309 * Fall through and error the remainder
3310 * of the range, if any.
3311 */
3312 int fill;
3313
3314 fill = residual & (vm_page_size - 1);
3315 lsize = (xfer_size - residual) + fill;
3316
3317 if (lsize < xfer_size)
3318 failed_size = xfer_size - lsize;
3319
3320 if (reclaim_all == FALSE)
3321 error = KERN_FAILURE;
3322 }
3323 }
3324 }
3325 pvs_object_data_provided(vs, upl, vs_offset, lsize);
3326
3327 if (failed_size) {
3328 /*
3329 * There was an error in some part of the range, tell
3330 * the VM. Note that error is explicitly checked again
3331 * since it can be modified above.
3332 */
3333 BS_STAT(psp[beg_pseg]->ps_bs,
3334 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
3335 }
3336 /*
3337 * we've issued a single I/O that encompassed the original offset
3338 * at this point we either met our speculative request length or
3339 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3340 * not present or not physically contiguous to the previous one), so
3341 * we're done issuing I/O at this point
3342 */
3343 return (error);
3344 }
3345 }
3346 return error;
3347 }
3348
3349 int vs_do_async_write = 1;
3350
3351 kern_return_t
3352 vs_cluster_write(
3353 vstruct_t vs,
3354 upl_t internal_upl,
3355 upl_offset_t offset,
3356 upl_size_t cnt,
3357 boolean_t dp_internal,
3358 int flags)
3359 {
3360 upl_size_t transfer_size;
3361 int error = 0;
3362 struct clmap clmap;
3363
3364 dp_offset_t actual_offset; /* Offset within paging segment */
3365 paging_segment_t ps;
3366 dp_offset_t mobj_base_addr;
3367 dp_offset_t mobj_target_addr;
3368
3369 upl_t upl;
3370 upl_page_info_t *pl;
3371 int page_index;
3372 unsigned int page_max_index;
3373 int list_size;
3374 int pages_in_cl;
3375 unsigned int cl_size;
3376 int base_index;
3377 unsigned int seg_size;
3378 unsigned int upl_offset_in_object;
3379 boolean_t minimal_clustering = FALSE;
3380 boolean_t found_dirty;
3381
3382 if (!dp_encryption_inited) {
3383 /*
3384 * ENCRYPTED SWAP:
3385 * Once we've started using swap, we
3386 * can't change our mind on whether
3387 * it needs to be encrypted or
3388 * not.
3389 */
3390 dp_encryption_inited = TRUE;
3391 }
3392 if (dp_encryption) {
3393 /*
3394 * ENCRYPTED SWAP:
3395 * the UPL will need to be encrypted...
3396 */
3397 flags |= UPL_PAGING_ENCRYPTED;
3398 }
3399
3400 pages_in_cl = 1 << vs->vs_clshift;
3401 cl_size = pages_in_cl * vm_page_size;
3402
3403 #if CONFIG_FREEZE
3404 minimal_clustering = TRUE;
3405 #else
3406 if (dp_isssd == TRUE)
3407 minimal_clustering = TRUE;
3408 #endif
3409 if (!dp_internal) {
3410 unsigned int page_list_count;
3411 int request_flags;
3412 unsigned int super_size;
3413 int first_dirty;
3414 int num_dirty;
3415 int num_of_pages;
3416 int seg_index;
3417 upl_offset_t upl_offset;
3418 upl_offset_t upl_offset_aligned;
3419 dp_offset_t seg_offset;
3420 dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3421 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3422
3423
3424 if (bs_low)
3425 super_size = cl_size;
3426 else
3427 super_size = VM_SUPER_CLUSTER;
3428
3429 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3430 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3431 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3432
3433 if (dp_encryption) {
3434 /*
3435 * ENCRYPTED SWAP:
3436 * request that the UPL be prepared for
3437 * encryption.
3438 */
3439 request_flags |= UPL_ENCRYPT;
3440 flags |= UPL_PAGING_ENCRYPTED;
3441 }
3442
3443 page_list_count = 0;
3444 memory_object_super_upl_request(vs->vs_control,
3445 (memory_object_offset_t)offset,
3446 cnt, super_size,
3447 &upl, NULL, &page_list_count,
3448 request_flags | UPL_FOR_PAGEOUT);
3449
3450 /*
3451 * The default pager does not handle objects larger than
3452 * 4GB, so it does not deal with offsets that don't fit in
3453 * 32 bits. Cast down upl->offset now and make sure we
3454 * did not lose any valuable bits.
3455 */
3456 upl_offset_in_object = (unsigned int) upl->offset;
3457 assert(upl->offset == upl_offset_in_object);
3458
3459 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3460
3461 seg_size = cl_size - (upl_offset_in_object % cl_size);
3462 upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
3463 page_index = 0;
3464 page_max_index = upl->size / PAGE_SIZE;
3465 found_dirty = TRUE;
3466
3467 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
3468
3469 unsigned int seg_pgcnt;
3470
3471 seg_pgcnt = seg_size / PAGE_SIZE;
3472
3473 if (minimal_clustering == TRUE) {
3474 unsigned int non_dirty;
3475
3476 non_dirty = 0;
3477 found_dirty = FALSE;
3478
3479 for (; non_dirty < seg_pgcnt; non_dirty++) {
3480 if ((page_index + non_dirty) >= page_max_index)
3481 break;
3482
3483 if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
3484 UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
3485 found_dirty = TRUE;
3486 break;
3487 }
3488 }
3489 }
3490 if (found_dirty == TRUE) {
3491 ps_offset[seg_index] =
3492 ps_clmap(vs,
3493 upl_offset_aligned,
3494 &clmap, CL_ALLOC,
3495 cl_size, 0);
3496
3497 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3498 upl_abort(upl, 0);
3499 upl_deallocate(upl);
3500
3501 return KERN_FAILURE;
3502 }
3503 psp[seg_index] = CLMAP_PS(clmap);
3504 }
3505 if (transfer_size > seg_size) {
3506 page_index += seg_pgcnt;
3507 transfer_size -= seg_size;
3508 upl_offset_aligned += cl_size;
3509 seg_size = cl_size;
3510 seg_index++;
3511 } else
3512 transfer_size = 0;
3513 }
3514 /*
3515 * Ignore any non-present pages at the end of the
3516 * UPL.
3517 */
3518 for (page_index = upl->size / vm_page_size; page_index > 0;) {
3519 if (UPL_PAGE_PRESENT(pl, --page_index)) {
3520 page_index++;
3521 break;
3522 }
3523 }
3524 if (page_index == 0) {
3525 /*
3526 * no pages in the UPL
3527 * abort and return
3528 */
3529 upl_abort(upl, 0);
3530 upl_deallocate(upl);
3531
3532 return KERN_SUCCESS;
3533 }
3534 num_of_pages = page_index;
3535
3536 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
3537
3538 for (page_index = 0; page_index < num_of_pages; ) {
3539 /*
3540 * skip over non-dirty pages
3541 */
3542 for ( ; page_index < num_of_pages; page_index++) {
3543 if (UPL_DIRTY_PAGE(pl, page_index)
3544 || UPL_PRECIOUS_PAGE(pl, page_index))
3545 /*
3546 * this is a page we need to write
3547 * go see if we can buddy it up with
3548 * others that are contiguous to it
3549 */
3550 break;
3551 /*
3552 * if the page is not dirty, but present, we
3553 * need to commit it... This is an unusual
3554 * case since we only asked for dirty pages
3555 */
3556 if (UPL_PAGE_PRESENT(pl, page_index)) {
3557 boolean_t empty = FALSE;
3558 upl_commit_range(upl,
3559 page_index * vm_page_size,
3560 vm_page_size,
3561 UPL_COMMIT_NOTIFY_EMPTY,
3562 pl,
3563 page_list_count,
3564 &empty);
3565 if (empty) {
3566 assert(page_index ==
3567 num_of_pages - 1);
3568 upl_deallocate(upl);
3569 }
3570 }
3571 }
3572 if (page_index == num_of_pages)
3573 /*
3574 * no more pages to look at, we're out of here
3575 */
3576 break;
3577
3578 /*
3579 * gather up contiguous dirty pages... we have at
3580 * least 1, otherwise we would have bailed above...
3581 * make sure that each physical segment that we step
3582 * into is contiguous to the one we're currently in
3583 * if it's not, we have to stop and write what we have
3584 */
3585 for (first_dirty = page_index;
3586 page_index < num_of_pages; ) {
3587 if ( !UPL_DIRTY_PAGE(pl, page_index)
3588 && !UPL_PRECIOUS_PAGE(pl, page_index))
3589 break;
3590 page_index++;
3591 /*
3592 * if we just looked at the last page in the UPL
3593 * we don't need to check for physical segment
3594 * continuity
3595 */
3596 if (page_index < num_of_pages) {
3597 int cur_seg;
3598 int nxt_seg;
3599
3600 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3601 nxt_seg = (base_index + page_index)/pages_in_cl;
3602
3603 if (cur_seg != nxt_seg) {
3604 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3605 /*
3606 * if the segment we're about
3607 * to step into is not
3608 * contiguous to the one we're
3609 * currently in, or it's in a
3610 * different paging file....
3611 * we stop here and generate
3612 * the I/O
3613 */
3614 break;
3615 }
3616 }
3617 }
3618 num_dirty = page_index - first_dirty;
3619
3620 if (num_dirty) {
3621 upl_offset = first_dirty * vm_page_size;
3622 transfer_size = num_dirty * vm_page_size;
3623
3624 while (transfer_size) {
3625
3626 if ((seg_size = cl_size -
3627 ((upl_offset_in_object +
3628 upl_offset) % cl_size))
3629 > transfer_size)
3630 seg_size = transfer_size;
3631
3632 ps_vs_write_complete(
3633 vs,
3634 (upl_offset_in_object +
3635 upl_offset),
3636 seg_size, error);
3637
3638 transfer_size -= seg_size;
3639 upl_offset += seg_size;
3640 }
3641 upl_offset = first_dirty * vm_page_size;
3642 transfer_size = num_dirty * vm_page_size;
3643
3644 seg_index = (base_index + first_dirty) / pages_in_cl;
3645 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
3646
3647 error = ps_write_file(psp[seg_index],
3648 upl, upl_offset,
3649 ps_offset[seg_index]
3650 + seg_offset,
3651 transfer_size, flags);
3652 }
3653 }
3654
3655 } else {
3656 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
3657 list_size = cnt;
3658
3659 page_index = 0;
3660 /* The caller provides a mapped_data which is derived */
3661 /* from a temporary object. The targeted pages are */
3662 /* guaranteed to be set at offset 0 in the mapped_data */
3663 /* The actual offset however must still be derived */
3664 /* from the offset in the vs in question */
3665 mobj_base_addr = offset;
3666 mobj_target_addr = mobj_base_addr;
3667
3668 for (transfer_size = list_size; transfer_size != 0;) {
3669 actual_offset = ps_clmap(vs, mobj_target_addr,
3670 &clmap, CL_ALLOC,
3671 transfer_size < cl_size ?
3672 transfer_size : cl_size, 0);
3673 if(actual_offset == (dp_offset_t) -1) {
3674 error = 1;
3675 break;
3676 }
3677 cnt = MIN(transfer_size,
3678 (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
3679 ps = CLMAP_PS(clmap);
3680 /* Assume that the caller has given us contiguous */
3681 /* pages */
3682 if(cnt) {
3683 ps_vs_write_complete(vs, mobj_target_addr,
3684 cnt, error);
3685 error = ps_write_file(ps, internal_upl,
3686 0, actual_offset,
3687 cnt, flags);
3688 if (error)
3689 break;
3690 }
3691 if (error)
3692 break;
3693 actual_offset += cnt;
3694 mobj_target_addr += cnt;
3695 transfer_size -= cnt;
3696 cnt = 0;
3697
3698 if (error)
3699 break;
3700 }
3701 }
3702 if(error)
3703 return KERN_FAILURE;
3704 else
3705 return KERN_SUCCESS;
3706 }
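
/*
 * Sketch of the external (dp_internal == FALSE) calling convention
 * assumed for the pageout path; the UPL is built inside this routine,
 * so no internal_upl is supplied.  This caller-side form is an
 * assumption based on the parameter handling above, not a definitive
 * interface:
 *
 *	kr = vs_cluster_write(vs,
 *			      (upl_t) NULL,
 *			      (upl_offset_t) offset,
 *			      (upl_size_t) size,
 *			      FALSE,
 *			      0);
 */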
3707
3708 vm_size_t
3709 ps_vstruct_allocated_size(
3710 vstruct_t vs)
3711 {
3712 int num_pages;
3713 struct vs_map *vsmap;
3714 unsigned int i, j, k;
3715
3716 num_pages = 0;
3717 if (vs->vs_indirect) {
3718 /* loop on indirect maps */
3719 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3720 vsmap = vs->vs_imap[i];
3721 if (vsmap == NULL)
3722 continue;
3723 /* loop on clusters in this indirect map */
3724 for (j = 0; j < CLMAP_ENTRIES; j++) {
3725 if (VSM_ISCLR(vsmap[j]) ||
3726 VSM_ISERR(vsmap[j]))
3727 continue;
3728 /* loop on pages in this cluster */
3729 for (k = 0; k < VSCLSIZE(vs); k++) {
3730 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3731 num_pages++;
3732 }
3733 }
3734 }
3735 } else {
3736 vsmap = vs->vs_dmap;
3737 if (vsmap == NULL)
3738 return 0;
3739 /* loop on clusters in the direct map */
3740 for (j = 0; j < CLMAP_ENTRIES; j++) {
3741 if (VSM_ISCLR(vsmap[j]) ||
3742 VSM_ISERR(vsmap[j]))
3743 continue;
3744 /* loop on pages in this cluster */
3745 for (k = 0; k < VSCLSIZE(vs); k++) {
3746 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3747 num_pages++;
3748 }
3749 }
3750 }
3751
3752 return ptoa_32(num_pages);
3753 }
3754
3755 unsigned int
3756 ps_vstruct_allocated_pages(
3757 vstruct_t vs,
3758 default_pager_page_t *pages,
3759 unsigned int pages_size)
3760 {
3761 unsigned int num_pages;
3762 struct vs_map *vsmap;
3763 dp_offset_t offset;
3764 unsigned int i, j, k;
3765
3766 num_pages = 0;
3767 offset = 0;
3768 if (vs->vs_indirect) {
3769 /* loop on indirect maps */
3770 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3771 vsmap = vs->vs_imap[i];
3772 if (vsmap == NULL) {
3773 offset += (vm_page_size * CLMAP_ENTRIES *
3774 VSCLSIZE(vs));
3775 continue;
3776 }
3777 /* loop on clusters in this indirect map */
3778 for (j = 0; j < CLMAP_ENTRIES; j++) {
3779 if (VSM_ISCLR(vsmap[j]) ||
3780 VSM_ISERR(vsmap[j])) {
3781 offset += vm_page_size * VSCLSIZE(vs);
3782 continue;
3783 }
3784 /* loop on pages in this cluster */
3785 for (k = 0; k < VSCLSIZE(vs); k++) {
3786 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3787 num_pages++;
3788 if (num_pages < pages_size)
3789 pages++->dpp_offset =
3790 offset;
3791 }
3792 offset += vm_page_size;
3793 }
3794 }
3795 }
3796 } else {
3797 vsmap = vs->vs_dmap;
3798 if (vsmap == NULL)
3799 return 0;
3800 /* loop on clusters in the direct map */
3801 for (j = 0; j < CLMAP_ENTRIES; j++) {
3802 if (VSM_ISCLR(vsmap[j]) ||
3803 VSM_ISERR(vsmap[j])) {
3804 offset += vm_page_size * VSCLSIZE(vs);
3805 continue;
3806 }
3807 /* loop on pages in this cluster */
3808 for (k = 0; k < VSCLSIZE(vs); k++) {
3809 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3810 num_pages++;
3811 if (num_pages < pages_size)
3812 pages++->dpp_offset = offset;
3813 }
3814 offset += vm_page_size;
3815 }
3816 }
3817 }
3818
3819 return num_pages;
3820 }
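
/*
 * Caller-side sketch (hypothetical): the return value is the total
 * count of allocated pages, which may exceed pages_size; in that case
 * only the leading entries of "pages" were filled and the caller is
 * expected to retry with a larger array.
 *
 *	unsigned int actual;
 *
 *	actual = ps_vstruct_allocated_pages(vs, pages, pages_size);
 *	if (actual > pages_size)
 *		... grow "pages" to "actual" entries and call again ...
 */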
3821
3822
3823 kern_return_t
3824 ps_vstruct_transfer_from_segment(
3825 vstruct_t vs,
3826 paging_segment_t segment,
3827 upl_t upl)
3828 {
3829 struct vs_map *vsmap;
3830 // struct vs_map old_vsmap;
3831 // struct vs_map new_vsmap;
3832 unsigned int i, j;
3833
3834 VS_LOCK(vs); /* block all work on this vstruct */
3835 /* can't allow the normal multiple write */
3836 /* semantic because writes may conflict */
3837 vs->vs_xfer_pending = TRUE;
3838 vs_wait_for_sync_writers(vs);
3839 vs_start_write(vs);
3840 vs_wait_for_readers(vs);
3841 /* we will unlock the vs to allow other writes while transferring */
3842 /* and will be guaranteed the persistence of the vs struct */
3843 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3844 /* vs_async_pending */
3845 /* OK we now have guaranteed no other parties are accessing this */
3846 /* vs. Now that we are also supporting simple lock versions of */
3847 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3848 /* our purpose in holding it before was the multiple write case */
3849 /* we now use the boolean xfer_pending to do that. We can use */
3850 /* a boolean instead of a count because we have guaranteed single */
3851 /* file access to this code in its caller */
3852 VS_UNLOCK(vs);
3853 vs_changed:
3854 if (vs->vs_indirect) {
3855 unsigned int vsmap_size;
3856 int clmap_off;
3857 /* loop on indirect maps */
3858 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3859 vsmap = vs->vs_imap[i];
3860 if (vsmap == NULL)
3861 continue;
3862 /* loop on clusters in this indirect map */
3863 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3864 VSCLSIZE(vs) * i);
3865 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3866 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3867 else
3868 vsmap_size = CLMAP_ENTRIES;
3869 for (j = 0; j < vsmap_size; j++) {
3870 if (VSM_ISCLR(vsmap[j]) ||
3871 VSM_ISERR(vsmap[j]) ||
3872 (VSM_PS(vsmap[j]) != segment))
3873 continue;
3874 if(vs_cluster_transfer(vs,
3875 (vm_page_size * (j << vs->vs_clshift))
3876 + clmap_off,
3877 vm_page_size << vs->vs_clshift,
3878 upl)
3879 != KERN_SUCCESS) {
3880 VS_LOCK(vs);
3881 vs->vs_xfer_pending = FALSE;
3882 VS_UNLOCK(vs);
3883 vs_finish_write(vs);
3884 return KERN_FAILURE;
3885 }
3886 /* allow other readers/writers during transfer*/
3887 VS_LOCK(vs);
3888 vs->vs_xfer_pending = FALSE;
3889 VS_UNLOCK(vs);
3890 vs_finish_write(vs);
3891
3892 if (backing_store_abort_compaction || backing_store_stop_compaction) {
3893 backing_store_abort_compaction = FALSE;
3894 dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n"));
3895 return KERN_FAILURE;
3896 }
3897 vnode_pager_throttle();
3898
3899 VS_LOCK(vs);
3900 vs->vs_xfer_pending = TRUE;
3901 vs_wait_for_sync_writers(vs);
3902 vs_start_write(vs);
3903 vs_wait_for_readers(vs);
3904 VS_UNLOCK(vs);
3905 if (!(vs->vs_indirect)) {
3906 goto vs_changed;
3907 }
3908 }
3909 }
3910 } else {
3911 vsmap = vs->vs_dmap;
3912 if (vsmap == NULL) {
3913 VS_LOCK(vs);
3914 vs->vs_xfer_pending = FALSE;
3915 VS_UNLOCK(vs);
3916 vs_finish_write(vs);
3917 return KERN_SUCCESS;
3918 }
3919 /* loop on clusters in the direct map */
3920 for (j = 0; j < vs->vs_size; j++) {
3921 if (VSM_ISCLR(vsmap[j]) ||
3922 VSM_ISERR(vsmap[j]) ||
3923 (VSM_PS(vsmap[j]) != segment))
3924 continue;
3925 if(vs_cluster_transfer(vs,
3926 vm_page_size * (j << vs->vs_clshift),
3927 vm_page_size << vs->vs_clshift,
3928 upl) != KERN_SUCCESS) {
3929 VS_LOCK(vs);
3930 vs->vs_xfer_pending = FALSE;
3931 VS_UNLOCK(vs);
3932 vs_finish_write(vs);
3933 return KERN_FAILURE;
3934 }
3935 /* allow other readers/writers during transfer*/
3936 VS_LOCK(vs);
3937 vs->vs_xfer_pending = FALSE;
3938 VS_UNLOCK(vs);
3939 vs_finish_write(vs);
3940 VS_LOCK(vs);
3941 vs->vs_xfer_pending = TRUE;
3942 vs_wait_for_sync_writers(vs);
3943 vs_start_write(vs);
3944 vs_wait_for_readers(vs);
3945 VS_UNLOCK(vs);
3946 if (vs->vs_indirect) {
3947 goto vs_changed;
3948 }
3949 }
3950 }
3951
3952 VS_LOCK(vs);
3953 vs->vs_xfer_pending = FALSE;
3954 VS_UNLOCK(vs);
3955 vs_finish_write(vs);
3956 return KERN_SUCCESS;
3957 }
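
/*
 * The quiesce/resume protocol repeated above, condensed into a sketch
 * ("cluster_offset" stands for the byte offset of the cluster being
 * moved): every vs_cluster_transfer() call is bracketed by marking the
 * vstruct transfer-pending and waiting out writers and readers, then
 * the brackets are dropped so other activity can proceed between
 * clusters.
 *
 *	VS_LOCK(vs);
 *	vs->vs_xfer_pending = TRUE;
 *	vs_wait_for_sync_writers(vs);
 *	vs_start_write(vs);
 *	vs_wait_for_readers(vs);
 *	VS_UNLOCK(vs);
 *
 *	kr = vs_cluster_transfer(vs, cluster_offset,
 *				 vm_page_size << vs->vs_clshift, upl);
 *
 *	VS_LOCK(vs);
 *	vs->vs_xfer_pending = FALSE;
 *	VS_UNLOCK(vs);
 *	vs_finish_write(vs);
 */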
3958
3959
3960
3961 vs_map_t
3962 vs_get_map_entry(
3963 vstruct_t vs,
3964 dp_offset_t offset)
3965 {
3966 struct vs_map *vsmap;
3967 dp_offset_t cluster;
3968
3969 cluster = atop_32(offset) >> vs->vs_clshift;
3970 if (vs->vs_indirect) {
3971 long ind_block = cluster/CLMAP_ENTRIES;
3972
3973 /* Is the indirect block allocated? */
3974 vsmap = vs->vs_imap[ind_block];
3975 if(vsmap == (vs_map_t) NULL)
3976 return vsmap;
3977 } else
3978 vsmap = vs->vs_dmap;
3979 vsmap += cluster%CLMAP_ENTRIES;
3980 return vsmap;
3981 }
3982
3983 kern_return_t
3984 vs_cluster_transfer(
3985 vstruct_t vs,
3986 dp_offset_t offset,
3987 dp_size_t cnt,
3988 upl_t upl)
3989 {
3990 dp_offset_t actual_offset;
3991 paging_segment_t ps;
3992 struct clmap clmap;
3993 kern_return_t error = KERN_SUCCESS;
3994 unsigned int size, size_wanted;
3995 int i;
3996 unsigned int residual = 0;
3997 unsigned int unavail_size;
3998 // default_pager_thread_t *dpt;
3999 // boolean_t dealloc;
4000 struct vs_map *vsmap_ptr = NULL;
4001 struct vs_map read_vsmap;
4002 struct vs_map original_read_vsmap;
4003 struct vs_map write_vsmap;
4004 // upl_t sync_upl;
4005 // vm_offset_t ioaddr;
4006
4007 /* vs_cluster_transfer reads in the pages of a cluster and
4008 * then writes these pages back to new backing store. The
4009 * segment the pages are being read from is assumed to have
4010 * been taken off-line and is no longer considered for new
4011 * space requests.
4012 */
4013
4014 /*
4015 * This loop will be executed once per cluster referenced.
4016 * Typically this means once, since it's unlikely that the
4017 * VM system will ask for anything spanning cluster boundaries.
4018 *
4019 * If there are holes in a cluster (in a paging segment), we stop
4020 * reading at the hole, then loop again, hoping to
4021 * find valid pages later in the cluster. This continues until
4022 * the entire range has been examined and read, if present. The
4023 * pages are written as they are read. If a failure occurs after
4024 * some pages are written, the unmap call at the bottom of the loop
4025 * releases the new backing store and the old backing store remains
4026 * in effect.
4027 */
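/*
 * Illustrative sketch of the cluster arithmetic the loop below relies on
 * (the concrete numbers assume vm_page_size == 4096 and vs_clshift == 2):
 *
 *	clsize     = vm_page_size << vs_clshift;	16384 bytes per cluster
 *	clmask     = clsize - 1;			0x3fff
 *	within     = offset & clmask;			byte position inside the cluster
 *	local_size = clsize - within;			bytes left to the cluster boundary
 *
 * When (offset & clmask) == 0 the loop has crossed a cluster boundary, which
 * is the point at which write_vsmap/original_read_vsmap are flushed and reset.
 */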
4028
4029 VSM_CLR(write_vsmap);
4030 VSM_CLR(original_read_vsmap);
4031 /* grab the actual object's pages to sync with I/O */
4032 while (cnt && (error == KERN_SUCCESS)) {
4033 vsmap_ptr = vs_get_map_entry(vs, offset);
4034 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
4035
4036 if (actual_offset == (dp_offset_t) -1) {
4037
4038 /*
4039 * Nothing left to write in this cluster; at least
4040 * record the write cluster information for any previous
4041 * write and clear it for the next cluster, if there is one.
4042 */
4043 unsigned int local_size, clmask, clsize;
4044
4045 clsize = vm_page_size << vs->vs_clshift;
4046 clmask = clsize - 1;
4047 local_size = clsize - (offset & clmask);
4048 ASSERT(local_size);
4049 local_size = MIN(local_size, cnt);
4050
4051 /* This cluster has no data in it beyond what may */
4052 /* have been found on a previous iteration through */
4053 /* the loop and saved in "write_vsmap". */
4054 *vsmap_ptr = write_vsmap;
4055 VSM_CLR(write_vsmap);
4056 VSM_CLR(original_read_vsmap);
4057
4058 cnt -= local_size;
4059 offset += local_size;
4060 continue;
4061 }
4062
4063 /*
4064 * Count up contiguous available or unavailable
4065 * pages.
4066 */
4067 ps = CLMAP_PS(clmap);
4068 ASSERT(ps);
4069 size = 0;
4070 unavail_size = 0;
4071 for (i = 0;
4072 (size < cnt) && (unavail_size < cnt) &&
4073 (i < CLMAP_NPGS(clmap)); i++) {
4074 if (CLMAP_ISSET(clmap, i)) {
4075 if (unavail_size != 0)
4076 break;
4077 size += vm_page_size;
4078 BS_STAT(ps->ps_bs,
4079 ps->ps_bs->bs_pages_in++);
4080 } else {
4081 if (size != 0)
4082 break;
4083 unavail_size += vm_page_size;
4084 }
4085 }
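/*
 * At this point at most one of the two counters is non-zero: the scan above
 * stops at the first transition between present and absent pages, so either
 * "size" covers a leading run of pages backed in this cluster starting at
 * actual_offset, or "unavail_size" covers a leading run of holes, which are
 * simply unmapped and skipped below.
 */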
4086
4087 if (size == 0) {
4088 ASSERT(unavail_size);
4089 ps_clunmap(vs, offset, unavail_size);
4090 cnt -= unavail_size;
4091 offset += unavail_size;
4092 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
4093 == 0) {
4094 /* There is no more to transfer in this
4095 cluster
4096 */
4097 *vsmap_ptr = write_vsmap;
4098 VSM_CLR(write_vsmap);
4099 VSM_CLR(original_read_vsmap);
4100 }
4101 continue;
4102 }
4103
4104 if(VSM_ISCLR(original_read_vsmap))
4105 original_read_vsmap = *vsmap_ptr;
4106
4107 if(ps->ps_segtype == PS_PARTITION) {
4108 panic("swap partition not supported\n");
4109 /*NOTREACHED*/
4110 error = KERN_FAILURE;
4111 residual = size;
4112 /*
4113 NEED TO ISSUE WITH SYNC & NO COMMIT
4114 error = ps_read_device(ps, actual_offset, &buffer,
4115 size, &residual, flags);
4116 */
4117 } else {
4118 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
4119 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
4120 size, &residual,
4121 (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0)));
4122 }
4123
4124 read_vsmap = *vsmap_ptr;
4125
4126
4127 /*
4128 * Adjust counts and put the data in the new backing store. Optimize
4129 * for the common case, i.e. no error and no partial data.
4130 * If there was an error, then we need to error the entire
4131 * range, even if some data was successfully read.
4132 *
4133 */
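/*
 * The three outcomes handled below:
 *  1. error == KERN_SUCCESS and residual == 0: the whole run was read and is
 *     pushed to the new backing store with vs_cluster_write().
 *  2. error == KERN_SUCCESS but residual == size: nothing was read (treated
 *     as reading at or beyond EOF); the map entry is restored from
 *     original_read_vsmap and the range is errored.
 *  3. any other error or a partial read: also errored; the new backing store
 *     space is unmapped and the original map entry restored.
 */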
4134 if ((error == KERN_SUCCESS) && (residual == 0)) {
4135
4136 /*
4137 * Got everything we asked for, supply the data to
4138 * the new BS. Note that as a side effect of supplying
4139 * the data, the buffer holding the supplied data is
4140 * deallocated from the pager's address space unless
4141 * the write is unsuccessful.
4142 */
4143
4144 /* note: the buffer is cleaned up in all cases, either by */
4145 /* internal_cluster_write or, if the write fails, by the */
4146 /* vm_map_copy_page_discard call */
4147 *vsmap_ptr = write_vsmap;
4148
4149 if(vs_cluster_write(vs, upl, offset,
4150 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
4151 error = KERN_FAILURE;
4152 if(!(VSM_ISCLR(*vsmap_ptr))) {
4153 /* unmap the new backing store object */
4154 ps_clunmap(vs, offset, size);
4155 }
4156 /* original vsmap */
4157 *vsmap_ptr = original_read_vsmap;
4158 VSM_CLR(write_vsmap);
4159 } else {
4160 if((offset + size) &
4161 ((vm_page_size << vs->vs_clshift)
4162 - 1)) {
4163 /* There is more to transfer in this
4164 cluster
4165 */
4166 write_vsmap = *vsmap_ptr;
4167 *vsmap_ptr = read_vsmap;
4168 ps_clunmap(vs, offset, size);
4169 } else {
4170 /* discard the old backing object */
4171 write_vsmap = *vsmap_ptr;
4172 *vsmap_ptr = read_vsmap;
4173 ps_clunmap(vs, offset, size);
4174 *vsmap_ptr = write_vsmap;
4175 VSM_CLR(write_vsmap);
4176 VSM_CLR(original_read_vsmap);
4177 }
4178 }
4179 } else {
4180 size_wanted = size;
4181 if (error == KERN_SUCCESS) {
4182 if (residual == size) {
4183 /*
4184 * If a read operation returns no error
4185 * and no data moved, we turn it into
4186 * an error, assuming we're reading at
4187 * or beyond EOF.
4188 * Fall through and error the entire
4189 * range.
4190 */
4191 error = KERN_FAILURE;
4192 *vsmap_ptr = write_vsmap;
4193 if(!(VSM_ISCLR(*vsmap_ptr))) {
4194 /* unmap the new backing store object */
4195 ps_clunmap(vs, offset, size);
4196 }
4197 *vsmap_ptr = original_read_vsmap;
4198 VSM_CLR(write_vsmap);
4199 continue;
4200 } else {
4201 /*
4202 * Otherwise, we have a partial read.
4203 * This is also considered an error
4204 * for the purposes of cluster transfer.
4205 */
4206 error = KERN_FAILURE;
4207 *vsmap_ptr = write_vsmap;
4208 if(!(VSM_ISCLR(*vsmap_ptr))) {
4209 /* unmap the new backing store object */
4210 ps_clunmap(vs, offset, size);
4211 }
4212 *vsmap_ptr = original_read_vsmap;
4213 VSM_CLR(write_vsmap);
4214 continue;
4215 }
4216 }
4217
4218 }
4219 cnt -= size;
4220 offset += size;
4221
4222 } /* END while (cnt && (error == KERN_SUCCESS)) */
4223 if(!VSM_ISCLR(write_vsmap))
4224 *vsmap_ptr = write_vsmap;
4225
4226 return error;
4227 }
4228
4229 kern_return_t
4230 default_pager_add_file(
4231 MACH_PORT_FACE backing_store,
4232 vnode_ptr_t vp,
4233 int record_size,
4234 vm_size_t size)
4235 {
4236 backing_store_t bs;
4237 paging_segment_t ps;
4238 int i;
4239 unsigned int j;
4240 int error;
4241
4242 if ((bs = backing_store_lookup(backing_store))
4243 == BACKING_STORE_NULL)
4244 return KERN_INVALID_ARGUMENT;
4245
4246 PSL_LOCK();
4247 for (i = 0; i <= paging_segment_max; i++) {
4248 ps = paging_segments[i];
4249 if (ps == PAGING_SEGMENT_NULL)
4250 continue;
4251 if (ps->ps_segtype != PS_FILE)
4252 continue;
4253
4254 /*
4255 * Check that this vnode does not already back a paging segment.
4256 */
4257 if (ps->ps_vnode == (struct vnode *)vp) {
4258 PSL_UNLOCK();
4259 BS_UNLOCK(bs);
4260 return KERN_INVALID_ARGUMENT;
4261 }
4262 }
4263 PSL_UNLOCK();
4264
4265 /*
4266 * Set up the paging segment
4267 */
4268 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
4269 if (ps == PAGING_SEGMENT_NULL) {
4270 BS_UNLOCK(bs);
4271 return KERN_RESOURCE_SHORTAGE;
4272 }
4273
4274 ps->ps_segtype = PS_FILE;
4275 ps->ps_vnode = (struct vnode *)vp;
4276 ps->ps_offset = 0;
4277 ps->ps_record_shift = local_log2(vm_page_size / record_size);
4278 assert((dp_size_t) size == size);
4279 ps->ps_recnum = (dp_size_t) size;
4280 ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
4281
4282 ps->ps_pgcount = ps->ps_pgnum;
4283 ps->ps_clshift = local_log2(bs->bs_clsize);
4284 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
4285 ps->ps_special_clusters = 0;
4286 ps->ps_hint = 0;
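/*
 * Worked example of the geometry above (illustrative numbers only: a 4 KB
 * vm_page_size, 512-byte records and bs_clsize == 4 pages per cluster):
 *
 *	ps_record_shift = local_log2(4096 / 512) = 3   (8 records per page)
 *	ps_pgnum        = size >> 3
 *	ps_clshift      = local_log2(4) = 2
 *	ps_clcount      = ps_pgcount >> 2
 *
 * so a swap file presented as 262144 512-byte records (128 MB) yields
 * 32768 pages and 8192 clusters.
 */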
4287
4288 PS_LOCK_INIT(ps);
4289 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
4290 if (!ps->ps_bmap) {
4291 PS_LOCK_DESTROY(ps);
4292 kfree(ps, sizeof *ps);
4293 BS_UNLOCK(bs);
4294 return KERN_RESOURCE_SHORTAGE;
4295 }
4296 for (j = 0; j < ps->ps_ncls; j++) {
4297 clrbit(ps->ps_bmap, j);
4298 }
4299
4300 if(paging_segment_count == 0) {
4301 ps->ps_state = PS_EMERGENCY_SEGMENT;
4302 if(use_emergency_swap_file_first) {
4303 ps->ps_state |= PS_CAN_USE;
4304 }
4305 emergency_segment_backing_store = backing_store;
4306 } else {
4307 ps->ps_state = PS_CAN_USE;
4308 }
4309
4310 ps->ps_bs = bs;
4311
4312 if ((error = ps_enter(ps)) != 0) {
4313 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
4314 PS_LOCK_DESTROY(ps);
4315 kfree(ps, sizeof *ps);
4316 BS_UNLOCK(bs);
4317 return KERN_RESOURCE_SHORTAGE;
4318 }
4319
4320 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
4321 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
4322 PSL_LOCK();
4323 if(IS_PS_OK_TO_USE(ps)) {
4324 dp_pages_free += ps->ps_pgcount;
4325 } else {
4326 dp_pages_reserve += ps->ps_pgcount;
4327 }
4328 PSL_UNLOCK();
4329
4330 BS_UNLOCK(bs);
4331
4332 bs_more_space(ps->ps_clcount);
4333
4334 /*
4335 * If the paging segment being activated is not the emergency
4336 * segment and we notice that the emergency segment is being
4337 * used, then we help recover it. If all goes well, the
4338 * emergency segment will be back to its original state of
4339 * online but not activated (until it is needed again).
4340 */
4341 #if CONFIG_FREEZE
4342 if (!memorystatus_freeze_enabled)
4343 #endif
4344 {
4345 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4346 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4347 if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4348 dprintf(("Failed to recover emergency paging segment\n"));
4349 } else {
4350 dprintf(("Recovered emergency paging segment\n"));
4351 }
4352 }
4353 }
4354
4355 DP_DEBUG(DEBUG_BS_INTERNAL,
4356 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
4357 device, offset, (dp_size_t) size, record_size,
4358 ps->ps_record_shift, ps->ps_pgnum));
4359
4360 return KERN_SUCCESS;
4361 }
4362
4363
4364
4365 kern_return_t
4366 ps_read_file(
4367 paging_segment_t ps,
4368 upl_t upl,
4369 upl_offset_t upl_offset,
4370 dp_offset_t offset,
4371 upl_size_t size,
4372 unsigned int *residualp,
4373 int flags)
4374 {
4375 vm_object_offset_t f_offset;
4376 int error = 0;
4377 int result;
4378
4379 assert(dp_encryption_inited);
4380
4381 clustered_reads[atop_32(size)]++;
4382
4383 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4384
4385 /*
4386 * for transfer case we need to pass uploffset and flags
4387 */
4388 assert((upl_size_t) size == size);
4389 error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
4390
4391 /* The vnode_pagein semantic is somewhat at odds with the existing */
4392 /* device_read semantic. Partial reads are not experienced at this */
4393 /* level. It is up to the bit map code and cluster read code to */
4394 /* check that requested data locations are actually backed, and the */
4395 /* pagein code to either read all of the requested data or return an */
4396 /* error. */
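/*
 * Illustrative caller-side sketch (cf. vs_cluster_transfer above): because
 * vnode_pagein is all-or-nothing at this level, ps_read_file reports either
 * complete success with *residualp set to 0, or KERN_FAILURE with the
 * residual left as the caller initialized it.  The partial-read branches in
 * vs_cluster_transfer are therefore defensive:
 *
 *	error = ps_read_file(ps, upl, 0, actual_offset, size, &residual, flags);
 *	if (error == KERN_SUCCESS && residual == 0)
 *		push the run to the new backing store with vs_cluster_write
 *	else
 *		error the range and restore the original map entry
 */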
4397
4398 if (error)
4399 result = KERN_FAILURE;
4400 else {
4401 *residualp = 0;
4402 result = KERN_SUCCESS;
4403 }
4404 return result;
4405 }
4406
4407 kern_return_t
4408 ps_write_file(
4409 paging_segment_t ps,
4410 upl_t upl,
4411 upl_offset_t upl_offset,
4412 dp_offset_t offset,
4413 unsigned int size,
4414 int flags)
4415 {
4416 vm_object_offset_t f_offset;
4417 kern_return_t result;
4418
4419 assert(dp_encryption_inited);
4420
4421 clustered_writes[atop_32(size)]++;
4422 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4423
4424 if (flags & UPL_PAGING_ENCRYPTED) {
4425 /*
4426 * ENCRYPTED SWAP:
4427 * encrypt all the pages that we're going
4428 * to pageout.
4429 */
4430 upl_encrypt(upl, upl_offset, size);
4431 }
4432 assert((upl_size_t) size == size);
4433 if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
4434 result = KERN_FAILURE;
4435 else
4436 result = KERN_SUCCESS;
4437
4438 return result;
4439 }
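/*
 * Encrypted swap flow, as a sketch: callers derive the UPL_PAGING_ENCRYPTED
 * flag from the global dp_encryption switch (toggled via default_pager_triggers
 * with SWAP_ENCRYPT_ON / SWAP_ENCRYPT_OFF below), e.g.
 *
 *	flags = UPL_IOSYNC | UPL_NOCOMMIT |
 *		(dp_encryption ? UPL_PAGING_ENCRYPTED : 0);
 *
 * ps_write_file then encrypts the UPL contents with upl_encrypt() before
 * handing them to vnode_pageout(), and reads pass the same flag to mark the
 * on-disk data as encrypted.
 */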
4440
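/*
 * TRIM bookkeeping hooks.  In this configuration they are intentionally empty
 * stubs (every parameter is marked unused), kept so that callers can issue the
 * trim init/more/now sequence unconditionally without any effect on the vnode.
 */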
4441 static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data)
4442 {
4443 #pragma unused(data)
4444 }
4445
4446 static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data)
4447 {
4448 #pragma unused(data)
4449 }
4450
4451 static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
4452 {
4453 #pragma unused(data, map, shift, length)
4454 }
4455
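/*
 * default_pager_triggers() is the control interface through which a privileged
 * client registers notification ports and toggles pager-wide behaviour.
 * Summary of the flag protocol implemented below:
 *
 *	SWAP_ENCRYPT_ON / SWAP_ENCRYPT_OFF
 *		latch dp_encryption once; fails if the encryption state was
 *		already initialized.
 *	HI_WAT_ALERT
 *		register min_pages_trigger_port; hi_wat is given in bytes and
 *		converted to pages (rejected when memorystatus freeze is enabled).
 *	LO_WAT_ALERT
 *		register max_pages_trigger_port; lo_wat is likewise in bytes
 *		(also rejected when memorystatus freeze is enabled).
 *	USE_EMERGENCY_SWAP_FILE_FIRST
 *		set use_emergency_swap_file_first so the emergency segment is
 *		marked usable when it is created (see default_pager_add_file).
 *	SWAP_FILE_CREATION_ERROR
 *		note a swap file creation failure and run
 *		no_paging_space_action().
 *
 * A send right is released on the way out in every case: either trigger_port
 * itself when it is not retained, or the previously registered port that it
 * replaces.
 */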
4456 kern_return_t
4457 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
4458 int hi_wat,
4459 int lo_wat,
4460 int flags,
4461 MACH_PORT_FACE trigger_port)
4462 {
4463 MACH_PORT_FACE release = IPC_PORT_NULL;
4464 kern_return_t kr;
4465 clock_sec_t now;
4466 clock_nsec_t nanoseconds_dummy;
4467 static clock_sec_t error_notify = 0;
4468
4469 PSL_LOCK();
4470 if (flags == SWAP_ENCRYPT_ON) {
4471 /* ENCRYPTED SWAP: turn encryption on */
4472 release = trigger_port;
4473 if (!dp_encryption_inited) {
4474 dp_encryption_inited = TRUE;
4475 dp_encryption = TRUE;
4476 kr = KERN_SUCCESS;
4477 } else {
4478 kr = KERN_FAILURE;
4479 }
4480 } else if (flags == SWAP_ENCRYPT_OFF) {
4481 /* ENCRYPTED SWAP: turn encryption off */
4482 release = trigger_port;
4483 if (!dp_encryption_inited) {
4484 dp_encryption_inited = TRUE;
4485 dp_encryption = FALSE;
4486 kr = KERN_SUCCESS;
4487 } else {
4488 kr = KERN_FAILURE;
4489 }
4490 } else if (flags == HI_WAT_ALERT) {
4491 release = min_pages_trigger_port;
4492 #if CONFIG_FREEZE
4493 /* High and low water signals aren't applicable when freeze is */
4494 /* enabled, so release the trigger ports here and return */
4495 /* KERN_FAILURE. */
4496 if (memorystatus_freeze_enabled) {
4497 if (IP_VALID( trigger_port )){
4498 ipc_port_release_send( trigger_port );
4499 }
4500 min_pages_trigger_port = IPC_PORT_NULL;
4501 kr = KERN_FAILURE;
4502 }
4503 else
4504 #endif
4505 {
4506 min_pages_trigger_port = trigger_port;
4507 minimum_pages_remaining = hi_wat/vm_page_size;
4508 bs_low = FALSE;
4509 kr = KERN_SUCCESS;
4510 }
4511 } else if (flags == LO_WAT_ALERT) {
4512 release = max_pages_trigger_port;
4513 #if CONFIG_FREEZE
4514 if (memorystatus_freeze_enabled) {
4515 if (IP_VALID( trigger_port )){
4516 ipc_port_release_send( trigger_port );
4517 }
4518 max_pages_trigger_port = IPC_PORT_NULL;
4519 kr = KERN_FAILURE;
4520 }
4521 else
4522 #endif
4523 {
4524 max_pages_trigger_port = trigger_port;
4525 maximum_pages_free = lo_wat/vm_page_size;
4526 kr = KERN_SUCCESS;
4527 }
4528 } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4529 use_emergency_swap_file_first = TRUE;
4530 release = trigger_port;
4531 kr = KERN_SUCCESS;
4532 } else if (flags == SWAP_FILE_CREATION_ERROR) {
4533 release = trigger_port;
4534 kr = KERN_SUCCESS;
4535 if( paging_segment_count == 1) {
4536 use_emergency_swap_file_first = TRUE;
4537 }
4538 no_paging_space_action();
4539 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4540 if (now > error_notify + 5) {
4541 dprintf(("Swap File Error.\n"));
4542 error_notify = now;
4543 }
4544 } else {
4545 release = trigger_port;
4546 kr = KERN_INVALID_ARGUMENT;
4547 }
4548 PSL_UNLOCK();
4549
4550 if (IP_VALID(release))
4551 ipc_port_release_send(release);
4552
4553 return kr;
4554 }
4555
4556 /*
4557 * Monitor the amount of available backing store vs. the amount of
4558 * required backing store, and notify a listener (if present) when
4559 * backing store may safely be removed.
4560 *
4561 * We attempt to avoid the situation where backing store is
4562 * discarded en masse, as this can lead to thrashing as the
4563 * backing store is compacted.
4564 */
4565
4566 #define PF_INTERVAL 3 /* time between free level checks */
4567 #define PF_LATENCY 10 /* number of intervals before release */
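/*
 * Worked example of the release rate these constants impose: the monitor
 * callout runs every PF_INTERVAL (3) seconds and only raises LO_WAT_ALERT
 * after more than PF_LATENCY (10) consecutive intervals with dp_pages_free
 * above maximum_pages_free, i.e. after roughly 30+ seconds of sustained
 * surplus; the low count is reset after each notification, so at most one
 * backing store object is released per such window.
 */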
4568
4569 static int dp_pages_free_low_count = 0;
4570 thread_call_t default_pager_backing_store_monitor_callout;
4571
4572 void
4573 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4574 __unused thread_call_param_t p2)
4575 {
4576 // unsigned long long average;
4577 ipc_port_t trigger;
4578 uint64_t deadline;
4579
4580 /*
4581 * We determine whether it will be safe to release some
4582 * backing store by watching the free page level. If
4583 * it remains above the maximum_pages_free threshold for
4584 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
4585 * then we deem it safe.
4586 *
4587 * Note that this establishes a maximum rate at which backing
4588 * store will be released, as each notification (currently)
4589 * only results in a single backing store object being
4590 * released.
4591 */
4592 if (dp_pages_free > maximum_pages_free) {
4593 dp_pages_free_low_count++;
4594 } else {
4595 dp_pages_free_low_count = 0;
4596 }
4597
4598 /* decide whether to send notification */
4599 trigger = IP_NULL;
4600 if (max_pages_trigger_port &&
4601 (backing_store_release_trigger_disable == 0) &&
4602 (dp_pages_free_low_count > PF_LATENCY)) {
4603 trigger = max_pages_trigger_port;
4604 max_pages_trigger_port = NULL;
4605 }
4606
4607 /* send notification */
4608 if (trigger != IP_NULL) {
4609 VSL_LOCK();
4610 if(backing_store_release_trigger_disable != 0) {
4611 assert_wait((event_t)
4612 &backing_store_release_trigger_disable,
4613 THREAD_UNINT);
4614 VSL_UNLOCK();
4615 thread_block(THREAD_CONTINUE_NULL);
4616 } else {
4617 VSL_UNLOCK();
4618 }
4619 dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));
4620
4621 default_pager_space_alert(trigger, LO_WAT_ALERT);
4622 ipc_port_release_send(trigger);
4623 dp_pages_free_low_count = 0;
4624 }
4625
4626 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4627 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4628 }
4629
4630 #if CONFIG_FREEZE
4631 unsigned int default_pager_swap_pages_free(void) {
4632 return dp_pages_free;
4633 }
4634 #endif