/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * File:	kern/task_swap.c
 *
 * Task residency management primitives implementation.
 */
#include <mach_assert.h>
#include <task_swapper.h>

#include <kern/lock.h>
#include <kern/queue.h>
#include <kern/host.h>
#include <kern/task.h>
#include <kern/task_swap.h>
#include <kern/thread.h>
#include <kern/thread_swap.h>
#include <kern/host_statistics.h>
#include <kern/misc_protos.h>
#include <kern/assert.h>
#include <mach/policy.h>

#include <ipc/ipc_port.h>	/* We use something from in here */
/*
 * Note: if TASK_SWAPPER is disabled, then this file defines only
 * a stub version of task_swappable(), so that the service can always
 * be defined, even if swapping has been configured out of the kernel.
 */
#if	TASK_SWAPPER

/* temporary debug flags */
#define TASK_SW_DEBUG	1
#define TASK_SW_STATS	1

int task_swap_debug = 0;
int task_swap_stats = 0;
int task_swap_enable = 1;
queue_head_t	swapped_tasks;		/* completely swapped out tasks */
queue_head_t	swapout_thread_q;	/* threads to be swapped out */
mutex_t		task_swapper_lock;	/* protects above queues */

#define task_swapper_lock()	mutex_lock(&task_swapper_lock)
#define task_swapper_unlock()	mutex_unlock(&task_swapper_lock)
#define task_swapper_wakeup()	thread_wakeup((event_t)&swapout_thread_q)
#define task_swapper_sleep()	thread_sleep_mutex((event_t)&swapout_thread_q,	\
					&task_swapper_lock, THREAD_UNINT)
queue_head_t	eligible_tasks;		/* tasks eligible for swapout */
mutex_t		task_swapout_list_lock;	/* protects above queue */
#define task_swapout_lock()	mutex_lock(&task_swapout_list_lock)
#define task_swapout_unlock()	mutex_unlock(&task_swapout_list_lock)
/*
 * The next section of constants and globals are tunable parameters
 * used in making swapping decisions.  They may be changed dynamically
 * without adversely affecting the robustness of the system; however,
 * the policy will change, one way or the other.
 */
#define SHORT_AVG_INTERVAL	5	/* in seconds */
#define LONG_AVG_INTERVAL	30	/* in seconds */
#define AVE_SCALE		1024

unsigned int short_avg_interval = SHORT_AVG_INTERVAL;
unsigned int long_avg_interval = LONG_AVG_INTERVAL;
#ifndef MIN_SWAP_PAGEOUT_RATE
#define MIN_SWAP_PAGEOUT_RATE	10
#endif
/*
 * The following are all stored in fixed-point representation (the actual
 * value times AVE_SCALE), to allow more accurate computing of decaying
 * averages.  So all variables that end with "avg" must be divided by
 * AVE_SCALE to convert them or compare them to ints.
 */
unsigned int vm_grab_rate_avg;
unsigned int vm_pageout_rate_avg = MIN_SWAP_PAGEOUT_RATE * AVE_SCALE;
unsigned int vm_pageout_rate_longavg = MIN_SWAP_PAGEOUT_RATE * AVE_SCALE;
unsigned int vm_pageout_rate_peakavg = MIN_SWAP_PAGEOUT_RATE * AVE_SCALE;
unsigned int vm_page_free_avg;		/* average free pages over short_avg_interval */
unsigned int vm_page_free_longavg;	/* average free pages over long_avg_interval */
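
/*
 * Illustrative sketch (added commentary, not part of the original source):
 * how the fixed-point convention above is used.  TO_AVG and FROM_AVG are
 * hypothetical names; the conversions are done inline elsewhere in this
 * file.
 */
#if 0	/* example only */
#define TO_AVG(x)	((x) * AVE_SCALE)	/* int -> fixed point */
#define FROM_AVG(x)	((x) / AVE_SCALE)	/* fixed point -> int */
/*
 * e.g. compare a smoothed paging rate against an integer threshold:
 *	if (FROM_AVG(vm_pageout_rate_avg) > MIN_SWAP_PAGEOUT_RATE) ...
 */
#endif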
/*
 * Trigger task swapping when paging activity reaches
 * SWAP_PAGEOUT_HIGH_WATER_MARK percent of the maximum paging activity
 * ever observed.
 * Turn off task swapping when paging activity goes back down to below
 * SWAP_PAGEOUT_LOW_WATER_MARK percent of the maximum.
 * These numbers have been found empirically and might need some tuning...
 */
#ifndef SWAP_PAGEOUT_HIGH_WATER_MARK
#define SWAP_PAGEOUT_HIGH_WATER_MARK	30
#endif
#ifndef SWAP_PAGEOUT_LOW_WATER_MARK
#define SWAP_PAGEOUT_LOW_WATER_MARK	10
#endif

#ifndef MAX_GRAB_RATE
#define MAX_GRAB_RATE	((unsigned int) -1)	/* XXX no maximum */
#endif
/*
 * swap_{start,stop}_pageout_rate start at the minimum value, then increase
 * to adjust to the hardware's performance, following the paging rate peaks.
 */
unsigned int swap_pageout_high_water_mark = SWAP_PAGEOUT_HIGH_WATER_MARK;
unsigned int swap_pageout_low_water_mark = SWAP_PAGEOUT_LOW_WATER_MARK;
unsigned int swap_start_pageout_rate = MIN_SWAP_PAGEOUT_RATE * AVE_SCALE *
					SWAP_PAGEOUT_HIGH_WATER_MARK / 100;
unsigned int swap_stop_pageout_rate = MIN_SWAP_PAGEOUT_RATE * AVE_SCALE *
					SWAP_PAGEOUT_LOW_WATER_MARK / 100;
#if	TASK_SW_DEBUG
unsigned int fixed_swap_start_pageout_rate = 0;	/* only for testing purposes */
unsigned int fixed_swap_stop_pageout_rate = 0;	/* only for testing purposes */
#endif	/* TASK_SW_DEBUG */
unsigned int max_grab_rate = MAX_GRAB_RATE;
#ifndef MIN_SWAP_TIME
#define MIN_SWAP_TIME	1
#endif

int min_swap_time = MIN_SWAP_TIME;			/* in seconds */

#ifndef MIN_RES_TIME
#define MIN_RES_TIME	6
#endif

int min_res_time = MIN_RES_TIME;			/* in seconds */

#ifndef MIN_ACTIVE_TASKS
#define MIN_ACTIVE_TASKS	4
#endif

int min_active_tasks = MIN_ACTIVE_TASKS;

#ifndef TASK_SWAP_CYCLE_TIME
#define TASK_SWAP_CYCLE_TIME	2
#endif

int task_swap_cycle_time = TASK_SWAP_CYCLE_TIME;	/* in seconds */

int last_task_swap_cycle = 0;
/* temporary statistics */
int task_swapouts = 0;
int task_swapins = 0;
int task_swaprss_out = 0;	/* total rss at swapout time */
int task_swaprss_in = 0;	/* total rss at swapin time */
int task_swap_total_time = 0;	/* total time spent swapped out */
int tasks_swapped_out = 0;	/* number of tasks swapped out now */

#if	TASK_SW_STATS
#define TASK_STATS_INCR(cnt)	(cnt)++
#else
#define TASK_STATS_INCR(cnt)
#endif	/* TASK_SW_STATS */
#if	TASK_SW_DEBUG
boolean_t on_swapped_list(task_t task);	/* forward */

/*
 * Debug function to determine if a task is already on the
 * swapped out tasks list.  It also checks for tasks on the list
 * that are in an illegal state (i.e. swapped in).
 */
boolean_t
on_swapped_list(task_t task)
{
	task_t ltask;
	/* task_swapper_lock is locked. */

	if (queue_empty(&swapped_tasks)) {
		return(FALSE);
	}
	ltask = (task_t)queue_first(&swapped_tasks);
	while (!queue_end(&swapped_tasks, (queue_entry_t)ltask)) {
		/* check for illegal state */
		if (ltask->swap_state == TASK_SW_IN) {
			printf("on_swapped_list and in: 0x%X\n",ltask);
		}
		if (ltask == task)
			return(TRUE);
		ltask = (task_t)queue_next(&ltask->swapped_tasks);
	}
	return(FALSE);
}
#endif	/* TASK_SW_DEBUG */
/*
 *	task_swapper_init: [exported]
 */
void
task_swapper_init(void)
{
	queue_init(&swapped_tasks);
	queue_init(&eligible_tasks);
	queue_init(&swapout_thread_q);
	mutex_init(&task_swapper_lock, ETAP_THREAD_TASK_SWAP);
	mutex_init(&task_swapout_list_lock, ETAP_THREAD_TASK_SWAPOUT);
	vm_page_free_avg = vm_page_free_count * AVE_SCALE;
	vm_page_free_longavg = vm_page_free_count * AVE_SCALE;
}

#endif	/* TASK_SWAPPER */
/*
 * task_swappable:	[exported]
 *
 * Make a task swappable or non-swappable.  If made non-swappable,
 * it will be swapped in.
 *
 * Locking: task_swapout_lock is taken before task lock.
 */
kern_return_t
task_swappable(
	host_priv_t host_priv,
	task_t task,
	boolean_t make_swappable)
{
	if (host_priv == HOST_PRIV_NULL)
		return(KERN_INVALID_ARGUMENT);

	if (task == TASK_NULL)
		return(KERN_INVALID_ARGUMENT);

#if	!TASK_SWAPPER
	/*
	 * If we don't support swapping, this call is purely advisory.
	 */
	return(KERN_SUCCESS);
#else	/* TASK_SWAPPER */
	task_lock(task);
	if (make_swappable) {
		/* make task swappable */
		if (task->swap_state == TASK_SW_UNSWAPPABLE) {
			task->swap_state = TASK_SW_IN;
			task_unlock(task);
			task_swapout_eligible(task);
		} else
			task_unlock(task);
	} else {
		switch (task->swap_state) {
		    case TASK_SW_IN:
			task->swap_state = TASK_SW_UNSWAPPABLE;
			task_unlock(task);
			task_swapout_ineligible(task);
			break;
		    case TASK_SW_UNSWAPPABLE:
			task_unlock(task);
			break;
		    default:
			/*
			 * swap_state could be TASK_SW_OUT, TASK_SW_GOING_OUT,
			 * or TASK_SW_COMING_IN.  task_swapin handles all
			 * three, and its default case will catch any bad
			 * state.
			 */
			task_unlock(task);
			task_swapin(task, TRUE);
			break;
		}
	}
	return(KERN_SUCCESS);
#endif	/* TASK_SWAPPER */
}
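
/*
 * Illustrative usage sketch (added, not part of the original source):
 * a privileged kernel caller pins a task in memory and later undoes it.
 * host_priv_self() is the usual way such a caller names the privileged
 * host port; treat the surrounding details as assumptions.
 */
#if 0	/* example only */
	kern_return_t kr;

	/* make non-swappable: swaps the task in first if needed */
	kr = task_swappable(host_priv_self(), task, FALSE);
	assert(kr == KERN_SUCCESS);
	/* ... critical section: task stays resident ... */
	kr = task_swappable(host_priv_self(), task, TRUE);	/* undo */
#endif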
#if	TASK_SWAPPER

/*
 * task_swapout:
 *
 * A reference to the task must be held.
 *
 * Start swapping out a task by sending an AST_SWAPOUT to each thread.
 * When the threads reach a clean point, they queue themselves up on the
 * swapout_thread_q to be swapped out by the task_swap_swapout_thread.
 * The task can be swapped in at any point in this process.
 *
 * A task will not be fully swapped out (i.e. its map residence count
 * at zero) until all currently-swapped threads run and reach
 * a clean point, at which time they will be swapped again,
 * decrementing the swap_ast_waiting count on the task.
 *
 * Locking: no locks held upon entry and exit.
 *	    Task_lock is held throughout this function.
 */
kern_return_t
task_swapout(task_t task)
{
	thread_act_t thr_act;
	thread_t thread;
	queue_head_t *list;
	task_swapout_lock();
	task_lock(task);
	/*
	 * NOTE: look into turning these into assertions if they
	 * are invariants.
	 */
	if ((task->swap_state != TASK_SW_IN) || (!task->active)) {
		task_unlock(task);
		task_swapout_unlock();
		return(KERN_FAILURE);
	}
	if (task->swap_flags & TASK_SW_ELIGIBLE) {
		queue_remove(&eligible_tasks, task, task_t, swapped_tasks);
		task->swap_flags &= ~TASK_SW_ELIGIBLE;
	}
	task_swapout_unlock();

	/* set state to avoid races with task_swappable(FALSE) */
	task->swap_state = TASK_SW_GOING_OUT;
	task->swap_rss = pmap_resident_count(task->map->pmap);
	task_swaprss_out += task->swap_rss;
	task->swap_ast_waiting = task->thr_act_count;
	/*
	 * halt all threads in this task:
	 * We don't need the thread list lock for traversal.
	 */
	list = &task->thr_acts;
	thr_act = (thread_act_t) queue_first(list);
	while (!queue_end(list, (queue_entry_t) thr_act)) {
		boolean_t swappable;
		thread_act_t ract;

		thread = act_lock_thread(thr_act);
		thread_lock(thread);
		swappable = (thr_act->swap_state != TH_SW_UNSWAPPABLE);
		for (ract = thread->top_act; ract; ract = ract->lower)
			if (ract->swap_state == TH_SW_UNSWAPPABLE) {
				swappable = FALSE;
				break;
			}
		if (swappable)
			thread_ast_set(thr_act, AST_SWAPOUT);
		thread_unlock(thread);

		assert((thr_act->ast & AST_TERMINATE) == 0);
		act_unlock_thread(thr_act);
		thr_act = (thread_act_t) queue_next(&thr_act->thr_acts);
	}

	task->swap_stamp = sched_tick;
	task->swap_nswap++;
	assert((task->swap_flags & TASK_SW_WANT_IN) == 0);
	/* put task on the queue of swapped out tasks */
	task_swapper_lock();
#if	TASK_SW_DEBUG
	if (task_swap_debug && on_swapped_list(task)) {
		printf("task 0x%X already on list\n", task);
	}
#endif	/* TASK_SW_DEBUG */
	queue_enter(&swapped_tasks, task, task_t, swapped_tasks);
	tasks_swapped_out++;
	task_swapouts++;
	task_swapper_unlock();
	task_unlock(task);

	return(KERN_SUCCESS);
}
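
/*
 * Summary sketch (added commentary, not in the original): the task
 * swap_state transitions driven by this file are
 *
 *	TASK_SW_IN -> TASK_SW_GOING_OUT		(task_swapout)
 *	TASK_SW_GOING_OUT -> TASK_SW_OUT	(thread_swapout_enqueue,
 *						 last AST_SWAPOUT thread)
 *	TASK_SW_OUT / TASK_SW_GOING_OUT
 *	    -> TASK_SW_COMING_IN -> TASK_SW_IN	(task_swapin)
 *	TASK_SW_IN <-> TASK_SW_UNSWAPPABLE	(task_swappable)
 */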
#if	TASK_SW_STATS
int task_sw_race_in = 0;
int task_sw_race_coming_in = 0;
int task_sw_race_going_out = 0;
int task_sw_before_ast = 0;
int task_sw_before_swap = 0;
int task_sw_after_swap = 0;
int task_sw_race_in_won = 0;
int task_sw_unswappable = 0;
int task_sw_act_inactive = 0;
#endif	/* TASK_SW_STATS */
/*
 * thread_swapout_enqueue is called by thread_halt_self when it
 * processes AST_SWAPOUT to enqueue threads to be swapped out.
 * It must be called at normal interrupt priority for the
 * sake of the task_swapper_lock.
 *
 * There can be races with task swapin here.
 * First lock task and decrement swap_ast_waiting count, and if
 * it's 0, we can decrement the residence count on the task's map
 * and set the task's swap state to TASK_SW_OUT.
 */
void
thread_swapout_enqueue(thread_act_t thr_act)
{
	task_t task = thr_act->task;

	/*
	 * If the swap_state is not TASK_SW_GOING_OUT, then
	 * task_swapin has beaten us to this operation, and
	 * we have nothing to do.
	 */
	task_lock(task);
	if (task->swap_state != TASK_SW_GOING_OUT) {
		task_unlock(task);
		return;
	}
	if (--task->swap_ast_waiting == 0) {
		vm_map_t map = task->map;
		task->swap_state = TASK_SW_OUT;
		task_unlock(task);
		mutex_lock(&map->s_lock);
		vm_map_res_deallocate(map);
		mutex_unlock(&map->s_lock);
	} else
		task_unlock(task);

	task_swapper_lock();
	act_lock(thr_act);
	if (! (thr_act->swap_state & TH_SW_TASK_SWAPPING)) {
		/*
		 * We lost a race with task_swapin(): don't enqueue.
		 */
	} else {
		queue_enter(&swapout_thread_q, thr_act,
			    thread_act_t, swap_queue);
		task_swapper_wakeup();
	}
	act_unlock(thr_act);
	task_swapper_unlock();
}
/*
 * task_swap_swapout_thread: [exported]
 *
 * Executes as a separate kernel thread.
 * Its job is to swap out threads that have been halted by AST_SWAPOUT.
 */
void
task_swap_swapout_thread(void)
{
	thread_act_t thr_act;
	thread_t thread, nthread;
	task_t task;
	int s;

	thread_swappable(current_act(), FALSE);
	stack_privilege(current_thread());

	task_swapper_lock();
	while (TRUE) {
		while (! queue_empty(&swapout_thread_q)) {

			queue_remove_first(&swapout_thread_q, thr_act,
					   thread_act_t, swap_queue);
			/*
			 * If we're racing with task_swapin, we need
			 * to make it safe for it to do remque on the
			 * thread, so make its links point to itself.
			 * Allowing this ugliness is cheaper than
			 * making task_swapin search the entire queue.
			 */
			queue_init((queue_t) &thr_act->swap_queue);
			task_swapper_unlock();
			/*
			 * Wait for thread's RUN bit to be deasserted.
			 */
			thread = act_lock_thread(thr_act);
			if (thread == THREAD_NULL)
				act_unlock_thread(thr_act);
			else {
				boolean_t r;

				thread_reference(thread);
				thread_hold(thr_act);
				act_unlock_thread(thr_act);
				r = thread_stop_wait(thread);
				nthread = act_lock_thread(thr_act);
				thread_release(thr_act);
				thread_deallocate(thread);
				act_unlock_thread(thr_act);
				if (!r || nthread != thread) {
					task_swapper_lock();
					continue;
				}
			}
			task = thr_act->task;
			task_lock(task);
			/*
			 * we can race with swapin, which would set the
			 * state to TASK_SW_IN.
			 */
			if ((task->swap_state != TASK_SW_OUT) &&
			    (task->swap_state != TASK_SW_GOING_OUT)) {
				task_unlock(task);
				task_swapper_lock();
				TASK_STATS_INCR(task_sw_race_in_won);
				if (thread != THREAD_NULL)
					thread_unstop(thread);
				continue;
			}
			nthread = act_lock_thread(thr_act);
			if (nthread != thread || thr_act->active == FALSE) {
				act_unlock_thread(thr_act);
				task_unlock(task);
				task_swapper_lock();
				TASK_STATS_INCR(task_sw_act_inactive);
				if (thread != THREAD_NULL)
					thread_unstop(thread);
				continue;
			}
			s = splsched();
			if (thread != THREAD_NULL)
				thread_lock(thread);
			/*
			 * Thread cannot have been swapped out yet because
			 * TH_SW_TASK_SWAPPING was set in AST.  If task_swapin
			 * beat us here, we either wouldn't have found it on
			 * the queue, or the task->swap_state would have
			 * changed.  The synchronization is on the
			 * task's swap_state and the task_lock.
			 * The thread can't be swapped in any other way
			 * because its task has been swapped.
			 */
			assert(thr_act->swap_state & TH_SW_TASK_SWAPPING);
			assert(thread == THREAD_NULL ||
			       !(thread->state & (TH_SWAPPED_OUT|TH_RUN)));
			assert((thr_act->swap_state & TH_SW_STATE) == TH_SW_IN);
			/* assert(thread->state & TH_HALTED); */
			/* this also clears TH_SW_TASK_SWAPPING flag */
			thr_act->swap_state = TH_SW_GOING_OUT;
			if (thread != THREAD_NULL) {
				if (thread->top_act == thr_act) {
					thread->state |= TH_SWAPPED_OUT;
					/*
					 * Once we unlock the task, things can happen
					 * to the thread, so make sure it's consistent
					 * for thread_swapout.
					 */
				}
				thread->ref_count++;
				thread_unlock(thread);
				thread_unstop(thread);
			}
			splx(s);
			act_locked_act_reference(thr_act);
			act_unlock_thread(thr_act);
			task_unlock(task);

			thread_swapout(thr_act);	/* do the work */

			if (thread != THREAD_NULL)
				thread_deallocate(thread);
			act_deallocate(thr_act);
			task_swapper_lock();
		}
		task_swapper_sleep();
	}
}
/*
 * task_swapin:
 *
 * Make a task resident.
 * Performs all of the work to make a task resident and possibly
 * non-swappable.  If we race with a competing task_swapin call,
 * we wait for its completion, then return.
 *
 * Locking: no locks held upon entry and exit.
 *
 * Note that TASK_SW_MAKE_UNSWAPPABLE can only be set when the
 * state is TASK_SW_COMING_IN.
 */
kern_return_t
task_swapin(task_t task, boolean_t make_unswappable)
{
	register queue_head_t	*list;
	register thread_act_t	thr_act, next;
	thread_t		thread;
	int			s;
	boolean_t		swappable = TRUE;

	task_lock(task);
	switch (task->swap_state) {
	    case TASK_SW_OUT:
	    {
		vm_map_t map = task->map;
		/*
		 * Task has made it all the way out, which means
		 * that vm_map_res_deallocate has been done; set
		 * state to TASK_SW_COMING_IN, then bring map
		 * back in.  We could actually be racing with
		 * the thread_swapout_enqueue, which does the
		 * vm_map_res_deallocate, but that race is covered.
		 */
		task->swap_state = TASK_SW_COMING_IN;
		assert(task->swap_ast_waiting == 0);
		assert(map->res_count >= 0);
		task_unlock(task);
		mutex_lock(&map->s_lock);
		vm_map_res_reference(map);
		mutex_unlock(&map->s_lock);
		task_lock(task);
		assert(task->swap_state == TASK_SW_COMING_IN);
		break;
	    }

	    case TASK_SW_GOING_OUT:
		/*
		 * Task isn't all the way out yet.  There is
		 * still at least one thread not swapped, and
		 * vm_map_res_deallocate has not been done.
		 */
		task->swap_state = TASK_SW_COMING_IN;
		assert(task->swap_ast_waiting > 0 ||
		       (task->swap_ast_waiting == 0 &&
			task->thr_act_count == 0));
		assert(task->map->res_count > 0);
		TASK_STATS_INCR(task_sw_race_going_out);
		break;

	    case TASK_SW_IN:
		assert(task->map->res_count > 0);
#if	TASK_SW_DEBUG
		task_swapper_lock();
		if (task_swap_debug && on_swapped_list(task)) {
			printf("task 0x%X on list, state is SW_IN\n",
			       task);
		}
		task_swapper_unlock();
#endif	/* TASK_SW_DEBUG */
		TASK_STATS_INCR(task_sw_race_in);
		if (make_unswappable) {
			task->swap_state = TASK_SW_UNSWAPPABLE;
			task_unlock(task);
			task_swapout_ineligible(task);
		} else
			task_unlock(task);
		return(KERN_SUCCESS);

	    case TASK_SW_COMING_IN:
		/*
		 * Raced with another task_swapin and lost;
		 * wait for other one to complete first
		 */
		assert(task->map->res_count >= 0);
		/*
		 * set MAKE_UNSWAPPABLE so that whoever is swapping
		 * the task in will make it unswappable, and return
		 */
		if (make_unswappable)
			task->swap_flags |= TASK_SW_MAKE_UNSWAPPABLE;
		task->swap_flags |= TASK_SW_WANT_IN;
		assert_wait((event_t)&task->swap_state, THREAD_UNINT);
		task_unlock(task);
		thread_block(THREAD_CONTINUE_NULL);
		TASK_STATS_INCR(task_sw_race_coming_in);
		return(KERN_SUCCESS);

	    case TASK_SW_UNSWAPPABLE:
		/*
		 * This can happen, since task_terminate
		 * unconditionally calls task_swapin.
		 */
		task_unlock(task);
		return(KERN_SUCCESS);

	    default:
		panic("task_swapin bad state");
		break;
	}

	if (make_unswappable)
		task->swap_flags |= TASK_SW_MAKE_UNSWAPPABLE;
	assert(task->swap_state == TASK_SW_COMING_IN);
	task_swapper_lock();
#if	TASK_SW_DEBUG
	if (task_swap_debug && !on_swapped_list(task)) {
		printf("task 0x%X not on list\n", task);
	}
#endif	/* TASK_SW_DEBUG */
	queue_remove(&swapped_tasks, task, task_t, swapped_tasks);
	tasks_swapped_out--;
	task_swapins++;
	task_swapper_unlock();

	/*
	 * Iterate through all threads for this task and
	 * release them, as required.  They may not have been swapped
	 * out yet.  The task remains locked throughout.
	 */
	list = &task->thr_acts;
	thr_act = (thread_act_t) queue_first(list);
	while (!queue_end(list, (queue_entry_t) thr_act)) {
		boolean_t need_to_release;
		next = (thread_act_t) queue_next(&thr_act->thr_acts);
		/*
		 * Keep task_swapper_lock across thread handling
		 * to synchronize with task_swap_swapout_thread
		 */
		task_swapper_lock();
		thread = act_lock_thread(thr_act);
		s = splsched();
		if (thr_act->ast & AST_SWAPOUT) {
			/* thread hasn't gotten the AST yet, just clear it */
			thread_ast_clear(thr_act, AST_SWAPOUT);
			need_to_release = FALSE;
			TASK_STATS_INCR(task_sw_before_ast);
			splx(s);
			act_unlock_thread(thr_act);
		} else {
			/*
			 * If AST_SWAPOUT was cleared, then thread_hold,
			 * or equivalent was done.
			 */
			need_to_release = TRUE;
			/*
			 * Thread has hit AST, but it may not have
			 * been dequeued yet, so we need to check.
			 * NOTE: the thread may have been dequeued, but
			 * has not yet been swapped (the task_swapper_lock
			 * has been dropped, but the thread is not yet
			 * locked), and the TH_SW_TASK_SWAPPING flag may
			 * not have been cleared.  In this case, we will do
			 * an extra remque, which the task_swap_swapout_thread
			 * has made safe, and clear the flag, which is also
			 * checked by the t_s_s_t before doing the swapout.
			 */
			if (thread)
				thread_lock(thread);
			if (thr_act->swap_state & TH_SW_TASK_SWAPPING) {
				/*
				 * hasn't yet been dequeued for swapout,
				 * so clear flags and dequeue it first.
				 */
				thr_act->swap_state &= ~TH_SW_TASK_SWAPPING;
				assert(thr_act->thread == THREAD_NULL ||
				       !(thr_act->thread->state &
					 TH_SWAPPED_OUT));
				queue_remove(&swapout_thread_q, thr_act,
					     thread_act_t, swap_queue);
				TASK_STATS_INCR(task_sw_before_swap);
			} else {
				TASK_STATS_INCR(task_sw_after_swap);
				/*
				 * It's possible that the thread was
				 * made unswappable before hitting the
				 * AST, in which case it's still running.
				 */
				if (thr_act->swap_state == TH_SW_UNSWAPPABLE) {
					need_to_release = FALSE;
					TASK_STATS_INCR(task_sw_unswappable);
				}
			}
			if (thread)
				thread_unlock(thread);
			splx(s);
			act_unlock_thread(thr_act);
		}
		task_swapper_unlock();

		/*
		 * thread_release will swap in the thread if it's been
		 * swapped out.
		 */
		if (need_to_release) {
			act_lock_thread(thr_act);
			thread_release(thr_act);
			act_unlock_thread(thr_act);
		}
		thr_act = next;
	}

	if (task->swap_flags & TASK_SW_MAKE_UNSWAPPABLE) {
		task->swap_flags &= ~TASK_SW_MAKE_UNSWAPPABLE;
		task->swap_state = TASK_SW_UNSWAPPABLE;
		swappable = FALSE;
	} else {
		task->swap_state = TASK_SW_IN;
	}

	task_swaprss_in += pmap_resident_count(task->map->pmap);
	task_swap_total_time += sched_tick - task->swap_stamp;
	/* note when task came back in */
	task->swap_stamp = sched_tick;
	if (task->swap_flags & TASK_SW_WANT_IN) {
		task->swap_flags &= ~TASK_SW_WANT_IN;
		thread_wakeup((event_t)&task->swap_state);
	}
	assert((task->swap_flags & TASK_SW_ELIGIBLE) == 0);
	task_unlock(task);
#if	TASK_SW_DEBUG
	task_swapper_lock();
	if (task_swap_debug && on_swapped_list(task)) {
		printf("task 0x%X on list at end of swap in\n", task);
	}
	task_swapper_unlock();
#endif	/* TASK_SW_DEBUG */
	/*
	 * Make the task eligible to be swapped again
	 */
	if (swappable)
		task_swapout_eligible(task);
	return(KERN_SUCCESS);
}
void wake_task_swapper(boolean_t now);	/* forward */

/*
 * wake_task_swapper: [exported]
 *
 * Wakes up task swapper if now == TRUE or if at least
 * task_swap_cycle_time has elapsed since the last call.
 *
 * NOTE: this function is not multithreaded, so if there is
 * more than one caller, it must be modified.
 */
void
wake_task_swapper(boolean_t now)
{
	/* last_task_swap_cycle may require locking */
	if (now ||
	    (sched_tick > (last_task_swap_cycle + task_swap_cycle_time))) {
		last_task_swap_cycle = sched_tick;
		if (task_swap_debug)
			printf("wake_task_swapper: waking swapper\n");
		thread_wakeup((event_t)&swapped_tasks);	/* poke swapper */
	}
}
task_t pick_intask(void);	/* forward */
/*
 * pick_intask:
 * returns a task to be swapped in, or TASK_NULL if nothing suitable is found.
 *
 * current algorithm: Return the task that has been swapped out the
 * longest, as long as it is > min_swap_time.  It will be dequeued
 * if actually swapped in.
 *
 * NOTE:**********************************************
 * task->swap_rss (the size when the task was swapped out) could be used to
 * further refine the selection.  Another possibility would be to look at
 * the state of the thread(s) to see if the task/threads would run if they
 * were swapped in.
 * ***************************************************
 *
 * Locking: no locks held upon entry and exit.
 */
task_t
pick_intask(void)
{
	register task_t task = TASK_NULL;

	task_swapper_lock();
	/* the oldest task is the first one */
	if (!queue_empty(&swapped_tasks)) {
		task = (task_t) queue_first(&swapped_tasks);
		assert(task != TASK_NULL);
		/* Make sure it's been out min_swap_time */
		if ((sched_tick - task->swap_stamp) < min_swap_time)
			task = TASK_NULL;
	}
	task_swapper_unlock();
	return(task);

#if	0	/* dead alternative, kept for reference */
	/*
	 * This code looks at the entire list of swapped tasks, but since
	 * it does not yet do anything but look at time swapped, we
	 * can simply use the fact that the queue is ordered, and take
	 * the first one off the queue.
	 */
	{
	task_t target_task = TASK_NULL;
	int tmp_time, time_swapped = 0;

	task = (task_t)queue_first(&swapped_tasks);
	while (!queue_end(&swapped_tasks, (queue_entry_t)task)) {
		task_lock(task);
		tmp_time = sched_tick - task->swap_stamp;
		if (tmp_time > min_swap_time && tmp_time > time_swapped) {
			target_task = task;
			time_swapped = tmp_time;
		}
		task_unlock(task);
		task = (task_t)queue_next(&task->swapped_tasks);
	}
	task_swapper_unlock();
	return(target_task);
	}
#endif
}
task_t pick_outtask(void);	/* forward */
/*
 * pick_outtask:
 * returns a task to be swapped out, with a reference on the task,
 * or NULL if no suitable task is found.
 *
 * current algorithm:
 *
 * Examine all eligible tasks.  While looking, use the first thread in
 * each task as an indication of the task's activity.  Count up
 * "active" threads (those either runnable or sleeping).  If the task
 * is active (by these criteria), swapped in, and resident
 * for at least min_res_time, then select the task with the largest
 * number of pages in memory.  If there are fewer
 * than min_active_tasks active tasks in the system, then don't
 * swap anything out (this avoids swapping out the only running task
 * in the system, for example).
 *
 * NOTE: the task selected will not be removed from the eligible list.
 *	 This means that it will be selected again if it is not swapped
 *	 out, where it is removed from the list.
 *
 * Locking: no locks held upon entry and exit.  Task_swapout_lock must be
 *	    taken before task locks.
 *
 * ***************************************************
 * This algorithm only examines the first thread in the task.  Currently, since
 * most swappable tasks in the system are single-threaded, this generalization
 * works reasonably well.  However, the algorithm should be changed
 * to consider all threads in the task if more multi-threaded tasks were used.
 * ***************************************************
 */
#if	TASK_SW_STATS
int inactive_task_count = 0;
int empty_task_count = 0;
#endif	/* TASK_SW_STATS */
task_t
pick_outtask(void)
{
	register task_t		task;
	register task_t		target_task = TASK_NULL;
	unsigned long		task_rss;
	unsigned long		target_rss = 0;
	boolean_t		active;
	int			nactive = 0;

	task_swapout_lock();
	if (queue_empty(&eligible_tasks)) {
		/* not likely to happen */
		task_swapout_unlock();
		return(TASK_NULL);
	}
	task = (task_t)queue_first(&eligible_tasks);
	while (!queue_end(&eligible_tasks, (queue_entry_t)task)) {
		int s;
		register thread_act_t thr_act;
		thread_t th;

		task_lock(task);
		/*
		 * Don't swap real-time tasks.
		 * XXX Should we enforce that or can we let really critical
		 * tasks use task_swappable() to make sure they never end up
		 * in the eligible list ?
		 */
		if (task->policy & POLICYCLASS_FIXEDPRI) {
			goto tryagain;
		}
		if (!task->active) {
			TASK_STATS_INCR(inactive_task_count);
			goto tryagain;
		}
		if (task->res_act_count == 0) {
			TASK_STATS_INCR(empty_task_count);
			goto tryagain;
		}
		assert(!queue_empty(&task->thr_acts));
		thr_act = (thread_act_t)queue_first(&task->thr_acts);
		active = FALSE;
		th = act_lock_thread(thr_act);
		s = splsched();
		if (th != THREAD_NULL)
			thread_lock(th);
		if ((th == THREAD_NULL) ||
		    (th->state == TH_RUN) ||
		    (th->state & TH_WAIT)) {
			/*
			 * thread is "active": either runnable
			 * or sleeping.  Count it and examine
			 * it further below.
			 */
			nactive++;
			active = TRUE;
		}
		if (th != THREAD_NULL)
			thread_unlock(th);
		splx(s);
		act_unlock_thread(thr_act);
		if (active &&
		    (task->swap_state == TASK_SW_IN) &&
		    ((sched_tick - task->swap_stamp) > min_res_time)) {
			long rescount = pmap_resident_count(task->map->pmap);
			/*
			 * thread must be "active", task must be swapped
			 * in and resident for at least min_res_time
			 */
#if 0
/* DEBUG Test round-robin strategy.  Picking biggest task could cause extreme
 * unfairness to such large interactive programs as xterm.  Instead, pick the
 * first task that has any pages resident:
 */
			if (rescount > 1) {
				task->ref_count++;
				target_task = task;
				task_unlock(task);
				task_swapout_unlock();
				return(target_task);
			}
#else
			if (rescount > target_rss) {
				/*
				 * task is not swapped, and it has the
				 * largest rss seen so far.
				 */
				task->ref_count++;	/* take a ref; task is locked */
				target_rss = rescount;
				assert(target_task != task);
				if (target_task != TASK_NULL)
					task_deallocate(target_task);
				target_task = task;
			}
#endif
		}
tryagain:
		task_unlock(task);
		task = (task_t)queue_next(&task->swapped_tasks);
	}
	task_swapout_unlock();
	/* only swap out if there are at least min_active_tasks */
	if (nactive < min_active_tasks) {
		if (target_task != TASK_NULL) {
			task_deallocate(target_task);
			target_task = TASK_NULL;
		}
	}
	return(target_task);
}
void print_pid(task_t task, unsigned long n1, unsigned long n2,
	       const char *comp, const char *inout);	/* forward */
/*
 * print_pid: debug output for a task being swapped in or out, showing
 * the page-count comparison that triggered the decision.
 */
void
print_pid(
	task_t task,
	unsigned long n1,
	unsigned long n2,
	const char *comp,
	const char *inout)
{
	long rescount;

	rescount = pmap_resident_count(task->map->pmap);
	printf("task_swapper: swapped %s task %x; %d %s %d; res=%d\n",
	       inout, task, n1, comp, n2, rescount);
}
/*
 * task_swapper: [exported]
 *
 * Executes as a separate kernel thread.
 */
#define	MAX_LOOP	3
void
task_swapper(void)
{
	task_t	outtask, intask;
	int timeout;
	int loopcnt = 0;
	boolean_t start_swapping;
	boolean_t stop_swapping;
	int local_page_free_avg;

	thread_swappable(current_act(), FALSE);
	stack_privilege(current_thread());

	local_page_free_avg = vm_page_free_avg;
	while (TRUE) {
		if (task_swap_debug)
			printf("task_swapper: top of loop; cnt = %d\n",loopcnt);

		intask = pick_intask();

		start_swapping = ((vm_pageout_rate_avg > swap_start_pageout_rate) ||
				  (vm_grab_rate_avg > max_grab_rate));
		stop_swapping = (vm_pageout_rate_avg < swap_stop_pageout_rate);

		/*
		 * If a lot of paging is going on, or another task should come
		 * in but memory is tight, find something to swap out and start
		 * it.  Don't swap any task out if task swapping is disabled.
		 * vm_page_queue_free_lock protects the vm globals.
		 */
		outtask = TASK_NULL;
		if (start_swapping ||
		    (!stop_swapping && intask &&
		     ((local_page_free_avg / AVE_SCALE) < vm_page_free_target))
		   ) {
			if (task_swap_enable &&
			    (outtask = pick_outtask()) &&
			    (task_swapout(outtask) == KERN_SUCCESS)) {
				unsigned long rss;

				if (task_swap_debug)
					print_pid(outtask,
						  local_page_free_avg / AVE_SCALE,
						  vm_page_free_target, "<", "out");
				rss = outtask->swap_rss;
				if (outtask->swap_nswap == 1)
					rss /= 2; /* divide by 2 if never out */
				local_page_free_avg += (rss/short_avg_interval) * AVE_SCALE;
			}
			if (outtask != TASK_NULL)
				task_deallocate(outtask);
		}

		/*
		 * If there is an eligible task to bring in and there are at
		 * least vm_page_free_target free pages, swap it in.  If task
		 * swapping has been disabled, bring the task in anyway.
		 */
		if (intask && ((local_page_free_avg / AVE_SCALE) >=
							vm_page_free_target ||
			       stop_swapping || !task_swap_enable)) {
			if (task_swapin(intask, FALSE) == KERN_SUCCESS) {
				unsigned long rss;

				if (task_swap_debug)
					print_pid(intask,
						  local_page_free_avg / AVE_SCALE,
						  vm_page_free_target, ">=", "in");
				rss = intask->swap_rss;
				if (intask->swap_nswap == 1)
					rss /= 2; /* divide by 2 if never out */
				local_page_free_avg -= (rss/short_avg_interval) * AVE_SCALE;
			}
		}

		/*
		 * Here we have to decide whether to continue swapping
		 * in and/or out before sleeping.  The decision should
		 * be made based on the previous action (swapin/out) and
		 * current system parameters, such as paging rates and
		 * demand.
		 * The function, compute_vm_averages, which does these
		 * calculations, depends on being called every second,
		 * so we can't just do the same thing.
		 */
		if (++loopcnt < MAX_LOOP)
			continue;

		/*
		 * Arrange to be awakened if paging is still heavy or there are
		 * any tasks partially or completely swapped out.  (Otherwise,
		 * the wakeup will come from the external trigger(s).)
		 */
		timeout = 0;
		if (start_swapping)
			timeout = task_swap_cycle_time;
		else {
			task_swapper_lock();
			if (!queue_empty(&swapped_tasks))
				timeout = min_swap_time;
			task_swapper_unlock();
		}
		assert_wait((event_t)&swapped_tasks, THREAD_UNINT);
		if (timeout) {
			if (task_swap_debug)
				printf("task_swapper: set timeout of %d\n",
				       timeout);
			thread_set_timeout(timeout, NSEC_PER_SEC);
		}
		if (task_swap_debug)
			printf("task_swapper: blocking\n");
		thread_block(THREAD_CONTINUE_NULL);
		if (timeout)
			thread_cancel_timeout(current_thread());
		/* reset locals */
		loopcnt = 0;
		local_page_free_avg = vm_page_free_avg;
	}
}
/*
 * ave(): compute a decaying average, assuming one sample per second
 * over an averaging interval of `time` seconds; the result stays in
 * AVE_SCALE fixed point.
 */
#define ave(smooth, cnt, time) \
	smooth = ((time - 1) * (smooth) + ((cnt) * AVE_SCALE)) / (time)
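
/*
 * Illustrative worked example (added commentary, not in the original):
 * with AVE_SCALE == 1024 and time == short_avg_interval == 5, a smoothed
 * value of 10*1024 fed a new sample cnt == 20 becomes
 *
 *	(4 * 10240 + 20 * 1024) / 5 = (40960 + 20480) / 5 = 12288,
 *
 * i.e. 12 pages/sec in fixed point: the average moves one fifth of the
 * way toward each new sample.
 */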
/*
 * We estimate the system paging load in more than one metric:
 *	1) the total number of calls into the function, vm_page_grab,
 *	   which allocates all page frames for real pages.
 *	2) the total number of pages paged in and out of paging files.
 *	   This is a measure of page cleaning and faulting from backing
 *	   store.
 *
 * When either metric passes a threshold, tasks are swapped out.
 */
long last_grab_count = 0;
long last_pageout_count = 0;
/*
 * compute_vm_averages: [exported]
 *
 * This function is to be called once a second to calculate average paging
 * demand and average numbers of free pages for use by the task swapper.
 * Can also be used to wake up task swapper at desired thresholds.
 *
 * NOTE: this function is single-threaded, and requires locking if
 * ever there are multiple callers.
 */
void
compute_vm_averages(void)
{
	extern unsigned long vm_page_grab_count;
	long grab_count, pageout_count;
	int i;

	ave(vm_page_free_avg, vm_page_free_count, short_avg_interval);
	ave(vm_page_free_longavg, vm_page_free_count, long_avg_interval);

	/*
	 * NOTE: the vm_page_grab_count and vm_stat structure are
	 * under control of vm_page_queue_free_lock.  We're simply reading
	 * memory here, and the numbers don't depend on each other, so
	 * no lock is taken.
	 */
	grab_count = vm_page_grab_count;
	pageout_count = 0;
	for (i = 0; i < NCPUS; i++) {
		pageout_count += vm_stat[i].pageouts;
	}

	ave(vm_pageout_rate_avg, pageout_count - last_pageout_count,
	    short_avg_interval);
	ave(vm_pageout_rate_longavg, pageout_count - last_pageout_count,
	    long_avg_interval);
	ave(vm_grab_rate_avg, grab_count - last_grab_count,
	    short_avg_interval);
	last_grab_count = grab_count;
	last_pageout_count = pageout_count;

	/*
	 * Adjust swap_{start,stop}_pageout_rate to the paging rate peak.
	 * This is an attempt to find the optimum paging rates at which
	 * to trigger task swapping on or off to regulate paging activity,
	 * depending on the hardware capacity.
	 */
	if (vm_pageout_rate_avg > vm_pageout_rate_peakavg) {
		unsigned int desired_max;

		vm_pageout_rate_peakavg = vm_pageout_rate_avg;
		swap_start_pageout_rate =
			vm_pageout_rate_peakavg * swap_pageout_high_water_mark / 100;
		swap_stop_pageout_rate =
			vm_pageout_rate_peakavg * swap_pageout_low_water_mark / 100;
	}

#if	TASK_SW_DEBUG
	/*
	 * For measurements, allow fixed values.
	 */
	if (fixed_swap_start_pageout_rate)
		swap_start_pageout_rate = fixed_swap_start_pageout_rate;
	if (fixed_swap_stop_pageout_rate)
		swap_stop_pageout_rate = fixed_swap_stop_pageout_rate;
#endif	/* TASK_SW_DEBUG */

#if	TASK_SW_DEBUG
	if (task_swap_stats)
		printf("vm_avgs: pageout_rate: %d %d (on/off: %d/%d); page_free: %d %d (tgt: %d)\n",
		       vm_pageout_rate_avg / AVE_SCALE,
		       vm_pageout_rate_longavg / AVE_SCALE,
		       swap_start_pageout_rate / AVE_SCALE,
		       swap_stop_pageout_rate / AVE_SCALE,
		       vm_page_free_avg / AVE_SCALE,
		       vm_page_free_longavg / AVE_SCALE,
		       vm_page_free_target);
#endif	/* TASK_SW_DEBUG */

	if (vm_page_free_avg / AVE_SCALE <= vm_page_free_target) {
		/*
		 * The following is a delicate attempt to balance the
		 * need for reasonably rapid response to system
		 * thrashing, with the equally important desire to
		 * prevent the onset of swapping simply because of a
		 * short burst of paging activity.
		 */
		if (((vm_pageout_rate_longavg > swap_stop_pageout_rate) &&
		     (vm_pageout_rate_avg > swap_start_pageout_rate)) ||
		    (vm_pageout_rate_avg > vm_pageout_rate_peakavg) ||
		    (vm_grab_rate_avg > max_grab_rate))
			wake_task_swapper(FALSE);
	} else /* page demand is low; should consider swapin */ {
		if (tasks_swapped_out != 0)
			wake_task_swapper(TRUE);
	}
}
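
/*
 * Illustrative note (assumption, not from this file): the periodic
 * scheduler/clock path is expected to drive this function, e.g.
 *
 *	once per second:
 *		compute_vm_averages();
 *
 * Calling it at any other period would skew the decaying averages,
 * since ave() assumes one sample per second over short_avg_interval
 * and long_avg_interval.
 */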
void
task_swapout_eligible(task_t task)
{
#if	TASK_SW_DEBUG
	task_swapper_lock();
	if (task_swap_debug && on_swapped_list(task)) {
		printf("swapout_eligible: task 0x%X on swapped list\n", task);
	}
	task_swapper_unlock();
#endif	/* TASK_SW_DEBUG */
	task_swapout_lock();
	task_lock(task);
#if	TASK_SW_DEBUG
	if (task->swap_flags & TASK_SW_ELIGIBLE) {
		printf("swapout_eligible: task 0x%X already eligible\n", task);
	}
#endif	/* TASK_SW_DEBUG */
	if ((task->swap_state == TASK_SW_IN) &&
	    ((task->swap_flags & TASK_SW_ELIGIBLE) == 0)) {
		queue_enter(&eligible_tasks, task, task_t, swapped_tasks);
		task->swap_flags |= TASK_SW_ELIGIBLE;
	}
	task_unlock(task);
	task_swapout_unlock();
}
void
task_swapout_ineligible(task_t task)
{
#if	TASK_SW_DEBUG
	task_swapper_lock();
	if (task_swap_debug && on_swapped_list(task)) {
		printf("swapout_ineligible: task 0x%X on swapped list\n", task);
	}
	task_swapper_unlock();
#endif	/* TASK_SW_DEBUG */
	task_swapout_lock();
	task_lock(task);
#if	TASK_SW_DEBUG
	if (!(task->swap_flags & TASK_SW_ELIGIBLE))
		printf("swapout_ineligible: task 0x%X already inel.\n", task);
#endif	/* TASK_SW_DEBUG */
	if ((task->swap_state != TASK_SW_IN) &&
	    (task->swap_flags & TASK_SW_ELIGIBLE)) {
		queue_remove(&eligible_tasks, task, task_t, swapped_tasks);
		task->swap_flags &= ~TASK_SW_ELIGIBLE;
	}
	task_unlock(task);
	task_swapout_unlock();
}
int task_swap_ast_aborted = 0;

/*
 *	Process an AST_SWAPOUT.
 */
void
swapout_ast(void)
{
	spl_t		s;
	thread_act_t	act;
	thread_t	thread;

	act = current_act();

	/*
	 * Task is being swapped out.  First mark it as suspended
	 * and halted, then call thread_swapout_enqueue to put
	 * the thread on the queue for task_swap_swapout_threads
	 * to swap out the thread.
	 */
	/*
	 * Don't swap unswappable threads
	 */
	thread = act_lock_thread(act);
	s = splsched();
	thread_lock(thread);
	if ((act->ast & AST_SWAPOUT) == 0) {
		/*
		 * Race with task_swapin.  Abort swapout.
		 */
		task_swap_ast_aborted++;	/* not locked XXX */
		thread_unlock(thread);
		splx(s);
		act_unlock_thread(act);
	} else if (act->swap_state == TH_SW_IN) {
		/*
		 * Mark swap_state as TH_SW_TASK_SWAPPING to avoid
		 * race with thread swapper, which will only
		 * swap thread if swap_state is TH_SW_IN.
		 * This way, the thread can only be swapped by
		 * the task swapping mechanism.
		 */
		act->swap_state |= TH_SW_TASK_SWAPPING;
		/* assert(act->suspend_count == 0); XXX ? */
		thread_unlock(thread);
		if (act->suspend_count++ == 0)	/* inline thread_hold */
			install_special_handler(act);
		/* self->state |= TH_HALTED; */
		thread_ast_clear(act, AST_SWAPOUT);
		/*
		 * Initialize the swap_queue fields to allow an extra
		 * queue_remove() in task_swapin if we lose the race
		 * (task_swapin can be called before we complete
		 * thread_swapout_enqueue).
		 */
		queue_init((queue_t) &act->swap_queue);
		splx(s);
		act_unlock_thread(act);
		/* this must be called at normal interrupt level */
		thread_swapout_enqueue(act);
	} else {
		/* thread isn't swappable; continue running */
		assert(act->swap_state == TH_SW_UNSWAPPABLE);
		thread_unlock(thread);
		thread_ast_clear(act, AST_SWAPOUT);
		splx(s);
		act_unlock_thread(act);
	}
}

#endif	/* TASK_SWAPPER */