+/*
+ * Re-evaluate the throttle state of 'info' and, when some tier is
+ * still inside a throttle window, (re)arm info->throttle_timer_call
+ * for the earliest pending I/O period deadline.
+ *
+ * on entry the throttle_lock is held...
+ * this function is responsible for taking
+ * and dropping the reference on the info
+ * structure which will keep it from going
+ * away while the timer is running if it
+ * happens to have been dynamically allocated by
+ * a network filesystem kext which is now trying
+ * to free it
+ * (as written here, the reference is taken below when the timer is
+ * armed; the matching throttle_info_rel is issued by throttle_timer)
+ *
+ * info            - throttle state to examine and update
+ * update_io_count - when TRUE, open a new I/O period: snapshot
+ *                   throttle_io_count, bump throttle_io_period_num,
+ *                   restamp the period start for levels 'wakelevel'
+ *                   down to THROTTLE_LEVEL_THROTTLED and set
+ *                   throttle_min_timer_deadline to now plus the
+ *                   THROTTLE_LEVEL_THROTTLED period
+ * wakelevel       - highest level to restamp; only read when
+ *                   update_io_count is TRUE
+ *
+ * Returns the value of throttle_level at the point the tier scan
+ * stopped: the tier at which a throttled condition was detected, or
+ * THROTTLE_LEVEL_END when nothing is throttled.
+ */
+static uint32_t
+throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
+{
+ struct timeval elapsed;
+ struct timeval now;
+ struct timeval period;
+ uint64_t elapsed_msecs;
+ int throttle_level;
+ int level;
+ int msecs;
+ boolean_t throttled = FALSE;
+ boolean_t need_timer = FALSE;
+ /* sample uptime once; every comparison below uses this value */
+ microuptime(&now);
+
+ if (update_io_count == TRUE) {
+ info->throttle_io_count_begin = info->throttle_io_count;
+ info->throttle_io_period_num++;
+ /* restamp the period start for wakelevel down to THROTTLE_LEVEL_THROTTLED */
+ while (wakelevel >= THROTTLE_LEVEL_THROTTLED)
+ info->throttle_start_IO_period_timestamp[wakelevel--] = now;
+ /* minimum deadline = now + the THROTTLE_LEVEL_THROTTLED I/O period */
+ info->throttle_min_timer_deadline = now;
+
+ msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
+ period.tv_sec = msecs / 1000;
+ period.tv_usec = (msecs % 1000) * 1000;
+
+ timevaladd(&info->throttle_min_timer_deadline, &period);
+ }
+ for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
+ /* msecs elapsed since this tier's throttle window started */
+ elapsed = now;
+ timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
+ elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
+
+ for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
+ /* only the first higher-numbered level with queued threads is examined */
+ if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
+
+ if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[throttle_level]) {
+ /*
+ * we had an I/O occur at a higher priority tier within
+ * this tier's throttle window
+ */
+ throttled = TRUE;
+ }
+ /*
+ * we assume that the windows are the same or longer
+ * as we drop through the throttling tiers... thus
+ * we can stop looking once we run into a tier with
+ * threads to schedule regardless of whether it's
+ * still in its throttling window or not
+ */
+ break;
+ }
+ }
+ if (throttled == TRUE)
+ break;
+ }
+ if (throttled == TRUE) {
+ uint64_t deadline = 0;
+ struct timeval target;
+ struct timeval min_target;
+
+ /*
+ * we've got at least one tier still in a throttled window
+ * so we need a timer running... compute the next deadline
+ * and schedule it
+ *
+ * the scan above stopped on a non-empty list at a level
+ * greater than throttle_level, so the loop below assigns
+ * min_target (and sets need_timer) at least once
+ */
+ for (level = throttle_level+1; level <= THROTTLE_LEVEL_END; level++) {
+ /* levels without queued threads contribute no deadline */
+ if (TAILQ_EMPTY(&info->throttle_uthlist[level]))
+ continue;
+ /* deadline = start of this level's I/O period + its period length */
+ target = info->throttle_start_IO_period_timestamp[level];
+
+ msecs = info->throttle_io_periods[level];
+ period.tv_sec = msecs / 1000;
+ period.tv_usec = (msecs % 1000) * 1000;
+
+ timevaladd(&target, &period);
+ /* keep the earliest deadline seen so far */
+ if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
+ min_target = target;
+ need_timer = TRUE;
+ }
+ }
+ if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) { /* minimum deadline still in the future */
+ if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >))
+ min_target = info->throttle_min_timer_deadline; /* never fire earlier than the minimum deadline */
+ }
+
+ if (info->throttle_timer_active) { /* a timer is already armed: cancel it before re-arming */
+ if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
+ /*
+ * couldn't kill the timer because it's already
+ * been dispatched, so don't try to start a new
+ * one... once we drop the lock, the timer will
+ * proceed and eventually re-run this function
+ */
+ need_timer = FALSE;
+ } else
+ info->throttle_timer_active = 0;
+ }
+ if (need_timer == TRUE) {
+ /*
+ * This is defined as an int (32-bit) rather than a 64-bit
+ * value because it would need a really big period in the
+ * order of ~500 days to overflow this. So, we let this be
+ * 32-bit which allows us to use the clock_interval_to_deadline()
+ * routine.
+ *
+ * NOTE(review): INT_MAX msecs is roughly 24.8 days, so the
+ * "~500 days" figure above looks overstated -- confirm.
+ */
+ int target_msecs;
+
+ if (info->throttle_timer_ref == 0) {
+ /*
+ * take a reference for the timer
+ */
+ throttle_info_ref(info);
+
+ info->throttle_timer_ref = 1;
+ }
+ elapsed = min_target; /* interval until the deadline = min_target - now */
+ timevalsub(&elapsed, &now);
+ target_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
+
+ if (target_msecs <= 0) {
+ /*
+ * we may have computed a deadline slightly in the past
+ * due to various factors... if so, just set the timer
+ * to go off in the near future (we don't need to be precise)
+ */
+ target_msecs = 1;
+ }
+ clock_interval_to_deadline(target_msecs, 1000000, &deadline); /* presumably 1000000 is the ns-per-msec scale factor -- confirm */
+
+ thread_call_enter_delayed(info->throttle_timer_call, deadline);
+ info->throttle_timer_active = 1;
+ }
+ }
+ return (throttle_level);
+}
+
+/*
+ * Handler for info->throttle_timer_call (registered with
+ * thread_call_allocate in throttle_init).
+ *
+ * Under the throttle_lock it:
+ *  - marks the timer as no longer armed,
+ *  - if the THROTTLE_LEVEL_THROTTLED I/O period has run out, looks
+ *    for one level whose own period has expired and which has a
+ *    queued thread, starting at throttle_next_wake_level and walking
+ *    downward with wrap-around, and dequeues the head of that list,
+ *  - calls throttle_timer_start to re-evaluate / re-arm the timer,
+ *  - wakes the dequeued thread plus every thread queued on levels
+ *    THROTTLE_LEVEL_THROTTLED through the level returned by
+ *    throttle_timer_start,
+ *  - gives up the timer's reference on 'info' when the timer was
+ *    not re-armed; the release itself happens after the unlock.
+ */
+static void
+throttle_timer(struct _throttle_io_info_t *info)
+{
+ uthread_t ut, utlist;
+ struct timeval elapsed;
+ struct timeval now;
+ uint64_t elapsed_msecs;
+ int throttle_level;
+ int level;
+ int wake_level;
+ caddr_t wake_address = NULL;
+ boolean_t update_io_count = FALSE;
+ boolean_t need_wakeup = FALSE;
+ boolean_t need_release = FALSE;
+
+ ut = NULL;
+ lck_mtx_lock(&info->throttle_lock);
+ /* this invocation consumes the armed timer */
+ info->throttle_timer_active = 0;
+ microuptime(&now);
+ /* msecs since the THROTTLE_LEVEL_THROTTLED I/O period started */
+ elapsed = now;
+ timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
+ elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
+
+ if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
+ /* resume the scan at the level recorded by the previous wakeup */
+ wake_level = info->throttle_next_wake_level;
+ /* 'level' only counts probes; wake_level is the level actually examined */
+ for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
+
+ elapsed = now;
+ timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
+ elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
+
+ if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
+ /*
+ * we're closing out the current IO period...
+ * if we have a waiting thread, wake it up
+ * after we have reset the I/O window info
+ */
+ need_wakeup = TRUE;
+ update_io_count = TRUE;
+ /* the next timer pass starts one level below the one woken now */
+ info->throttle_next_wake_level = wake_level - 1;
+
+ if (info->throttle_next_wake_level == THROTTLE_LEVEL_START)
+ info->throttle_next_wake_level = THROTTLE_LEVEL_END;
+
+ break;
+ }
+ wake_level--;
+
+ if (wake_level == THROTTLE_LEVEL_START)
+ wake_level = THROTTLE_LEVEL_END; /* wrap around to the top level */
+ }
+ }
+ if (need_wakeup == TRUE) {
+ if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
+ /* dequeue the head waiter now; wakeup() is issued after the timer is re-evaluated */
+ ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
+ TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
+ ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
+ ut->uu_is_throttled = FALSE;
+
+ wake_address = (caddr_t)&ut->uu_on_throttlelist;
+ }
+ } else
+ wake_level = THROTTLE_LEVEL_START; /* not read by throttle_timer_start: update_io_count is FALSE here */
+
+ throttle_level = throttle_timer_start(info, update_io_count, wake_level);
+
+ if (wake_address != NULL)
+ wakeup(wake_address);
+ /* wake every waiter on levels up to and including the returned throttle_level */
+ for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
+
+ TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
+
+ TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
+ ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
+ ut->uu_is_throttled = FALSE;
+
+ wakeup(&ut->uu_on_throttlelist);
+ }
+ }
+ if (info->throttle_timer_active == 0 && info->throttle_timer_ref) { /* timer was not re-armed: give back its reference */
+ info->throttle_timer_ref = 0;
+ need_release = TRUE;
+ }
+ lck_mtx_unlock(&info->throttle_lock);
+ /* the reference is released only after the lock has been dropped */
+ if (need_release == TRUE)
+ throttle_info_rel(info);
+}
+
+/*
+ * Queue 'ut' on info->throttle_uthlist[mylevel], at the tail when
+ * insert_tail is TRUE and at the head otherwise.
+ *
+ * When the list was empty beforehand, the level's I/O period start
+ * is seeded from throttle_last_IO_timestamp[mylevel] and
+ * throttle_timer_start is called; if that returns
+ * THROTTLE_LEVEL_END the thread is taken off the list again.
+ *
+ * Returns the level reported by throttle_timer_start, or
+ * THROTTLE_LEVEL_START when the timer was not touched.
+ *
+ * NOTE(review): presumably called with info->throttle_lock held,
+ * since throttle_timer_start expects that -- confirm with callers.
+ */
+static int
+throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
+{
+ boolean_t start_timer = FALSE;
+ int level = THROTTLE_LEVEL_START;
+ /* first waiter at this level: its I/O period starts at the level's last I/O timestamp */
+ if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
+ info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
+ start_timer = TRUE;
+ }
+
+ if (insert_tail == TRUE)
+ TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
+ else
+ TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
+
+ ut->uu_on_throttlelist = mylevel;
+
+ if (start_timer == TRUE) {
+ /* we may need to start or rearm the timer */
+ level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);
+ /* THROTTLE_LEVEL_END means no tier was found throttled, so undo the enqueue */
+ if (level == THROTTLE_LEVEL_END) {
+ if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
+ TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
+
+ ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
+ }
+ }
+ }
+ return (level);
+}
+
+/*
+ * Apply platform overrides to the global throttle window table.
+ *
+ * throttle_windows_msecs[] keeps its existing (global default)
+ * values unless a device-tree default is present for a tier, and a
+ * boot-arg for the same tier, when present, replaces that in turn.
+ * All values are specified in msecs.
+ */
+static void
+throttle_init_throttle_window(void)
+{
+	int window_msecs;
+
+	/* first pass: device-tree properties replace the global defaults */
+	if (PE_get_default("kern.io_throttle_window_tier1", &window_msecs, sizeof(window_msecs))) {
+		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = window_msecs;
+	}
+	if (PE_get_default("kern.io_throttle_window_tier2", &window_msecs, sizeof(window_msecs))) {
+		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = window_msecs;
+	}
+	if (PE_get_default("kern.io_throttle_window_tier3", &window_msecs, sizeof(window_msecs))) {
+		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = window_msecs;
+	}
+
+	/* second pass: boot-args have the final say */
+	if (PE_parse_boot_argn("io_throttle_window_tier1", &window_msecs, sizeof(window_msecs))) {
+		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = window_msecs;
+	}
+	if (PE_parse_boot_argn("io_throttle_window_tier2", &window_msecs, sizeof(window_msecs))) {
+		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = window_msecs;
+	}
+	if (PE_parse_boot_argn("io_throttle_window_tier3", &window_msecs, sizeof(window_msecs))) {
+		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = window_msecs;
+	}
+}
+
+/*
+ * Select the I/O period table for 'info' and apply platform
+ * overrides to it.
+ *
+ * The SSD table is used only when isssd is TRUE and the device is
+ * not flagged throttle_is_fusion_with_priority; every other case
+ * gets the regular table.  Device-tree defaults and then boot-args
+ * may replace the tier1..tier3 entries.  The overrides are stored
+ * through info->throttle_io_periods, i.e. into whichever table was
+ * just selected.  All values are specified in msecs.
+ */
+static void
+throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
+{
+	int period_msecs;
+
+	/* pick the default table */
+	if ((isssd != TRUE) || (info->throttle_is_fusion_with_priority != 0)) {
+		info->throttle_io_periods = &throttle_io_period_msecs[0];
+	} else {
+		info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
+	}
+
+	/* first pass: device-tree properties replace the defaults */
+	if (PE_get_default("kern.io_throttle_period_tier1", &period_msecs, sizeof(period_msecs))) {
+		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = period_msecs;
+	}
+	if (PE_get_default("kern.io_throttle_period_tier2", &period_msecs, sizeof(period_msecs))) {
+		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = period_msecs;
+	}
+	if (PE_get_default("kern.io_throttle_period_tier3", &period_msecs, sizeof(period_msecs))) {
+		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = period_msecs;
+	}
+
+	/* second pass: boot-args have the final say */
+	if (PE_parse_boot_argn("io_throttle_period_tier1", &period_msecs, sizeof(period_msecs))) {
+		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = period_msecs;
+	}
+	if (PE_parse_boot_argn("io_throttle_period_tier2", &period_msecs, sizeof(period_msecs))) {
+		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = period_msecs;
+	}
+	if (PE_parse_boot_argn("io_throttle_period_tier3", &period_msecs, sizeof(period_msecs))) {
+		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = period_msecs;
+	}
+}
+
+#if CONFIG_IOSCHED
+extern void vm_io_reprioritize_init(void); /* called from throttle_init when iosched_enabled is non-zero */
+int iosched_enabled = 1; /* on by default; throttle_init replaces it with the "iosched" boot-arg when one is given */
+#endif
+
+/*
+ * One-time setup of the I/O throttling state.
+ *
+ * Creates the lock group and lock attributes, applies the throttle
+ * window overrides, and then initialises each of the
+ * LOWPRI_MAX_NUM_DEV entries of _throttle_io_info: its mutex, its
+ * timer thread call (throttle_timer with the entry as parameter),
+ * its per-level wait lists and counters, and its flags.
+ * With CONFIG_IOSCHED, the "iosched" boot-arg may override
+ * iosched_enabled before I/O reprioritization is initialised.
+ */
+void
+throttle_init(void)
+{
+	struct _throttle_io_info_t *dev_info;
+	int dev;
+	int lvl;
+#if CONFIG_IOSCHED
+	int iosched_arg;
+#endif
+
+	/* lock group attribute first, then the group built from it */
+	throttle_lock_grp_attr = lck_grp_attr_alloc_init();
+	throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr);
+
+	/* pick up device-tree / boot-arg overrides for the throttle windows */
+	throttle_init_throttle_window();
+
+	/* lock attribute shared by every per-device mutex */
+	throttle_lock_attr = lck_attr_alloc_init();
+
+	for (dev = 0; dev < LOWPRI_MAX_NUM_DEV; dev++) {
+		dev_info = &_throttle_io_info[dev];
+
+		lck_mtx_init(&dev_info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
+		dev_info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)dev_info);
+
+		/* every level starts with an empty wait list and zeroed bookkeeping */
+		for (lvl = 0; lvl <= THROTTLE_LEVEL_END; lvl++) {
+			TAILQ_INIT(&dev_info->throttle_uthlist[lvl]);
+			dev_info->throttle_last_IO_pid[lvl] = 0;
+			dev_info->throttle_inflight_count[lvl] = 0;
+		}
+		dev_info->throttle_next_wake_level = THROTTLE_LEVEL_END;
+		dev_info->throttle_disabled = 0;
+		dev_info->throttle_is_fusion_with_priority = 0;
+	}
+#if CONFIG_IOSCHED
+	if (PE_parse_boot_argn("iosched", &iosched_arg, sizeof(iosched_arg))) {
+		iosched_enabled = iosched_arg;
+	}
+	if (iosched_enabled) {
+		/* bring up the I/O reprioritization mechanism */
+		vm_io_reprioritize_init();
+	}
+#endif
+}
+
+/*
+ * Turn low-priority I/O throttling on or off.
+ *
+ * flag - THROTTLE_IO_ENABLE sets lowpri_throttle_enabled to 1,
+ *        THROTTLE_IO_DISABLE sets it to 0; any other value leaves
+ *        the setting untouched.
+ */
+void
+sys_override_io_throttle(int flag)
+{
+	if (flag == THROTTLE_IO_ENABLE) {
+		lowpri_throttle_enabled = 1;
+	}
+
+	if (flag == THROTTLE_IO_DISABLE) {
+		lowpri_throttle_enabled = 0;
+	}
+}
+
+int rethrottle_wakeups = 0; /* number of wakeups issued by rethrottle_thread */
+
+/*
+ * Re-evaluate the throttle level of 'ut' and, if the thread is
+ * currently blocked in a throttle at a different level, wake it up
+ * so that it can re-queue itself; if it is not blocked, only record
+ * that a rethrottle was requested (uu_was_rethrottled).
+ *
+ * the uu_rethrottle_lock is used to synchronize this function
+ * with "throttle_lowpri_io" which is where a throttled thread
+ * will block... that function will grab this lock before beginning
+ * its decision making process concerning the need to block, and
+ * hold it through the assert_wait. When that thread is awakened
+ * for any reason (timer or rethrottle), it will reacquire the
+ * uu_rethrottle_lock before determining if it really is ok for
+ * it to now run. This is the point at which the thread could
+ * enter a different throttling queue and reblock or return from
+ * the throttle w/o having waited out its entire throttle if
+ * the rethrottle has now moved it out of any currently
+ * active throttle window.
+ *
+ *
+ * NOTES:
+ * 1 - This may be called with the task lock held.
+ * 2 - This may be called with preemption and interrupts disabled
+ * in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
+ * 3 - This cannot safely dereference uu_throttle_info, as it may
+ * get deallocated out from under us
+ */
+
+void
+rethrottle_thread(uthread_t ut)
+{
+ /*
+ * If uthread doesn't have throttle state, then there's no chance
+ * of it needing a rethrottle.
+ */
+ if (ut->uu_throttle_info == NULL) /* pointer is only compared, never dereferenced (see note 3) */
+ return;
+ /* interrupts stay off while the spin lock is held; the previous state is restored on exit */
+ boolean_t s = ml_set_interrupts_enabled(FALSE);
+ lck_spin_lock(&ut->uu_rethrottle_lock);
+ /* not blocked right now: just record that a rethrottle was requested */
+ if (ut->uu_is_throttled == FALSE)
+ ut->uu_was_rethrottled = TRUE;
+ else {
+ int my_new_level = throttle_get_thread_throttle_level(ut);
+
+ if (my_new_level != ut->uu_on_throttlelist) {
+ /*
+ * ut is currently blocked (as indicated by
+ * ut->uu_is_throttled == TRUE)
+ * and we're changing its throttle level, so
+ * we need to wake it up.
+ */
+ ut->uu_is_throttled = FALSE;
+ wakeup(&ut->uu_on_throttlelist);
+
+ rethrottle_wakeups++;
+ KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, my_new_level, 0, 0);
+ }
+ }
+ lck_spin_unlock(&ut->uu_rethrottle_lock);
+ ml_set_interrupts_enabled(s);
+}
+
+