+/*
+ * Copyright (c) 2007 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/affinity.h>
+#include <kern/task.h>
+#include <kern/kalloc.h>
+#include <machine/cpu_affinity.h>
+
+/*
+ * Affinity involves 2 objects:
+ * - affinity namespace:
+ * shared by a task family, this controls affinity tag lookup and
+ * allocation; it anchors all affinity sets in one namespace
+ * - affinity set:
+ * anchors all threads with membership of this affinity set
+ * and which share an affinity tag in the owning namespace.
+ *
+ * Locking:
+ * - The task lock protects the creation of an affinity namespace.
+ * - The affinity namespace mutex protects the inheritance of a namespace
+ * and its thread membership. This includes its destruction when the task
+ * reference count goes to zero.
+ * - The thread mutex protects a thread's affinity set membership, but in
+ * addition, the thread_lock is taken to write thread->affinity_set since this
+ * field (representing the active affinity set) is read by the scheduler.
+ *
+ * The lock ordering is: task lock, thread mutex, namespace mutex, thread lock.
+ */
+
+/*
+ * Debug tracing. When AFFINITY_DEBUG evaluates non-zero, DBG() forwards its
+ * arguments to kprintf() with "DBG: " pasted in front of the format string
+ * (so the first argument must be a string literal); otherwise DBG() expands
+ * to nothing and its arguments are not evaluated.
+ */
+#if AFFINITY_DEBUG
+#define DBG(x...) kprintf("DBG: " x)
+#else
+#define DBG(x...)
+#endif
+
+/*
+ * Affinity namespace: the object a task's affinity_space field points at.
+ * It anchors every affinity set placed in the namespace and counts the
+ * tasks sharing it.
+ */
+struct affinity_space {
+ /* Mutex taken around every access to the two fields below in this file. */
+ mutex_t aspc_lock;
+ /*
+ * Number of tasks using this namespace: set to 1 by affinity_space_alloc(),
+ * incremented by task_affinity_create() and decremented by
+ * task_affinity_deallocate(), which frees the namespace at zero.
+ */
+ uint32_t aspc_task_count;
+ /* Queue of affinity sets in this namespace, linked through aset_affinities. */
+ queue_head_t aspc_affinities;
+};
+typedef struct affinity_space *affinity_space_t;
+
+/*
+ * Forward declarations of the file-local helpers defined further down:
+ * allocation/release of namespaces and sets, tag lookup, processor-set
+ * placement, and adding/removing a thread to/from a set.
+ */
+static affinity_space_t affinity_space_alloc(void);
+static void affinity_space_free(affinity_space_t aspc);
+static affinity_set_t affinity_set_alloc(void);
+static void affinity_set_free(affinity_set_t aset);
+static affinity_set_t affinity_set_find(affinity_space_t aspc, uint32_t tag);
+static void affinity_set_place(affinity_space_t aspc, affinity_set_t aset);
+static void affinity_set_add(affinity_set_t aset, thread_t thread);
+static affinity_set_t affinity_set_remove(affinity_set_t aset, thread_t thread);
+
+/*
+ * The following globals may be modified by the sysctls
+ * kern.affinity_sets_enabled - disables hinting if cleared
+ * kern.affinity_sets_mapping - controls cache distribution policy
+ * See bsd/kern_sysctl.c
+ *
+ * Within this file:
+ * - affinity_sets_enabled is read by affinity_set_add(); when it is false
+ * the thread's affinity_set field is set to NULL instead of the set the
+ * thread was queued on.
+ * - affinity_sets_mapping is read by affinity_set_place(); 0 starts the
+ * search for the least occupied CPU affinity at index 0, any other value
+ * starts it at an index derived from the namespace address.
+ */
+boolean_t affinity_sets_enabled = TRUE;
+int affinity_sets_mapping = 1;
+
+/*
+ * thread_affinity_is_supported()
+ * Report whether the machine layer exposes at least one CPU affinity set,
+ * i.e. whether ml_get_max_affinity_sets() returns a non-zero count.
+ */
+boolean_t
+thread_affinity_is_supported(void)
+{
+	unsigned int	max_sets = ml_get_max_affinity_sets();
+
+	return (max_sets != 0);
+}
+
+
+/*
+ * thread_affinity_get()
+ * Look up the tag of the affinity set the thread currently belongs to.
+ * Returns THREAD_AFFINITY_TAG_NULL when the thread has no affinity set.
+ * Called with the thread mutex held.
+ */
+uint32_t
+thread_affinity_get(thread_t thread)
+{
+	affinity_set_t	aset = thread->affinity_set;
+
+	if (aset == NULL)
+		return THREAD_AFFINITY_TAG_NULL;
+
+	return aset->aset_tag;
+}
+
+
+/*
+ * thread_affinity_set()
+ * Place a thread in an affinity set identified by a tag.
+ * Called with thread referenced but not locked.
+ *
+ * The thread is first removed from whatever set it currently belongs to.
+ * If tag is THREAD_AFFINITY_TAG_NULL nothing further happens; otherwise the
+ * thread joins the set in its task's namespace that carries the tag, and a
+ * set is created (or the just-emptied one is recycled) and placed when no
+ * such set exists yet. The task's namespace is created on first use.
+ *
+ * Returns:
+ * KERN_SUCCESS - membership updated
+ * KERN_RESOURCE_SHORTAGE - a namespace or affinity set could not be allocated
+ * KERN_TERMINATED - the thread is no longer active
+ *
+ * NOTE(review): if allocating a new set fails, the thread has already been
+ * removed from its previous set and is left with no affinity set.
+ */
+kern_return_t
+thread_affinity_set(thread_t thread, uint32_t tag)
+{
+ affinity_set_t aset;
+ affinity_set_t empty_aset = NULL;
+ affinity_space_t aspc;
+ affinity_space_t new_aspc = NULL;
+
+ DBG("thread_affinity_set(%p,%u)\n", thread, tag);
+
+ /*
+ * Make sure the task has a namespace. The allocation is done with the
+ * task lock dropped, so the field is re-checked after relocking; if a
+ * namespace was installed in the meantime, ours is freed below.
+ */
+ task_lock(thread->task);
+ aspc = thread->task->affinity_space;
+ if (aspc == NULL) {
+ task_unlock(thread->task);
+ new_aspc = affinity_space_alloc();
+ if (new_aspc == NULL)
+ return KERN_RESOURCE_SHORTAGE;
+ task_lock(thread->task);
+ if (thread->task->affinity_space == NULL) {
+ thread->task->affinity_space = new_aspc;
+ new_aspc = NULL;
+ }
+ aspc = thread->task->affinity_space;
+ }
+ task_unlock(thread->task);
+ /* Still non-NULL only if our freshly allocated namespace went unused. */
+ if (new_aspc)
+ affinity_space_free(new_aspc);
+
+ thread_mtx_lock(thread);
+ if (!thread->active) {
+ /* Beaten to lock and the thread is dead */
+ thread_mtx_unlock(thread);
+ return KERN_TERMINATED;
+ }
+
+ /* Thread mutex first, then the namespace mutex. */
+ mutex_lock(&aspc->aspc_lock);
+ aset = thread->affinity_set;
+ if (aset != NULL) {
+ /*
+ * Remove thread from current affinity set
+ */
+ DBG("thread_affinity_set(%p,%u) removing from aset %p\n",
+ thread, tag, aset);
+ /* Non-NULL result: the old set is now empty and unlinked from the namespace. */
+ empty_aset = affinity_set_remove(aset, thread);
+ }
+
+ if (tag != THREAD_AFFINITY_TAG_NULL) {
+ aset = affinity_set_find(aspc, tag);
+ if (aset != NULL) {
+ /*
+ * Add thread to existing affinity set
+ */
+ DBG("thread_affinity_set(%p,%u) found aset %p\n",
+ thread, tag, aset);
+ } else {
+ /*
+ * Use the new affinity set, add this thread
+ * and place it in a suitable processor set.
+ */
+ if (empty_aset != NULL) {
+ /* Recycle the set that was just emptied instead of allocating. */
+ aset = empty_aset;
+ empty_aset = NULL;
+ } else {
+ aset = affinity_set_alloc();
+ if (aset == NULL) {
+ mutex_unlock(&aspc->aspc_lock);
+ thread_mtx_unlock(thread);
+ return KERN_RESOURCE_SHORTAGE;
+ }
+ }
+ DBG("thread_affinity_set(%p,%u) (re-)using aset %p\n",
+ thread, tag, aset);
+ aset->aset_tag = tag;
+ affinity_set_place(aspc, aset);
+ }
+ affinity_set_add(aset, thread);
+ }
+
+ mutex_unlock(&aspc->aspc_lock);
+ thread_mtx_unlock(thread);
+
+ /*
+ * If the thread's previous set became empty and was not recycled
+ * for the new tag, free it here, after both locks are dropped.
+ */
+ if (empty_aset != NULL)
+ affinity_set_free(empty_aset);
+
+ /*
+ * When a thread changes its own affinity, block here.
+ * NOTE(review): presumably so the scheduler can act on the new
+ * affinity right away - confirm.
+ */
+ if (thread == current_thread())
+ thread_block(THREAD_CONTINUE_NULL);
+
+ return KERN_SUCCESS;
+}
+
+/*
+ * task_affinity_create()
+ * Share the parent task's affinity namespace with a child task.
+ * Called from task create. The parent is expected to own a namespace
+ * already (asserted).
+ */
+void
+task_affinity_create(task_t parent_task, task_t child_task)
+{
+	affinity_space_t	aspc;
+
+	aspc = parent_task->affinity_space;
+	DBG("task_affinity_create(%p,%p)\n", parent_task, child_task);
+	assert(aspc);
+
+	/*
+	 * Under the namespace mutex, hand the shared namespace to the
+	 * child and account for it in the task reference count.
+	 */
+	mutex_lock(&aspc->aspc_lock);
+	child_task->affinity_space = aspc;
+	aspc->aspc_task_count += 1;
+	mutex_unlock(&aspc->aspc_lock);
+}
+
+/*
+ * task_affinity_deallocate()
+ * Drop the task's reference on its affinity namespace and free the
+ * namespace when this was the last task using it.
+ * Called from task_deallocate() when there's a namespace to dereference.
+ */
+void
+task_affinity_deallocate(task_t task)
+{
+	affinity_space_t	aspc = task->affinity_space;
+	boolean_t		last_user;
+
+	DBG("task_affinity_deallocate(%p) aspc %p task_count %d\n",
+	    task, aspc, aspc->aspc_task_count);
+
+	mutex_lock(&aspc->aspc_lock);
+	aspc->aspc_task_count--;
+	last_user = (aspc->aspc_task_count == 0);
+	if (last_user) {
+		/* No affinity sets may remain once the last task is gone. */
+		assert(queue_empty(&aspc->aspc_affinities));
+	}
+	mutex_unlock(&aspc->aspc_lock);
+
+	/* The namespace is released only after its mutex has been dropped. */
+	if (last_user)
+		affinity_space_free(aspc);
+}
+
+/*
+ * task_affinity_info()
+ * Fill in a task_affinity_tag_info record for the task: the number of
+ * affinity sets in its namespace, the smallest and largest tag among them,
+ * and the number of tasks sharing the namespace. A task without a namespace
+ * reports zero counts and THREAD_AFFINITY_TAG_NULL for min and max.
+ * *task_info_count is set to TASK_AFFINITY_TAG_INFO_COUNT.
+ * Always returns KERN_SUCCESS.
+ *
+ * NOTE(review): the incoming *task_info_count is not examined here, so the
+ * caller presumably verifies that task_info_out is large enough - confirm.
+ */
+kern_return_t
+task_affinity_info(
+	task_t			task,
+	task_info_t		task_info_out,
+	mach_msg_type_number_t	*task_info_count)
+{
+	affinity_space_t		aspc;
+	affinity_set_t			aset;
+	task_affinity_tag_info_t	info;
+
+	/* Start from the answer for a task that has no namespace. */
+	*task_info_count = TASK_AFFINITY_TAG_INFO_COUNT;
+	info = (task_affinity_tag_info_t) task_info_out;
+	info->set_count = 0;
+	info->task_count = 0;
+	info->min = THREAD_AFFINITY_TAG_NULL;
+	info->max = THREAD_AFFINITY_TAG_NULL;
+
+	task_lock(task);
+	aspc = task->affinity_space;
+	if (aspc == NULL) {
+		task_unlock(task);
+		return KERN_SUCCESS;
+	}
+
+	mutex_lock(&aspc->aspc_lock);
+	info->task_count = aspc->aspc_task_count;
+	queue_iterate(&aspc->aspc_affinities,
+	    aset, affinity_set_t, aset_affinities) {
+		info->set_count++;
+		/* A NULL min/max means no set has been recorded yet. */
+		if (info->min == THREAD_AFFINITY_TAG_NULL ||
+		    aset->aset_tag < (uint32_t) info->min) {
+			info->min = aset->aset_tag;
+		}
+		if (info->max == THREAD_AFFINITY_TAG_NULL ||
+		    aset->aset_tag > (uint32_t) info->max) {
+			info->max = aset->aset_tag;
+		}
+	}
+	mutex_unlock(&aspc->aspc_lock);
+
+	task_unlock(task);
+	return KERN_SUCCESS;
+}
+
+/*
+ * thread_affinity_dup()
+ * Put the child thread into the parent's affinity set, if the parent has
+ * one; otherwise do nothing.
+ * Called from thread_dup() during fork() with child's mutex held.
+ * The parent's mutex is taken here for the duration of the operation.
+ * Note the affinity space is shared between the two tasks (asserted).
+ */
+void
+thread_affinity_dup(thread_t parent, thread_t child)
+{
+	affinity_space_t	aspc;
+	affinity_set_t		aset;
+
+	thread_mtx_lock(parent);
+	aset = parent->affinity_set;
+	DBG("thread_affinity_dup(%p,%p) aset %p\n", parent, child, aset);
+
+	if (aset != NULL) {
+		aspc = aset->aset_space;
+		assert(aspc == parent->task->affinity_space);
+		assert(aspc == child->task->affinity_space);
+
+		/* Membership changes are made under the namespace mutex. */
+		mutex_lock(&aspc->aspc_lock);
+		affinity_set_add(aset, child);
+		mutex_unlock(&aspc->aspc_lock);
+	}
+
+	thread_mtx_unlock(parent);
+}
+
+/*
+ * thread_affinity_terminate()
+ * Remove thread from any affinity set, freeing the set if the thread was
+ * its last member. A thread that belongs to no set is left untouched.
+ * Called with the thread mutex locked.
+ */
+void
+thread_affinity_terminate(thread_t thread)
+{
+	affinity_set_t		aset = thread->affinity_set;
+	affinity_space_t	aspc;
+
+	DBG("thread_affinity_terminate(%p)\n", thread);
+
+	/*
+	 * Like the other readers of thread->affinity_set in this file,
+	 * tolerate a thread that never joined a set rather than
+	 * dereferencing a NULL set pointer.
+	 */
+	if (aset == NULL)
+		return;
+
+	/*
+	 * Capture the namespace before removing the thread:
+	 * affinity_set_remove() clears aset_space when the set empties.
+	 */
+	aspc = aset->aset_space;
+	mutex_lock(&aspc->aspc_lock);
+	if (affinity_set_remove(aset, thread) != NULL) {
+		/* The set is now empty and unlinked from the namespace. */
+		affinity_set_free(aset);
+	}
+	mutex_unlock(&aspc->aspc_lock);
+}
+
+/*
+ * affinity_space_alloc()
+ * Allocate and initialize an empty affinity namespace: mutex initialized,
+ * no affinity sets, and a task count of 1.
+ * Returns NULL if kalloc() fails.
+ */
+static affinity_space_t
+affinity_space_alloc(void)
+{
+	affinity_space_t	aspc;
+
+	/* Size the allocation from the pointee so it tracks the type. */
+	aspc = (affinity_space_t) kalloc(sizeof(*aspc));
+	if (aspc == NULL)
+		return NULL;
+
+	mutex_init(&aspc->aspc_lock, 0);
+	queue_init(&aspc->aspc_affinities);
+	aspc->aspc_task_count = 1;
+
+	/* Trace under this function's actual name. */
+	DBG("affinity_space_alloc() returns %p\n", aspc);
+	return aspc;
+}
+
+/*
+ * affinity_space_free()
+ * Release the memory of an affinity namespace.
+ * The namespace must no longer contain any affinity sets (asserted).
+ */
+static void
+affinity_space_free(affinity_space_t aspc)
+{
+	/* Only a namespace with an empty set list may be released. */
+	assert(queue_empty(&aspc->aspc_affinities));
+
+	DBG("affinity_space_free(%p)\n", aspc);
+
+	kfree(aspc, sizeof(*aspc));
+}
+
+
+/*
+ * affinity_set_alloc()
+ * Allocate and initialize an empty affinity set: no threads, no tag, no
+ * processor set and no owning namespace. The set is not linked into any
+ * namespace here; affinity_set_place() does that.
+ * Returns NULL if kalloc() fails.
+ */
+static affinity_set_t
+affinity_set_alloc(void)
+{
+	affinity_set_t	aset;
+
+	aset = (affinity_set_t) kalloc(sizeof(struct affinity_set));
+	if (aset == NULL)
+		return NULL;
+
+	aset->aset_thread_count = 0;
+	queue_init(&aset->aset_affinities);
+	queue_init(&aset->aset_threads);
+	/*
+	 * Give the tag a defined value too, so a fresh set is in the same
+	 * state as one emptied by affinity_set_remove() instead of carrying
+	 * whatever the allocation happened to contain.
+	 */
+	aset->aset_tag = THREAD_AFFINITY_TAG_NULL;
+	aset->aset_num = 0;
+	aset->aset_pset = PROCESSOR_SET_NULL;
+	aset->aset_space = NULL;
+
+	/* Trace under this function's actual name. */
+	DBG("affinity_set_alloc() returns %p\n", aset);
+	return aset;
+}
+
+/*
+ * affinity_set_free()
+ * Release the memory of an affinity set.
+ * The set must have no member threads (asserted). Unlinking it from its
+ * namespace is not done here; affinity_set_remove() does that when the
+ * last thread leaves.
+ */
+static void
+affinity_set_free(affinity_set_t aset)
+{
+	/* A set may only be released once its thread queue is empty. */
+	assert(queue_empty(&aset->aset_threads));
+
+	DBG("affinity_set_free(%p)\n", aset);
+
+	kfree(aset, sizeof(struct affinity_set));
+}
+
+/*
+ * affinity_set_add()
+ * Make a thread a member of an affinity set: queue it on the set, bump the
+ * member count and publish the set in thread->affinity_set.
+ * The caller must have the thread mutex and space locked.
+ *
+ * NOTE(review): when affinity_sets_enabled is false the thread is still
+ * queued on the set but thread->affinity_set is written as NULL.
+ */
+static void
+affinity_set_add(affinity_set_t aset, thread_t thread)
+{
+	spl_t	s;
+
+	DBG("affinity_set_add(%p,%p)\n", aset, thread);
+
+	/* Link the thread onto the set's member queue and count it. */
+	queue_enter(&aset->aset_threads, thread, thread_t, affinity_threads);
+	aset->aset_thread_count++;
+
+	/* thread->affinity_set is written under the thread lock at splsched. */
+	s = splsched();
+	thread_lock(thread);
+	if (affinity_sets_enabled)
+		thread->affinity_set = aset;
+	else
+		thread->affinity_set = NULL;
+	thread_unlock(thread);
+	splx(s);
+}
+
+/*
+ * affinity_set_remove()
+ * Take a thread out of an affinity set.
+ * If the set still has members afterwards, NULL is returned. If the thread
+ * was the last member, the set is unlinked from its namespace, reset to the
+ * unplaced state and returned so the caller can recycle or free it.
+ * The caller must have the thread mutex and space locked.
+ */
+static affinity_set_t
+affinity_set_remove(affinity_set_t aset, thread_t thread)
+{
+	spl_t	s;
+
+	/* Clear the thread's affinity_set field under the thread lock. */
+	s = splsched();
+	thread_lock(thread);
+	thread->affinity_set = NULL;
+	thread_unlock(thread);
+	splx(s);
+
+	aset->aset_thread_count--;
+	queue_remove(&aset->aset_threads, thread, thread_t, affinity_threads);
+
+	if (!queue_empty(&aset->aset_threads)) {
+		DBG("affinity_set_remove(%p,%p)\n", aset, thread);
+		return NULL;
+	}
+
+	/* Last member gone: detach the set from the namespace and reset it. */
+	queue_remove(&aset->aset_space->aspc_affinities,
+	    aset, affinity_set_t, aset_affinities);
+	assert(aset->aset_thread_count == 0);
+	aset->aset_tag = THREAD_AFFINITY_TAG_NULL;
+	aset->aset_num = 0;
+	aset->aset_pset = PROCESSOR_SET_NULL;
+	aset->aset_space = NULL;
+	DBG("affinity_set_remove(%p,%p) set now empty\n", aset, thread);
+	return aset;
+}
+
+/*
+ * affinity_set_find()
+ * Search the namespace's list of affinity sets for one carrying the given
+ * tag. Returns the first match, or NULL if no set has that tag.
+ * The caller must have the space locked.
+ */
+static affinity_set_t
+affinity_set_find(affinity_space_t space, uint32_t tag)
+{
+	affinity_set_t	cur;
+	affinity_set_t	found = NULL;
+
+	queue_iterate(&space->aspc_affinities,
+	    cur, affinity_set_t, aset_affinities) {
+		if (cur->aset_tag == tag) {
+			found = cur;
+			break;
+		}
+	}
+
+	if (found != NULL) {
+		DBG("affinity_set_find(%p,%u) finds %p\n",
+		    space, tag, found);
+	} else {
+		DBG("affinity_set_find(%p,%u) not found\n", space, tag);
+	}
+	return found;
+}
+
+/*
+ * affinity_set_place()
+ * Assign an affinity set to a suitable processor set and link it into the
+ * namespace.
+ * The selection criterion is the CPU affinity currently occupied by the
+ * fewest affinity sets of this namespace, with the first unoccupied one
+ * found taking precedence.
+ * On return new_aset has aset_num, aset_pset and aset_space filled in and
+ * is queued on aspc->aspc_affinities.
+ * The caller must have the space locked, and ml_get_max_affinity_sets()
+ * must be non-zero (see thread_affinity_is_supported()).
+ */
+static void
+affinity_set_place(affinity_space_t aspc, affinity_set_t new_aset)
+{
+	unsigned int	num_cpu_asets = ml_get_max_affinity_sets();
+	unsigned int	set_occupancy[num_cpu_asets];
+	unsigned int	i;
+	unsigned int	i_least_occupied;
+	affinity_set_t	aset;
+
+	/* The modulo arithmetic below needs at least one CPU affinity. */
+	assert(num_cpu_asets != 0);
+
+	for (i = 0; i < num_cpu_asets; i++)
+		set_occupancy[i] = 0;
+
+	/*
+	 * Scan the affinity sets, counting how many of them occupy
+	 * each of the available physical affinities.
+	 */
+	queue_iterate(&aspc->aspc_affinities,
+	    aset, affinity_set_t, aset_affinities) {
+		set_occupancy[aset->aset_num]++;
+	}
+
+	/*
+	 * Find the least occupied set (or the first empty set).
+	 * To distribute placements somewhat, start searching from
+	 * a cpu affinity derived from the namespace address:
+	 *	[(uintptr_t)aspc % 127] % num_cpu_asets
+	 * unless this mapping policy is overridden.
+	 * The pointer is converted through uintptr_t, not unsigned int,
+	 * so the cast stays well-formed where pointers are wider than int.
+	 */
+	if (affinity_sets_mapping == 0)
+		i_least_occupied = 0;
+	else
+		i_least_occupied =
+		    (unsigned int)(((uintptr_t)aspc % 127) % num_cpu_asets);
+	for (i = 0; i < num_cpu_asets; i++) {
+		unsigned int	j = (i_least_occupied + i) % num_cpu_asets;
+
+		if (set_occupancy[j] == 0) {
+			i_least_occupied = j;
+			break;
+		}
+		if (set_occupancy[j] < set_occupancy[i_least_occupied])
+			i_least_occupied = j;
+	}
+	new_aset->aset_num = i_least_occupied;
+	new_aset->aset_pset = ml_affinity_to_pset(i_least_occupied);
+
+	/* Add the new affinity set to the group */
+	new_aset->aset_space = aspc;
+	queue_enter(&aspc->aspc_affinities,
+	    new_aset, affinity_set_t, aset_affinities);
+
+	DBG("affinity_set_place(%p,%p) selected affinity %u pset %p\n",
+	    aspc, new_aset, new_aset->aset_num, new_aset->aset_pset);
+}